[llvm] a8d9d50 - [AMDGPU] gfx90a support

Stanislav Mekhanoshin via llvm-commits llvm-commits at lists.llvm.org
Wed Feb 17 16:02:38 PST 2021


Author: Stanislav Mekhanoshin
Date: 2021-02-17T16:01:32-08:00
New Revision: a8d9d50762c42d726274d3f1126ec97ff96e2a22

URL: https://github.com/llvm/llvm-project/commit/a8d9d50762c42d726274d3f1126ec97ff96e2a22
DIFF: https://github.com/llvm/llvm-project/commit/a8d9d50762c42d726274d3f1126ec97ff96e2a22.diff

LOG: [AMDGPU] gfx90a support

Differential Revision: https://reviews.llvm.org/D96906

Added: 
    clang/test/SemaOpenCL/builtins-amdgcn-error-gfx90a-param.cl
    llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.fadd-with-ret.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.mfma.gfx90a.mir
    llvm/test/CodeGen/AMDGPU/acc-ldst.ll
    llvm/test/CodeGen/AMDGPU/adjust-writemask-vectorized.ll
    llvm/test/CodeGen/AMDGPU/agpr-csr.ll
    llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs-packed.ll
    llvm/test/CodeGen/AMDGPU/coalesce-vgpr-alignment.ll
    llvm/test/CodeGen/AMDGPU/copy_phys_vgpr64.mir
    llvm/test/CodeGen/AMDGPU/dpp64_combine.ll
    llvm/test/CodeGen/AMDGPU/dpp64_combine.mir
    llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll
    llvm/test/CodeGen/AMDGPU/gfx90a-enc.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.fadd.gfx90a.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.gfx90a.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.gfx90a.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.bf16.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.i8.ll
    llvm/test/CodeGen/AMDGPU/llvm.pow-gfx9.ll
    llvm/test/CodeGen/AMDGPU/mai-hazards-gfx90a.mir
    llvm/test/CodeGen/AMDGPU/merge-load-store-agpr.mir
    llvm/test/CodeGen/AMDGPU/packed-fp32.ll
    llvm/test/CodeGen/AMDGPU/reserved-reg-in-clause.mir
    llvm/test/CodeGen/AMDGPU/reserved-vgpr-tuples.mir
    llvm/test/CodeGen/AMDGPU/tgsplit.ll
    llvm/test/CodeGen/AMDGPU/twoaddr-fma-f64.mir
    llvm/test/CodeGen/AMDGPU/v_mov_b64_expansion.mir
    llvm/test/CodeGen/AMDGPU/waitcnt-agpr.mir
    llvm/test/CodeGen/MIR/AMDGPU/load-store-opt-scc.mir
    llvm/test/MC/AMDGPU/dpp64.s
    llvm/test/MC/AMDGPU/gfx90a_asm_features.s
    llvm/test/MC/AMDGPU/gfx90a_err.s
    llvm/test/MC/AMDGPU/gfx90a_ldst_acc.s
    llvm/test/MC/AMDGPU/hsa-gfx90a-v3.s
    llvm/test/MC/AMDGPU/mai-gfx90a.s
    llvm/test/MC/AMDGPU/mimg-gfx90a.s
    llvm/test/MC/AMDGPU/misaligned-vgpr-tuples-err.s
    llvm/test/MC/Disassembler/AMDGPU/dpp64.txt
    llvm/test/MC/Disassembler/AMDGPU/gfx90a_dasm_features.txt
    llvm/test/MC/Disassembler/AMDGPU/gfx90a_ldst_acc.txt
    llvm/test/MC/Disassembler/AMDGPU/mai-gfx90a.txt
    llvm/test/MC/Disassembler/AMDGPU/mimg_gfx90a.txt
    llvm/test/Transforms/SLPVectorizer/AMDGPU/slp-v2f32.ll

Modified: 
    clang/docs/ClangCommandLineReference.rst
    clang/include/clang/Basic/BuiltinsAMDGPU.def
    clang/include/clang/Basic/Cuda.h
    clang/include/clang/Driver/Options.td
    clang/lib/Basic/Cuda.cpp
    clang/lib/Basic/Targets/AMDGPU.cpp
    clang/lib/Basic/Targets/NVPTX.cpp
    clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
    clang/test/CodeGenOpenCL/amdgpu-features.cl
    clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl
    clang/test/Driver/amdgpu-features.c
    clang/test/Driver/amdgpu-macros.cl
    clang/test/Driver/amdgpu-mcpu.cl
    clang/test/Driver/cuda-bad-arch.cu
    clang/test/Driver/hip-toolchain-features.hip
    clang/test/Misc/target-invalid-cpu-note.c
    llvm/docs/AMDGPUUsage.rst
    llvm/include/llvm/BinaryFormat/ELF.h
    llvm/include/llvm/IR/IntrinsicsAMDGPU.td
    llvm/include/llvm/Support/AMDHSAKernelDescriptor.h
    llvm/include/llvm/Support/TargetParser.h
    llvm/lib/Object/ELFObjectFile.cpp
    llvm/lib/ObjectYAML/ELFYAML.cpp
    llvm/lib/Support/TargetParser.cpp
    llvm/lib/Target/AMDGPU/AMDGPU.td
    llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h
    llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
    llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
    llvm/lib/Target/AMDGPU/AMDGPUGISel.td
    llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
    llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
    llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
    llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
    llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
    llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
    llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
    llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
    llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
    llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
    llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
    llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
    llvm/lib/Target/AMDGPU/BUFInstructions.td
    llvm/lib/Target/AMDGPU/DSInstructions.td
    llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
    llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
    llvm/lib/Target/AMDGPU/FLATInstructions.td
    llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp
    llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
    llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
    llvm/lib/Target/AMDGPU/GCNProcessors.td
    llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
    llvm/lib/Target/AMDGPU/GCNRegPressure.h
    llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
    llvm/lib/Target/AMDGPU/GCNSubtarget.h
    llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
    llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h
    llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
    llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
    llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp
    llvm/lib/Target/AMDGPU/MIMGInstructions.td
    llvm/lib/Target/AMDGPU/SIAddIMGInit.cpp
    llvm/lib/Target/AMDGPU/SIDefines.h
    llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
    llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp
    llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
    llvm/lib/Target/AMDGPU/SIISelLowering.cpp
    llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
    llvm/lib/Target/AMDGPU/SIInstrFormats.td
    llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
    llvm/lib/Target/AMDGPU/SIInstrInfo.h
    llvm/lib/Target/AMDGPU/SIInstrInfo.td
    llvm/lib/Target/AMDGPU/SIInstructions.td
    llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
    llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
    llvm/lib/Target/AMDGPU/SIProgramInfo.h
    llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
    llvm/lib/Target/AMDGPU/SIRegisterInfo.h
    llvm/lib/Target/AMDGPU/SIRegisterInfo.td
    llvm/lib/Target/AMDGPU/SISchedule.td
    llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
    llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
    llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
    llvm/lib/Target/AMDGPU/VOP1Instructions.td
    llvm/lib/Target/AMDGPU/VOP2Instructions.td
    llvm/lib/Target/AMDGPU/VOP3Instructions.td
    llvm/lib/Target/AMDGPU/VOP3PInstructions.td
    llvm/lib/Target/AMDGPU/VOPInstructions.td
    llvm/test/Analysis/CostModel/AMDGPU/fadd.ll
    llvm/test/Analysis/CostModel/AMDGPU/fma.ll
    llvm/test/Analysis/CostModel/AMDGPU/fmul.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgpu-atomic-cmpxchg-flat.mir
    llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgpu-atomic-cmpxchg-global.mir
    llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-atomicrmw-add-flat.mir
    llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-atomicrmw-add-global.mir
    llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-copy.mir
    llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fmaxnum-ieee.mir
    llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fmaxnum.mir
    llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fminnum-ieee.mir
    llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fminnum.mir
    llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fmul.mir
    llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fptoui.mir
    llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fract.f64.mir
    llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-implicit-def.mir
    llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-atomic-flat.mir
    llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-atomic-global.mir
    llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-flat.mir
    llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-global-saddr.mir
    llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-global.mir
    llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-global.s96.mir
    llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-private.mir
    llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-sitofp.mir
    llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-atomic-flat.mir
    llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-flat.mir
    llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-global.mir
    llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-global.s96.mir
    llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-private.mir
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.fadd-with-ret.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.fadd.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.fadd.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.format.f16.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.format.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f16.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f32.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.load.f16.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.load.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.f16.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.i8.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.s.buffer.load.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.fadd-with-ret.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.fadd.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.format.f16.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.format.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.format.f16.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.format.f32.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.tbuffer.load.f16.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.tbuffer.load.ll
    llvm/test/CodeGen/AMDGPU/SRSRC-GIT-clobber-check.mir
    llvm/test/CodeGen/AMDGPU/accvgpr-copy.mir
    llvm/test/CodeGen/AMDGPU/agpr-register-count.ll
    llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size-vgpr-limit.ll
    llvm/test/CodeGen/AMDGPU/branch-relaxation-debug-info.mir
    llvm/test/CodeGen/AMDGPU/break-smem-soft-clauses.mir
    llvm/test/CodeGen/AMDGPU/break-vmem-soft-clauses.mir
    llvm/test/CodeGen/AMDGPU/buffer-intrinsics-mmo-offsets.ll
    llvm/test/CodeGen/AMDGPU/bundle-latency.mir
    llvm/test/CodeGen/AMDGPU/clamp-omod-special-case.mir
    llvm/test/CodeGen/AMDGPU/cluster-flat-loads-postra.mir
    llvm/test/CodeGen/AMDGPU/cluster-flat-loads.mir
    llvm/test/CodeGen/AMDGPU/coalescer-extend-pruned-subrange.mir
    llvm/test/CodeGen/AMDGPU/coalescer-subranges-another-copymi-not-live.mir
    llvm/test/CodeGen/AMDGPU/coalescer-subranges-another-prune-error.mir
    llvm/test/CodeGen/AMDGPU/coalescer-subreg-join.mir
    llvm/test/CodeGen/AMDGPU/coalescer-subregjoin-fullcopy.mir
    llvm/test/CodeGen/AMDGPU/coalescer-with-subregs-bad-identical.mir
    llvm/test/CodeGen/AMDGPU/collapse-endcf2.mir
    llvm/test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir
    llvm/test/CodeGen/AMDGPU/couldnt-join-subrange-3.mir
    llvm/test/CodeGen/AMDGPU/dbg-value-ends-sched-region.mir
    llvm/test/CodeGen/AMDGPU/dead-lane.mir
    llvm/test/CodeGen/AMDGPU/dead_copy.mir
    llvm/test/CodeGen/AMDGPU/debug-value-scheduler-crash.mir
    llvm/test/CodeGen/AMDGPU/elf-header-flags-mach.ll
    llvm/test/CodeGen/AMDGPU/endpgm-dce.mir
    llvm/test/CodeGen/AMDGPU/expand-si-indirect.mir
    llvm/test/CodeGen/AMDGPU/extract_subvector_vec4_vec3.ll
    llvm/test/CodeGen/AMDGPU/fastregalloc-self-loop-heuristic.mir
    llvm/test/CodeGen/AMDGPU/flat-load-clustering.mir
    llvm/test/CodeGen/AMDGPU/flat-scratch-fold-fi.mir
    llvm/test/CodeGen/AMDGPU/fma.f64.ll
    llvm/test/CodeGen/AMDGPU/fold-fi-mubuf.mir
    llvm/test/CodeGen/AMDGPU/fold-imm-copy.mir
    llvm/test/CodeGen/AMDGPU/fold-imm-f16-f32.mir
    llvm/test/CodeGen/AMDGPU/fold-immediate-output-mods.mir
    llvm/test/CodeGen/AMDGPU/fold-multiple.mir
    llvm/test/CodeGen/AMDGPU/fp-atomic-to-s_denormmode.mir
    llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll
    llvm/test/CodeGen/AMDGPU/hard-clauses.mir
    llvm/test/CodeGen/AMDGPU/hazard-buffer-store-v-interp.mir
    llvm/test/CodeGen/AMDGPU/hazard-hidden-bundle.mir
    llvm/test/CodeGen/AMDGPU/hazard-in-bundle.mir
    llvm/test/CodeGen/AMDGPU/hazard-inlineasm.mir
    llvm/test/CodeGen/AMDGPU/hazard-pass-ordering.mir
    llvm/test/CodeGen/AMDGPU/hazard-recognizer-meta-insts.mir
    llvm/test/CodeGen/AMDGPU/indirect-addressing-term.ll
    llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll
    llvm/test/CodeGen/AMDGPU/insert-skips-flat-vmem.mir
    llvm/test/CodeGen/AMDGPU/insert-waitcnts-exp.mir
    llvm/test/CodeGen/AMDGPU/inserted-wait-states.mir
    llvm/test/CodeGen/AMDGPU/invert-br-undef-vcc.mir
    llvm/test/CodeGen/AMDGPU/lds-branch-vmem-hazard.mir
    llvm/test/CodeGen/AMDGPU/limit-coalesce.mir
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.fadd.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fmul.legacy.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.dim.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.workitem.id.ll
    llvm/test/CodeGen/AMDGPU/lower-control-flow-other-terminators.mir
    llvm/test/CodeGen/AMDGPU/mai-hazards.mir
    llvm/test/CodeGen/AMDGPU/memory-legalizer-atomic-insert-end.mir
    llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll
    llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll
    llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll
    llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll
    llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll
    llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll
    llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll
    llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll
    llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll
    llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll
    llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll
    llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll
    llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll
    llvm/test/CodeGen/AMDGPU/memory-legalizer-invalid-addrspace.mir
    llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll
    llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll
    llvm/test/CodeGen/AMDGPU/memory-legalizer-local-singlethread.ll
    llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll
    llvm/test/CodeGen/AMDGPU/memory-legalizer-local-wavefront.ll
    llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll
    llvm/test/CodeGen/AMDGPU/memory-legalizer-local.mir
    llvm/test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-atomics.mir
    llvm/test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-nontemporal-1.mir
    llvm/test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-nontemporal-2.mir
    llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll
    llvm/test/CodeGen/AMDGPU/memory-legalizer-region.mir
    llvm/test/CodeGen/AMDGPU/memory_clause.mir
    llvm/test/CodeGen/AMDGPU/merge-image-load-gfx10.mir
    llvm/test/CodeGen/AMDGPU/merge-image-load.mir
    llvm/test/CodeGen/AMDGPU/merge-image-sample-gfx10.mir
    llvm/test/CodeGen/AMDGPU/merge-image-sample.mir
    llvm/test/CodeGen/AMDGPU/merge-load-store.mir
    llvm/test/CodeGen/AMDGPU/merge-tbuffer.mir
    llvm/test/CodeGen/AMDGPU/mfma-loop.ll
    llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.mir
    llvm/test/CodeGen/AMDGPU/nsa-vmem-hazard.mir
    llvm/test/CodeGen/AMDGPU/opt-sgpr-to-vgpr-copy.mir
    llvm/test/CodeGen/AMDGPU/optimize-if-exec-masking.mir
    llvm/test/CodeGen/AMDGPU/pei-build-spill-partial-agpr.mir
    llvm/test/CodeGen/AMDGPU/pei-build-spill.mir
    llvm/test/CodeGen/AMDGPU/pei-reg-scavenger-position.mir
    llvm/test/CodeGen/AMDGPU/pei-scavenge-vgpr-spill.mir
    llvm/test/CodeGen/AMDGPU/phi-elimination-end-cf.mir
    llvm/test/CodeGen/AMDGPU/postra-bundle-memops.mir
    llvm/test/CodeGen/AMDGPU/power-sched-no-instr-sunit.mir
    llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm-gfx10.mir
    llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.mir
    llvm/test/CodeGen/AMDGPU/regbank-reassign.mir
    llvm/test/CodeGen/AMDGPU/regcoal-subrange-join-seg.mir
    llvm/test/CodeGen/AMDGPU/regcoalesce-dbg.mir
    llvm/test/CodeGen/AMDGPU/rename-independent-subregs-mac-operands.mir
    llvm/test/CodeGen/AMDGPU/s_code_end.ll
    llvm/test/CodeGen/AMDGPU/sched-assert-dead-def-subreg-use-other-subreg.mir
    llvm/test/CodeGen/AMDGPU/sched-assert-onlydbg-value-empty-region.mir
    llvm/test/CodeGen/AMDGPU/sched-crash-dbg-value.mir
    llvm/test/CodeGen/AMDGPU/sched-handleMoveUp-subreg-def-across-subreg-def.mir
    llvm/test/CodeGen/AMDGPU/schedule-barrier-fpmode.mir
    llvm/test/CodeGen/AMDGPU/schedule-barrier.mir
    llvm/test/CodeGen/AMDGPU/sdwa-gfx9.mir
    llvm/test/CodeGen/AMDGPU/sdwa-ops.mir
    llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr-gfx10.mir
    llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr.mir
    llvm/test/CodeGen/AMDGPU/sdwa-preserve.mir
    llvm/test/CodeGen/AMDGPU/sdwa-scalar-ops.mir
    llvm/test/CodeGen/AMDGPU/sdwa-vop2-64bit.mir
    llvm/test/CodeGen/AMDGPU/sgpr-spill-wrong-stack-id.mir
    llvm/test/CodeGen/AMDGPU/shrink-carry.mir
    llvm/test/CodeGen/AMDGPU/shrink-vop3-carry-out.mir
    llvm/test/CodeGen/AMDGPU/skip-branch-taildup-ret.mir
    llvm/test/CodeGen/AMDGPU/spill-agpr-partially-undef.mir
    llvm/test/CodeGen/AMDGPU/spill-agpr.ll
    llvm/test/CodeGen/AMDGPU/spill-agpr.mir
    llvm/test/CodeGen/AMDGPU/spill-reg-tuple-super-reg-use.mir
    llvm/test/CodeGen/AMDGPU/spill-special-sgpr.mir
    llvm/test/CodeGen/AMDGPU/splitkit-copy-live-lanes.mir
    llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll
    llvm/test/CodeGen/AMDGPU/stack-slot-color-sgpr-vgpr-spills.mir
    llvm/test/CodeGen/AMDGPU/subreg-split-live-in-error.mir
    llvm/test/CodeGen/AMDGPU/subvector-test.mir
    llvm/test/CodeGen/AMDGPU/syncscopes.ll
    llvm/test/CodeGen/AMDGPU/transform-block-with-return-to-epilog.ll
    llvm/test/CodeGen/AMDGPU/undefined-physreg-sgpr-spill.mir
    llvm/test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir
    llvm/test/CodeGen/AMDGPU/vgpr-spill.mir
    llvm/test/CodeGen/AMDGPU/virtregrewrite-undef-identity-copy.mir
    llvm/test/CodeGen/AMDGPU/vmem-to-salu-hazard.mir
    llvm/test/CodeGen/AMDGPU/vmem-vcc-hazard.mir
    llvm/test/CodeGen/AMDGPU/waitcnt-back-edge-loop.mir
    llvm/test/CodeGen/AMDGPU/waitcnt-loop-irreducible.mir
    llvm/test/CodeGen/AMDGPU/waitcnt-loop-single-basic-block.mir
    llvm/test/CodeGen/AMDGPU/waitcnt-meta-instructions.mir
    llvm/test/CodeGen/AMDGPU/waitcnt-no-redundant.mir
    llvm/test/CodeGen/AMDGPU/waitcnt-overflow.mir
    llvm/test/CodeGen/AMDGPU/waitcnt-preexisting.mir
    llvm/test/CodeGen/AMDGPU/waitcnt-vmem-waw.mir
    llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.mir
    llvm/test/CodeGen/AMDGPU/waitcnt.mir
    llvm/test/CodeGen/AMDGPU/wqm.mir
    llvm/test/CodeGen/MIR/AMDGPU/custom-pseudo-source-values.ll
    llvm/test/CodeGen/MIR/AMDGPU/load-store-opt-dlc.mir
    llvm/test/CodeGen/MIR/AMDGPU/mir-canon-multi.mir
    llvm/test/CodeGen/MIR/AMDGPU/parse-order-reserved-regs.mir
    llvm/test/CodeGen/MIR/AMDGPU/syncscopes.mir
    llvm/test/CodeGen/MIR/AMDGPU/target-index-operands.mir
    llvm/test/MC/AMDGPU/atomic-fadd-insts.s
    llvm/test/MC/AMDGPU/hsa-diag-v3.s
    llvm/test/MC/AMDGPU/vop_dpp.s
    llvm/test/Object/AMDGPU/elf-header-flags-mach.yaml
    llvm/test/tools/llvm-readobj/ELF/amdgpu-elf-headers.test
    llvm/tools/llvm-readobj/ELFDumper.cpp

Removed: 
    


################################################################################
diff  --git a/clang/docs/ClangCommandLineReference.rst b/clang/docs/ClangCommandLineReference.rst
index 0ec99315a130..f7ac97484d91 100644
--- a/clang/docs/ClangCommandLineReference.rst
+++ b/clang/docs/ClangCommandLineReference.rst
@@ -2982,6 +2982,10 @@ Specify CU (-mcumode) or WGP (-mno-cumode) wavefront execution mode (AMDGPU only
 
 Specify SRAM ECC mode (AMDGPU only)
 
+.. option:: -mtgsplit, -mno-tgsplit
+
+Enable threadgroup split execution mode (AMDGPU only)
+
 .. option:: -mxnack, -mno-xnack
 
 Specify XNACK mode (AMDGPU only)

diff  --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 123a7ad212da..3544abe35896 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -258,5 +258,13 @@ TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_4x4x2bf16, "V4fV2sV2sV4fIiIiIi", "nc",
 TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_32x32x4bf16, "V16fV2sV2sV16fIiIiIi", "nc", "mai-insts")
 TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_16x16x8bf16, "V4fV2sV2sV4fIiIiIi", "nc", "mai-insts")
 
+TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_32x32x4bf16_1k, "V32fV4sV4sV32fIiIiIi", "nc", "mai-insts")
+TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_16x16x4bf16_1k, "V16fV4sV4sV16fIiIiIi", "nc", "mai-insts")
+TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_4x4x4bf16_1k, "V4fV4sV4sV4fIiIiIi", "nc", "mai-insts")
+TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_32x32x8bf16_1k, "V16fV4sV4sV16fIiIiIi", "nc", "mai-insts")
+TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_16x16x16bf16_1k, "V4fV4sV4sV4fIiIiIi", "nc", "mai-insts")
+TARGET_BUILTIN(__builtin_amdgcn_mfma_f64_16x16x4f64, "V4dddV4dIiIiIi", "nc", "mai-insts")
+TARGET_BUILTIN(__builtin_amdgcn_mfma_f64_4x4x4f64, "ddddIiIiIi", "nc", "mai-insts")
+
 #undef BUILTIN
 #undef TARGET_BUILTIN

diff  --git a/clang/include/clang/Basic/Cuda.h b/clang/include/clang/Basic/Cuda.h
index 12ffa3e04fb8..f1d69d6dce50 100644
--- a/clang/include/clang/Basic/Cuda.h
+++ b/clang/include/clang/Basic/Cuda.h
@@ -78,6 +78,7 @@ enum class CudaArch {
   GFX906,
   GFX908,
   GFX909,
+  GFX90a,
   GFX90c,
   GFX1010,
   GFX1011,

diff  --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index 13d3dcd868ab..7312c1988efe 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -3094,6 +3094,11 @@ def mcumode : Flag<["-"], "mcumode">, Group<m_amdgpu_Features_Group>,
   HelpText<"Specify CU (-mcumode) or WGP (-mno-cumode) wavefront execution mode (AMDGPU only)">;
 def mno_cumode : Flag<["-"], "mno-cumode">, Group<m_amdgpu_Features_Group>;
 
+def mtgsplit : Flag<["-"], "mtgsplit">, Group<m_amdgpu_Features_Group>,
+  HelpText<"Enable threadgroup split execution mode (AMDGPU only)">;
+def mno_tgsplit : Flag<["-"], "mno-tgsplit">, Group<m_amdgpu_Features_Group>,
+  HelpText<"Disable threadgroup split execution mode (AMDGPU only)">;
+
 def mwavefrontsize64 : Flag<["-"], "mwavefrontsize64">, Group<m_Group>,
   HelpText<"Specify wavefront size 64 mode (AMDGPU only)">;
 def mno_wavefrontsize64 : Flag<["-"], "mno-wavefrontsize64">, Group<m_Group>,

diff  --git a/clang/lib/Basic/Cuda.cpp b/clang/lib/Basic/Cuda.cpp
index 22eea1fb9cf6..29c09915f631 100644
--- a/clang/lib/Basic/Cuda.cpp
+++ b/clang/lib/Basic/Cuda.cpp
@@ -98,6 +98,7 @@ CudaArchToStringMap arch_names[] = {
     GFX(906),  // gfx906
     GFX(908),  // gfx908
     GFX(909),  // gfx909
+    GFX(90a),  // gfx90a
     GFX(90c),  // gfx90c
     GFX(1010), // gfx1010
     GFX(1011), // gfx1011

diff  --git a/clang/lib/Basic/Targets/AMDGPU.cpp b/clang/lib/Basic/Targets/AMDGPU.cpp
index e6450c9f9ff8..0f1211d6c409 100644
--- a/clang/lib/Basic/Targets/AMDGPU.cpp
+++ b/clang/lib/Basic/Targets/AMDGPU.cpp
@@ -212,6 +212,9 @@ bool AMDGPUTargetInfo::initFeatureMap(
       Features["s-memrealtime"] = true;
       Features["s-memtime-inst"] = true;
       break;
+    case GK_GFX90A:
+      Features["gfx90a-insts"] = true;
+      LLVM_FALLTHROUGH;
     case GK_GFX908:
       Features["dot3-insts"] = true;
       Features["dot4-insts"] = true;

diff  --git a/clang/lib/Basic/Targets/NVPTX.cpp b/clang/lib/Basic/Targets/NVPTX.cpp
index da8a578fa557..25e065ce584e 100644
--- a/clang/lib/Basic/Targets/NVPTX.cpp
+++ b/clang/lib/Basic/Targets/NVPTX.cpp
@@ -202,6 +202,7 @@ void NVPTXTargetInfo::getTargetDefines(const LangOptions &Opts,
       case CudaArch::GFX906:
       case CudaArch::GFX908:
       case CudaArch::GFX909:
+      case CudaArch::GFX90a:
       case CudaArch::GFX90c:
       case CudaArch::GFX1010:
       case CudaArch::GFX1011:

diff  --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
index 75be9372716f..8e22f49aa119 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
@@ -4634,6 +4634,7 @@ void CGOpenMPRuntimeGPU::processRequiresDirective(
       case CudaArch::GFX906:
       case CudaArch::GFX908:
       case CudaArch::GFX909:
+      case CudaArch::GFX90a:
       case CudaArch::GFX90c:
       case CudaArch::GFX1010:
       case CudaArch::GFX1011:
@@ -4703,6 +4704,7 @@ static std::pair<unsigned, unsigned> getSMsBlocksPerSM(CodeGenModule &CGM) {
   case CudaArch::GFX906:
   case CudaArch::GFX908:
   case CudaArch::GFX909:
+  case CudaArch::GFX90a:
   case CudaArch::GFX90c:
   case CudaArch::GFX1010:
   case CudaArch::GFX1011:

diff  --git a/clang/test/CodeGenOpenCL/amdgpu-features.cl b/clang/test/CodeGenOpenCL/amdgpu-features.cl
index a796d07cb2e5..f387c93bd6e0 100644
--- a/clang/test/CodeGenOpenCL/amdgpu-features.cl
+++ b/clang/test/CodeGenOpenCL/amdgpu-features.cl
@@ -23,6 +23,7 @@
 // RUN: %clang_cc1 -triple amdgcn -target-cpu gfx906 -S -emit-llvm -o - %s | FileCheck --check-prefix=GFX906 %s
 // RUN: %clang_cc1 -triple amdgcn -target-cpu gfx908 -S -emit-llvm -o - %s | FileCheck --check-prefix=GFX908 %s
 // RUN: %clang_cc1 -triple amdgcn -target-cpu gfx909 -S -emit-llvm -o - %s | FileCheck --check-prefix=GFX909 %s
+// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx90a -S -emit-llvm -o - %s | FileCheck --check-prefix=GFX90A %s
 // RUN: %clang_cc1 -triple amdgcn -target-cpu gfx90c -S -emit-llvm -o - %s | FileCheck --check-prefix=GFX90C %s
 // RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1010 -S -emit-llvm -o - %s | FileCheck --check-prefix=GFX1010 %s
 // RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1011 -S -emit-llvm -o - %s | FileCheck --check-prefix=GFX1011 %s
@@ -52,6 +53,7 @@
 // GFX906: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dpp,+flat-address-space,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst"
 // GFX908: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dpp,+flat-address-space,+gfx8-insts,+gfx9-insts,+mai-insts,+s-memrealtime,+s-memtime-inst"
 // GFX909: "target-features"="+16-bit-insts,+ci-insts,+dpp,+flat-address-space,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst"
+// GFX90A: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dpp,+flat-address-space,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+mai-insts,+s-memrealtime,+s-memtime-inst"
 // GFX90C: "target-features"="+16-bit-insts,+ci-insts,+dpp,+flat-address-space,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst"
 // GFX1010: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dpp,+flat-address-space,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst"
 // GFX1011: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dpp,+flat-address-space,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst"

diff  --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl
index c7db942b871a..19ac40fe4160 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl
@@ -1,5 +1,6 @@
 // REQUIRES: amdgpu-registered-target
-// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx908 -S -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx908 -DMFMA_GFX908_TESTS -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX908
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx90a -DMFMA_GFX90A_TESTS -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX90A
 
 #pragma OPENCL EXTENSION cl_khr_fp64:enable
 
@@ -19,143 +20,199 @@ typedef short  v32s  __attribute__((ext_vector_type(32)));
 typedef double v4d   __attribute__((ext_vector_type(4)));
 
 
-// CHECK-LABEL: @test_mfma_f32_32x32x1f32
-// CHECK: call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float %a, float %b, <32 x float> %c, i32 0, i32 0, i32 0)
+#ifdef MFMA_GFX908_TESTS
+
+// CHECK-GFX908-LABEL: @test_mfma_f32_32x32x1f32
+// CHECK-GFX908: call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float %a, float %b, <32 x float> %c, i32 0, i32 0, i32 0)
 void test_mfma_f32_32x32x1f32(global v32f* out, float a, float b, v32f c)
 {
   *out = __builtin_amdgcn_mfma_f32_32x32x1f32(a, b, c, 0, 0, 0);
 }
 
-// CHECK-LABEL: @test_mfma_f32_16x16x1f32
-// CHECK: call <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float %a, float %b, <16 x float> %c, i32 0, i32 0, i32 0)
+// CHECK-GFX908-LABEL: @test_mfma_f32_16x16x1f32
+// CHECK-GFX908: call <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float %a, float %b, <16 x float> %c, i32 0, i32 0, i32 0)
 void test_mfma_f32_16x16x1f32(global v16f* out, float a, float b, v16f c)
 {
   *out = __builtin_amdgcn_mfma_f32_16x16x1f32(a, b, c, 0, 0, 0);
 }
 
-// CHECK-LABEL: @test_mfma_f32_4x4x1f32
-// CHECK: call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float %a, float %b, <4 x float> %c, i32 0, i32 0, i32 0)
+// CHECK-GFX908-LABEL: @test_mfma_f32_4x4x1f32
+// CHECK-GFX908: call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float %a, float %b, <4 x float> %c, i32 0, i32 0, i32 0)
 void test_mfma_f32_4x4x1f32(global v4f* out, float a, float b, v4f c)
 {
   *out = __builtin_amdgcn_mfma_f32_4x4x1f32(a, b, c, 0, 0, 0);
 }
 
-// CHECK-LABEL: @test_mfma_f32_32x32x2f32
-// CHECK: call <16 x float> @llvm.amdgcn.mfma.f32.32x32x2f32(float %a, float %b, <16 x float> %c, i32 0, i32 0, i32 0)
+// CHECK-GFX908-LABEL: @test_mfma_f32_32x32x2f32
+// CHECK-GFX908: call <16 x float> @llvm.amdgcn.mfma.f32.32x32x2f32(float %a, float %b, <16 x float> %c, i32 0, i32 0, i32 0)
 void test_mfma_f32_32x32x2f32(global v16f* out, float a, float b, v16f c)
 {
   *out = __builtin_amdgcn_mfma_f32_32x32x2f32(a, b, c, 0, 0, 0);
 }
 
-// CHECK-LABEL: @test_mfma_f32_16x16x4f32
-// CHECK: call <4 x float> @llvm.amdgcn.mfma.f32.16x16x4f32(float %a, float %b, <4 x float> %c, i32 0, i32 0, i32 0)
+// CHECK-GFX908-LABEL: @test_mfma_f32_16x16x4f32
+// CHECK-GFX908: call <4 x float> @llvm.amdgcn.mfma.f32.16x16x4f32(float %a, float %b, <4 x float> %c, i32 0, i32 0, i32 0)
 void test_mfma_f32_16x16x4f32(global v4f* out, float a, float b, v4f c)
 {
   *out = __builtin_amdgcn_mfma_f32_16x16x4f32(a, b, c, 0, 0, 0);
 }
 
-// CHECK-LABEL: @test_mfma_f32_32x32x4f16
-// CHECK: call <32 x float> @llvm.amdgcn.mfma.f32.32x32x4f16(<4 x half> %a, <4 x half> %b, <32 x float> %c, i32 0, i32 0, i32 0)
+// CHECK-GFX908-LABEL: @test_mfma_f32_32x32x4f16
+// CHECK-GFX908: call <32 x float> @llvm.amdgcn.mfma.f32.32x32x4f16(<4 x half> %a, <4 x half> %b, <32 x float> %c, i32 0, i32 0, i32 0)
 void test_mfma_f32_32x32x4f16(global v32f* out, v4h a, v4h b, v32f c)
 {
   *out = __builtin_amdgcn_mfma_f32_32x32x4f16(a, b, c, 0, 0, 0);
 }
 
-// CHECK-LABEL: @test_mfma_f32_16x16x4f16
-// CHECK: call <16 x float> @llvm.amdgcn.mfma.f32.16x16x4f16(<4 x half> %a, <4 x half> %b, <16 x float> %c, i32 0, i32 0, i32 0)
+// CHECK-GFX908-LABEL: @test_mfma_f32_16x16x4f16
+// CHECK-GFX908: call <16 x float> @llvm.amdgcn.mfma.f32.16x16x4f16(<4 x half> %a, <4 x half> %b, <16 x float> %c, i32 0, i32 0, i32 0)
 void test_mfma_f32_16x16x4f16(global v16f* out, v4h a, v4h b, v16f c)
 {
   *out = __builtin_amdgcn_mfma_f32_16x16x4f16(a, b, c, 0, 0, 0);
 }
 
-// CHECK-LABEL: @test_mfma_f32_4x4x4f16
-// CHECK: call <4 x float> @llvm.amdgcn.mfma.f32.4x4x4f16(<4 x half> %a, <4 x half> %b, <4 x float> %c, i32 0, i32 0, i32 0)
+// CHECK-GFX908-LABEL: @test_mfma_f32_4x4x4f16
+// CHECK-GFX908: call <4 x float> @llvm.amdgcn.mfma.f32.4x4x4f16(<4 x half> %a, <4 x half> %b, <4 x float> %c, i32 0, i32 0, i32 0)
 void test_mfma_f32_4x4x4f16(global v4f* out, v4h a, v4h b, v4f c)
 {
   *out = __builtin_amdgcn_mfma_f32_4x4x4f16(a, b, c, 0, 0, 0);
 }
 
-// CHECK-LABEL: @test_mfma_f32_32x32x8f16
-// CHECK: call <16 x float> @llvm.amdgcn.mfma.f32.32x32x8f16(<4 x half> %a, <4 x half> %b, <16 x float> %c, i32 0, i32 0, i32 0)
+// CHECK-GFX908-LABEL: @test_mfma_f32_32x32x8f16
+// CHECK-GFX908: call <16 x float> @llvm.amdgcn.mfma.f32.32x32x8f16(<4 x half> %a, <4 x half> %b, <16 x float> %c, i32 0, i32 0, i32 0)
 void test_mfma_f32_32x32x8f16(global v16f* out, v4h a, v4h b, v16f c)
 {
   *out = __builtin_amdgcn_mfma_f32_32x32x8f16(a, b, c, 0, 0, 0);
 }
 
-// CHECK-LABEL: @test_mfma_f32_16x16x16f16
-// CHECK: call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %a, <4 x half> %b, <4 x float> %c, i32 0, i32 0, i32 0)
+// CHECK-GFX908-LABEL: @test_mfma_f32_16x16x16f16
+// CHECK-GFX908: call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %a, <4 x half> %b, <4 x float> %c, i32 0, i32 0, i32 0)
 void test_mfma_f32_16x16x16f16(global v4f* out, v4h a, v4h b, v4f c)
 {
   *out = __builtin_amdgcn_mfma_f32_16x16x16f16(a, b, c, 0, 0, 0);
 }
 
-// CHECK-LABEL: @test_mfma_i32_32x32x4i8
-// CHECK: call <32 x i32> @llvm.amdgcn.mfma.i32.32x32x4i8(i32 %a, i32 %b, <32 x i32> %c, i32 0, i32 0, i32 0)
+// CHECK-GFX908-LABEL: @test_mfma_i32_32x32x4i8
+// CHECK-GFX908: call <32 x i32> @llvm.amdgcn.mfma.i32.32x32x4i8(i32 %a, i32 %b, <32 x i32> %c, i32 0, i32 0, i32 0)
 void test_mfma_i32_32x32x4i8(global v32i* out, int a, int b, v32i c)
 {
   *out = __builtin_amdgcn_mfma_i32_32x32x4i8(a, b, c, 0, 0, 0);
 }
 
-// CHECK-LABEL: @test_mfma_i32_16x16x4i8
-// CHECK: call <16 x i32> @llvm.amdgcn.mfma.i32.16x16x4i8(i32 %a, i32 %b, <16 x i32> %c, i32 0, i32 0, i32 0)
+// CHECK-GFX908-LABEL: @test_mfma_i32_16x16x4i8
+// CHECK-GFX908: call <16 x i32> @llvm.amdgcn.mfma.i32.16x16x4i8(i32 %a, i32 %b, <16 x i32> %c, i32 0, i32 0, i32 0)
 void test_mfma_i32_16x16x4i8(global v16i* out, int a, int b, v16i c)
 {
   *out = __builtin_amdgcn_mfma_i32_16x16x4i8(a, b, c, 0, 0, 0);
 }
 
-// CHECK-LABEL: @test_mfma_i32_4x4x4i8
-// CHECK: call <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32 %a, i32 %b, <4 x i32> %c, i32 0, i32 0, i32 0)
+// CHECK-GFX908-LABEL: @test_mfma_i32_4x4x4i8
+// CHECK-GFX908: call <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32 %a, i32 %b, <4 x i32> %c, i32 0, i32 0, i32 0)
 void test_mfma_i32_4x4x4i8(global v4i* out, int a, int b, v4i c)
 {
   *out = __builtin_amdgcn_mfma_i32_4x4x4i8(a, b, c, 0, 0, 0);
 }
 
-// CHECK-LABEL: @test_mfma_i32_32x32x8i8
-// CHECK: call <16 x i32> @llvm.amdgcn.mfma.i32.32x32x8i8(i32 %a, i32 %b, <16 x i32> %c, i32 0, i32 0, i32 0)
+// CHECK-GFX908-LABEL: @test_mfma_i32_32x32x8i8
+// CHECK-GFX908: call <16 x i32> @llvm.amdgcn.mfma.i32.32x32x8i8(i32 %a, i32 %b, <16 x i32> %c, i32 0, i32 0, i32 0)
 void test_mfma_i32_32x32x8i8(global v16i* out, int a, int b, v16i c)
 {
   *out = __builtin_amdgcn_mfma_i32_32x32x8i8(a, b, c, 0, 0, 0);
 }
 
-// CHECK-LABEL: @test_mfma_i32_16x16x16i8
-// CHECK: call <4 x i32> @llvm.amdgcn.mfma.i32.16x16x16i8(i32 %a, i32 %b, <4 x i32> %c, i32 0, i32 0, i32 0)
+// CHECK-GFX908-LABEL: @test_mfma_i32_16x16x16i8
+// CHECK-GFX908: call <4 x i32> @llvm.amdgcn.mfma.i32.16x16x16i8(i32 %a, i32 %b, <4 x i32> %c, i32 0, i32 0, i32 0)
 void test_mfma_i32_16x16x16i8(global v4i* out, int a, int b, v4i c)
 {
   *out = __builtin_amdgcn_mfma_i32_16x16x16i8(a, b, c, 0, 0, 0);
 }
 
-// CHECK-LABEL: @test_mfma_f32_32x32x2bf16
-// CHECK: call <32 x float> @llvm.amdgcn.mfma.f32.32x32x2bf16(<2 x i16> %a, <2 x i16> %b, <32 x float> %c, i32 0, i32 0, i32 0)
+// CHECK-GFX908-LABEL: @test_mfma_f32_32x32x2bf16
+// CHECK-GFX908: call <32 x float> @llvm.amdgcn.mfma.f32.32x32x2bf16(<2 x i16> %a, <2 x i16> %b, <32 x float> %c, i32 0, i32 0, i32 0)
 void test_mfma_f32_32x32x2bf16(global v32f* out, v2s a, v2s b, v32f c)
 {
   *out = __builtin_amdgcn_mfma_f32_32x32x2bf16(a, b, c, 0, 0, 0);
 }
 
-// CHECK-LABEL: @test_mfma_f32_16x16x2bf16
-// CHECK: call <16 x float> @llvm.amdgcn.mfma.f32.16x16x2bf16(<2 x i16> %a, <2 x i16> %b, <16 x float> %c, i32 0, i32 0, i32 0)
+// CHECK-GFX908-LABEL: @test_mfma_f32_16x16x2bf16
+// CHECK-GFX908: call <16 x float> @llvm.amdgcn.mfma.f32.16x16x2bf16(<2 x i16> %a, <2 x i16> %b, <16 x float> %c, i32 0, i32 0, i32 0)
 void test_mfma_f32_16x16x2bf16(global v16f* out, v2s a, v2s b, v16f c)
 {
   *out = __builtin_amdgcn_mfma_f32_16x16x2bf16(a, b, c, 0, 0, 0);
 }
 
-// CHECK-LABEL: @test_mfma_f32_4x4x2bf16
-// CHECK: call <4 x float> @llvm.amdgcn.mfma.f32.4x4x2bf16(<2 x i16> %a, <2 x i16> %b, <4 x float> %c, i32 0, i32 0, i32 0)
+// CHECK-GFX908-LABEL: @test_mfma_f32_4x4x2bf16
+// CHECK-GFX908: call <4 x float> @llvm.amdgcn.mfma.f32.4x4x2bf16(<2 x i16> %a, <2 x i16> %b, <4 x float> %c, i32 0, i32 0, i32 0)
 void test_mfma_f32_4x4x2bf16(global v4f* out, v2s a, v2s b, v4f c)
 {
   *out = __builtin_amdgcn_mfma_f32_4x4x2bf16(a, b, c, 0, 0, 0);
 }
 
-// CHECK-LABEL: @test_mfma_f32_32x32x4bf16
-// CHECK: call <16 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16(<2 x i16> %a, <2 x i16> %b, <16 x float> %c, i32 0, i32 0, i32 0)
+// CHECK-GFX908-LABEL: @test_mfma_f32_32x32x4bf16
+// CHECK-GFX908: call <16 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16(<2 x i16> %a, <2 x i16> %b, <16 x float> %c, i32 0, i32 0, i32 0)
 void test_mfma_f32_32x32x4bf16(global v16f* out, v2s a, v2s b, v16f c)
 {
   *out = __builtin_amdgcn_mfma_f32_32x32x4bf16(a, b, c, 0, 0, 0);
 }
 
-// CHECK-LABEL: @test_mfma_f32_16x16x8bf16
-// CHECK: call <4 x float> @llvm.amdgcn.mfma.f32.16x16x8bf16(<2 x i16> %a, <2 x i16> %b, <4 x float> %c, i32 0, i32 0, i32 0)
+// CHECK-GFX908-LABEL: @test_mfma_f32_16x16x8bf16
+// CHECK-GFX908: call <4 x float> @llvm.amdgcn.mfma.f32.16x16x8bf16(<2 x i16> %a, <2 x i16> %b, <4 x float> %c, i32 0, i32 0, i32 0)
 void test_mfma_f32_16x16x8bf16(global v4f* out, v2s a, v2s b, v4f c)
 {
   *out = __builtin_amdgcn_mfma_f32_16x16x8bf16(a, b, c, 0, 0, 0);
 }
 
+#endif // MFMA_GFX908_TESTS
+
+#ifdef MFMA_GFX90A_TESTS
+
+// CHECK-GFX90A-LABEL: @test_mfma_f32_32x32x4bf16_1k
+// CHECK-GFX90A: call <32 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16.1k(<4 x i16> %a, <4 x i16> %b, <32 x float> %c, i32 0, i32 0, i32 0)
+void test_mfma_f32_32x32x4bf16_1k(global v32f* out, v4s a, v4s b, v32f c)
+{
+  *out = __builtin_amdgcn_mfma_f32_32x32x4bf16_1k(a, b, c, 0, 0, 0);
+}
+
+// CHECK-GFX90A-LABEL: @test_mfma_f32_16x16x4bf16_1k
+// CHECK-GFX90A: call <16 x float> @llvm.amdgcn.mfma.f32.16x16x4bf16.1k(<4 x i16> %a, <4 x i16> %b, <16 x float> %c, i32 0, i32 0, i32 0)
+void test_mfma_f32_16x16x4bf16_1k(global v16f* out, v4s a, v4s b, v16f c)
+{
+  *out = __builtin_amdgcn_mfma_f32_16x16x4bf16_1k(a, b, c, 0, 0, 0);
+}
+
+// CHECK-GFX90A-LABEL: @test_mfma_f32_4x4x4bf16_1k
+// CHECK-GFX90A: call <4 x float> @llvm.amdgcn.mfma.f32.4x4x4bf16.1k(<4 x i16> %a, <4 x i16> %b, <4 x float> %c, i32 0, i32 0, i32 0)
+void test_mfma_f32_4x4x4bf16_1k(global v4f* out, v4s a, v4s b, v4f c)
+{
+  *out = __builtin_amdgcn_mfma_f32_4x4x4bf16_1k(a, b, c, 0, 0, 0);
+}
+
+// CHECK-GFX90A-LABEL: @test_mfma_f32_32x32x8bf16_1k
+// CHECK-GFX90A: call <16 x float> @llvm.amdgcn.mfma.f32.32x32x8bf16.1k(<4 x i16> %a, <4 x i16> %b, <16 x float> %c, i32 0, i32 0, i32 0)
+void test_mfma_f32_32x32x8bf16_1k(global v16f* out, v4s a, v4s b, v16f c)
+{
+  *out = __builtin_amdgcn_mfma_f32_32x32x8bf16_1k(a, b, c, 0, 0, 0);
+}
+
+// CHECK-GFX90A-LABEL: @test_mfma_f32_16x16x16bf16_1k
+// CHECK-GFX90A: call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16bf16.1k(<4 x i16> %a, <4 x i16> %b, <4 x float> %c, i32 0, i32 0, i32 0)
+void test_mfma_f32_16x16x16bf16_1k(global v4f* out, v4s a, v4s b, v4f c)
+{
+  *out = __builtin_amdgcn_mfma_f32_16x16x16bf16_1k(a, b, c, 0, 0, 0);
+}
+
+// CHECK-GFX90A-LABEL: @test_mfma_f64_16x16x4f64
+// CHECK-GFX90A: call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> %c, i32 0, i32 0, i32 0)
+void test_mfma_f64_16x16x4f64(global v4d* out, double a, double b, v4d c)
+{
+  *out = __builtin_amdgcn_mfma_f64_16x16x4f64(a, b, c, 0, 0, 0);
+}
+
+// CHECK-GFX90A-LABEL: @test_mfma_f64_4x4x4f64
+// CHECK-GFX90A: call double @llvm.amdgcn.mfma.f64.4x4x4f64(double %a, double %b, double %c, i32 0, i32 0, i32 0)
+void test_mfma_f64_4x4x4f64(global double* out, double a, double b, double c)
+{
+  *out = __builtin_amdgcn_mfma_f64_4x4x4f64(a, b, c, 0, 0, 0);
+}
+
+#endif // MFMA_GFX90A_TESTS

diff  --git a/clang/test/Driver/amdgpu-features.c b/clang/test/Driver/amdgpu-features.c
index 42ae3f0889d1..98383cd53410 100644
--- a/clang/test/Driver/amdgpu-features.c
+++ b/clang/test/Driver/amdgpu-features.c
@@ -22,6 +22,11 @@
 // RUN: %clang -### -target amdgcn-amdhsa -mcpu=gfx908:sramecc- %s 2>&1 | FileCheck --check-prefix=NO-SRAM-ECC %s
 // NO-SRAM-ECC: "-target-feature" "-sramecc"
 
+// RUN: %clang -### -target amdgcn -mcpu=gfx90A -mtgsplit %s 2>&1 | FileCheck --check-prefix=TGSPLIT %s
+// RUN: %clang -### -target amdgcn -mcpu=gfx90A -mno-tgsplit %s 2>&1 | FileCheck --check-prefix=NO-TGSPLIT %s
+// TGSPLIT: "-target-feature" "+tgsplit"
+// NO-TGSPLIT: "-target-feature" "-tgsplit"
+
 // RUN: %clang -### -target amdgcn-amdpal -mcpu=gfx1010 -mwavefrontsize64 %s 2>&1 | FileCheck --check-prefix=WAVE64 %s
 // RUN: %clang -### -target amdgcn-amdpal -mcpu=gfx1010 -mno-wavefrontsize64 -mwavefrontsize64 %s 2>&1 | FileCheck --check-prefix=WAVE64 %s
 // WAVE64: "-target-feature" "+wavefrontsize64"

diff  --git a/clang/test/Driver/amdgpu-macros.cl b/clang/test/Driver/amdgpu-macros.cl
index d708e29e1acf..297e82f6c224 100644
--- a/clang/test/Driver/amdgpu-macros.cl
+++ b/clang/test/Driver/amdgpu-macros.cl
@@ -105,6 +105,7 @@
 // RUN: %clang -E -dM -target amdgcn -mcpu=gfx906 %s 2>&1 | FileCheck --check-prefixes=ARCH-GCN,FAST_FMAF %s -DWAVEFRONT_SIZE=64 -DCPU=gfx906
 // RUN: %clang -E -dM -target amdgcn -mcpu=gfx908 %s 2>&1 | FileCheck --check-prefixes=ARCH-GCN,FAST_FMAF %s -DWAVEFRONT_SIZE=64 -DCPU=gfx908
 // RUN: %clang -E -dM -target amdgcn -mcpu=gfx909 %s 2>&1 | FileCheck --check-prefixes=ARCH-GCN,FAST_FMAF %s -DWAVEFRONT_SIZE=64 -DCPU=gfx909
+// RUN: %clang -E -dM -target amdgcn -mcpu=gfx90a %s 2>&1 | FileCheck --check-prefixes=ARCH-GCN,FAST_FMAF %s -DWAVEFRONT_SIZE=64 -DCPU=gfx90a
 // RUN: %clang -E -dM -target amdgcn -mcpu=gfx90c %s 2>&1 | FileCheck --check-prefixes=ARCH-GCN,FAST_FMAF %s -DWAVEFRONT_SIZE=64 -DCPU=gfx90c
 // RUN: %clang -E -dM -target amdgcn -mcpu=gfx1010 %s 2>&1 | FileCheck --check-prefixes=ARCH-GCN,FAST_FMAF %s -DWAVEFRONT_SIZE=32 -DCPU=gfx1010
 // RUN: %clang -E -dM -target amdgcn -mcpu=gfx1011 %s 2>&1 | FileCheck --check-prefixes=ARCH-GCN,FAST_FMAF %s -DWAVEFRONT_SIZE=32 -DCPU=gfx1011

diff  --git a/clang/test/Driver/amdgpu-mcpu.cl b/clang/test/Driver/amdgpu-mcpu.cl
index 1a6641072f90..1dac9883a561 100644
--- a/clang/test/Driver/amdgpu-mcpu.cl
+++ b/clang/test/Driver/amdgpu-mcpu.cl
@@ -90,6 +90,7 @@
 // RUN: %clang -### -target amdgcn -mcpu=gfx906 %s 2>&1 | FileCheck --check-prefix=GFX906 %s
 // RUN: %clang -### -target amdgcn -mcpu=gfx908 %s 2>&1 | FileCheck --check-prefix=GFX908 %s
 // RUN: %clang -### -target amdgcn -mcpu=gfx909 %s 2>&1 | FileCheck --check-prefix=GFX909 %s
+// RUN: %clang -### -target amdgcn -mcpu=gfx90a %s 2>&1 | FileCheck --check-prefix=GFX90A %s
 // RUN: %clang -### -target amdgcn -mcpu=gfx90c %s 2>&1 | FileCheck --check-prefix=GFX90C %s
 // RUN: %clang -### -target amdgcn -mcpu=gfx1010 %s 2>&1 | FileCheck --check-prefix=GFX1010 %s
 // RUN: %clang -### -target amdgcn -mcpu=gfx1011 %s 2>&1 | FileCheck --check-prefix=GFX1011 %s
@@ -120,6 +121,7 @@
 // GFX906:    "-target-cpu" "gfx906"
 // GFX908:    "-target-cpu" "gfx908"
 // GFX909:    "-target-cpu" "gfx909"
+// GFX90A:    "-target-cpu" "gfx90a"
 // GFX90C:    "-target-cpu" "gfx90c"
 // GFX1010:   "-target-cpu" "gfx1010"
 // GFX1011:   "-target-cpu" "gfx1011"

diff  --git a/clang/test/Driver/cuda-bad-arch.cu b/clang/test/Driver/cuda-bad-arch.cu
index 321d67e75fe8..22a7be1e7d83 100644
--- a/clang/test/Driver/cuda-bad-arch.cu
+++ b/clang/test/Driver/cuda-bad-arch.cu
@@ -25,6 +25,8 @@
 // RUN: | FileCheck -check-prefix OK %s
 // RUN: %clang -### -target x86_64-linux-gnu --cuda-gpu-arch=gfx908 -c %s 2>&1 \
 // RUN: | FileCheck -check-prefix OK %s
+// RUN: %clang -### -target x86_64-linux-gnu --cuda-gpu-arch=gfx90a -c %s 2>&1 \
+// RUN: | FileCheck -check-prefix OK %s
 // RUN: %clang -### -target x86_64-linux-gnu -c %s 2>&1 \
 // RUN: | FileCheck -check-prefix OK %s
 

diff  --git a/clang/test/Driver/hip-toolchain-features.hip b/clang/test/Driver/hip-toolchain-features.hip
index 13da70fe4fc0..bdece1a76d1e 100644
--- a/clang/test/Driver/hip-toolchain-features.hip
+++ b/clang/test/Driver/hip-toolchain-features.hip
@@ -48,6 +48,18 @@
 // ALL3: {{.*}}clang{{.*}}"-target-feature" "+sramecc" "-target-feature" "+xnack"
 // NOALL3: {{.*}}clang{{.*}}"-target-feature" "-sramecc" "-target-feature" "-xnack"
 
+// RUN: %clang -### -target x86_64-linux-gnu -fgpu-rdc -nogpulib \
+// RUN:   --cuda-gpu-arch=gfx1010 %s \
+// RUN:   -mtgsplit  2>&1 | FileCheck %s -check-prefix=TGSPLIT
+// RUN: %clang -### -target x86_64-linux-gnu -fgpu-rdc -nogpulib \
+// RUN:   --cuda-gpu-arch=gfx1010 %s \
+// RUN:   -mno-tgsplit  2>&1 | FileCheck %s -check-prefix=NOTTGSPLIT
+
+// TGSPLIT: {{.*}}clang{{.*}}"-target-feature" "+tgsplit"
+// NOTTGSPLIT: {{.*}}clang{{.*}}"-target-feature" "-tgsplit"
+// TGSPLIT: {{.*}}lld{{.*}} "-plugin-opt=-mattr=+tgsplit"
+// NOTTGSPLIT: {{.*}}lld{{.*}} "-plugin-opt=-mattr=-tgsplit"
+
 // RUN: %clang -### -target x86_64-linux-gnu -fgpu-rdc -nogpulib \
 // RUN:   --cuda-gpu-arch=gfx1010 %s \
 // RUN:   -mcumode -mcumode -mno-cumode -mwavefrontsize64 -mcumode \

diff  --git a/clang/test/Misc/target-invalid-cpu-note.c b/clang/test/Misc/target-invalid-cpu-note.c
index c61f17986ec5..43ccdcc4a3cb 100644
--- a/clang/test/Misc/target-invalid-cpu-note.c
+++ b/clang/test/Misc/target-invalid-cpu-note.c
@@ -86,7 +86,7 @@
 // AMDGCN-SAME: gfx703, kabini, mullins, gfx704, bonaire, gfx705, gfx801, carrizo,
 // AMDGCN-SAME: gfx802, iceland, tonga, gfx803, fiji, polaris10, polaris11,
 // AMDGCN-SAME: gfx805, tongapro, gfx810, stoney, gfx900, gfx902, gfx904, gfx906,
-// AMDGCN-SAME: gfx908, gfx909, gfx90c, gfx1010, gfx1011, gfx1012, gfx1030, gfx1031,
+// AMDGCN-SAME: gfx908, gfx909, gfx90a, gfx90c, gfx1010, gfx1011, gfx1012, gfx1030, gfx1031,
 // AMDGCN-SAME: gfx1032, gfx1033
 
 // RUN: not %clang_cc1 -triple wasm64--- -target-cpu not-a-cpu -fsyntax-only %s 2>&1 | FileCheck %s --check-prefix WEBASM

diff  --git a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx90a-param.cl b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx90a-param.cl
new file mode 100644
index 000000000000..701016148a89
--- /dev/null
+++ b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx90a-param.cl
@@ -0,0 +1,67 @@
+// RUN: %clang_cc1 -triple amdgcn-- -target-cpu gfx90a -verify -S -o - %s
+
+#pragma OPENCL EXTENSION cl_khr_fp64:enable
+
+typedef float  v4f   __attribute__((ext_vector_type(4)));
+typedef float  v16f  __attribute__((ext_vector_type(16)));
+typedef float  v32f  __attribute__((ext_vector_type(32)));
+typedef half   v4h   __attribute__((ext_vector_type(4)));
+typedef half   v16h  __attribute__((ext_vector_type(16)));
+typedef half   v32h  __attribute__((ext_vector_type(32)));
+typedef int    v4i   __attribute__((ext_vector_type(4)));
+typedef int    v16i  __attribute__((ext_vector_type(16)));
+typedef int    v32i  __attribute__((ext_vector_type(32)));
+typedef short  v2s   __attribute__((ext_vector_type(2)));
+typedef short  v4s   __attribute__((ext_vector_type(4)));
+typedef short  v16s  __attribute__((ext_vector_type(16)));
+typedef short  v32s  __attribute__((ext_vector_type(32)));
+typedef double v4d   __attribute__((ext_vector_type(4)));
+
+void test_mfma_f32_32x32x4bf16_1k(global v32f* out, v4s a, v4s b, v32f c, int d)
+{
+  *out = __builtin_amdgcn_mfma_f32_32x32x4bf16_1k(a, b, c, d, 0, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_32x32x4bf16_1k' must be a constant integer}}
+  *out = __builtin_amdgcn_mfma_f32_32x32x4bf16_1k(a, b, c, 0, d, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_32x32x4bf16_1k' must be a constant integer}}
+  *out = __builtin_amdgcn_mfma_f32_32x32x4bf16_1k(a, b, c, 0, 0, d); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_32x32x4bf16_1k' must be a constant integer}}
+}
+
+void test_mfma_f32_16x16x4bf16_1k(global v16f* out, v4s a, v4s b, v16f c, int d)
+{
+  *out = __builtin_amdgcn_mfma_f32_16x16x4bf16_1k(a, b, c, d, 0, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_16x16x4bf16_1k' must be a constant integer}}
+  *out = __builtin_amdgcn_mfma_f32_16x16x4bf16_1k(a, b, c, 0, d, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_16x16x4bf16_1k' must be a constant integer}}
+  *out = __builtin_amdgcn_mfma_f32_16x16x4bf16_1k(a, b, c, 0, 0, d); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_16x16x4bf16_1k' must be a constant integer}}
+}
+
+void test_mfma_f32_4x4x4bf16_1k(global v4f* out, v4s a, v4s b, v4f c, int d)
+{
+  *out = __builtin_amdgcn_mfma_f32_4x4x4bf16_1k(a, b, c, d, 0, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_4x4x4bf16_1k' must be a constant integer}}
+  *out = __builtin_amdgcn_mfma_f32_4x4x4bf16_1k(a, b, c, 0, d, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_4x4x4bf16_1k' must be a constant integer}}
+  *out = __builtin_amdgcn_mfma_f32_4x4x4bf16_1k(a, b, c, 0, 0, d); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_4x4x4bf16_1k' must be a constant integer}}
+}
+
+void test_mfma_f32_32x32x8bf16_1k(global v16f* out, v4s a, v4s b, v16f c, int d)
+{
+  *out = __builtin_amdgcn_mfma_f32_32x32x8bf16_1k(a, b, c, d, 0, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_32x32x8bf16_1k' must be a constant integer}}
+  *out = __builtin_amdgcn_mfma_f32_32x32x8bf16_1k(a, b, c, 0, d, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_32x32x8bf16_1k' must be a constant integer}}
+  *out = __builtin_amdgcn_mfma_f32_32x32x8bf16_1k(a, b, c, 0, 0, d); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_32x32x8bf16_1k' must be a constant integer}}
+}
+
+void test_mfma_f32_16x16x16bf16_1k(global v4f* out, v4s a, v4s b, v4f c, int d)
+{
+  *out = __builtin_amdgcn_mfma_f32_16x16x16bf16_1k(a, b, c, d, 0, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_16x16x16bf16_1k' must be a constant integer}}
+  *out = __builtin_amdgcn_mfma_f32_16x16x16bf16_1k(a, b, c, 0, d, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_16x16x16bf16_1k' must be a constant integer}}
+  *out = __builtin_amdgcn_mfma_f32_16x16x16bf16_1k(a, b, c, 0, 0, d); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_16x16x16bf16_1k' must be a constant integer}}
+}
+
+void test_mfma_f64_16x16x4f64(global v4d* out, double a, double b, v4d c, int d)
+{
+  *out = __builtin_amdgcn_mfma_f64_16x16x4f64(a, b, c, d, 0, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_f64_16x16x4f64' must be a constant integer}}
+  *out = __builtin_amdgcn_mfma_f64_16x16x4f64(a, b, c, 0, d, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_f64_16x16x4f64' must be a constant integer}}
+  *out = __builtin_amdgcn_mfma_f64_16x16x4f64(a, b, c, 0, 0, d); // expected-error{{argument to '__builtin_amdgcn_mfma_f64_16x16x4f64' must be a constant integer}}
+}
+
+void test_mfma_f64_4x4x4f64(global double* out, double a, double b, double c, int d)
+{
+  *out = __builtin_amdgcn_mfma_f64_4x4x4f64(a, b, c, d, 0, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_f64_4x4x4f64' must be a constant integer}}
+  *out = __builtin_amdgcn_mfma_f64_4x4x4f64(a, b, c, 0, d, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_f64_4x4x4f64' must be a constant integer}}
+  *out = __builtin_amdgcn_mfma_f64_4x4x4f64(a, b, c, 0, 0, d); // expected-error{{argument to '__builtin_amdgcn_mfma_f64_4x4x4f64' must be a constant integer}}
+}

diff  --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index f9e1ca293d88..141bd2f08c18 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -354,6 +354,13 @@ Every processor supports every OS ABI (see :ref:`amdgpu-os`) with the following
                                                                                                         Add product
                                                                                                         names.
 
+     ``gfx90a``                  ``amdgcn``   dGPU  - sramecc         - Absolute      - *rocm-amdhsa* *TBA*
+                                                    - tgsplit           flat
+                                                    - xnack             scratch                       .. TODO::
+                                                                      - Packed
+                                                                        work-item                       Add product
+                                                                        IDs                             names.
+
      ``gfx90c``                  ``amdgcn``   APU   - xnack           - Absolute      - *pal-amdpal*  - Ryzen 7 4700G
                                                                         flat                          - Ryzen 7 4700GE
                                                                         scratch                       - Ryzen 5 4600G
@@ -481,6 +488,11 @@ For example:
                                                   loaded and executed in a process with either
                                                   setting of SRAMECC.
 
+     tgsplit           ``-m[no-]tgsplit``         Enable/disable generating code that assumes
+                                                  work-groups are launched in threadgroup split mode.
+                                                  When enabled the waves of a work-group may be
+                                                  launched in 
diff erent CUs.
+
      wavefrontsize64 - ``-m[no-]wavefrontsize64`` Control the wavefront size used when
                                                   generating code for kernels. When disabled
                                                   native wavefront size 32 is used, when enabled
@@ -1119,6 +1131,7 @@ The AMDGPU backend uses the following ELF header:
      ``EF_AMDGPU_MACH_AMDGCN_GFX906``     0x02f      ``gfx906``
      ``EF_AMDGPU_MACH_AMDGCN_GFX908``     0x030      ``gfx908``
      ``EF_AMDGPU_MACH_AMDGCN_GFX909``     0x031      ``gfx909``
+     ``EF_AMDGPU_MACH_AMDGCN_GFX90A``     0x03f      ``gfx90a``
      ``EF_AMDGPU_MACH_AMDGCN_GFX90C``     0x032      ``gfx90c``
      ``EF_AMDGPU_MACH_AMDGCN_GFX1010``    0x033      ``gfx1010``
      ``EF_AMDGPU_MACH_AMDGCN_GFX1011``    0x034      ``gfx1011``
@@ -3594,6 +3607,14 @@ The fields used by CP for code objects before V3 also match those specified in
              bytes
      383:352 4 bytes COMPUTE_PGM_RSRC3               GFX6-GFX9
                                                        Reserved, must be 0.
+                                                     GFX90A
+                                                       Compute Shader (CS)
+                                                       program settings used by
+                                                       CP to set up
+                                                       ``COMPUTE_PGM_RSRC3``
+                                                       configuration
+                                                       register. See
+                                                       :ref:`amdgpu-amdhsa-compute_pgm_rsrc3-gfx90a-table`.
                                                      GFX10
                                                        Compute Shader (CS)
                                                        program settings used by
@@ -3672,6 +3693,11 @@ The fields used by CP for code objects before V3 also match those specified in
                                                      GFX6-GFX9
                                                        - vgprs_used 0..256
                                                        - max(0, ceil(vgprs_used / 4) - 1)
+                                                     GFX90A
+                                                       - vgprs_used 0..512
+                                                       - vgprs_used = align(arch_vgprs, 4)
+                                                                      + acc_vgprs
+                                                       - max(0, ceil(vgprs_used / 8) - 1)
                                                      GFX10 (wavefront size 64)
                                                        - max_vgpr 1..256
                                                        - max(0, ceil(vgprs_used / 4) - 1)
@@ -4099,6 +4125,29 @@ The fields used by CP for code objects before V3 also match those specified in
      32      **Total size 4 bytes.**
      ======= ===================================================================================================================
 
+..
+
+  .. table:: compute_pgm_rsrc3 for GFX90A
+     :name: amdgpu-amdhsa-compute_pgm_rsrc3-gfx90a-table
+
+     ======= ======= =============================== ===========================================================================
+     Bits    Size    Field Name                      Description
+     ======= ======= =============================== ===========================================================================
+     5:0     6 bits  ACCUM_OFFSET                    Offset of a first AccVGPR in the unified register file. Granularity 4.
+                                                     Value 0-63. 0 - accum-offset = 4, 1 - accum-offset = 8, ...,
+                                                     63 - accum-offset = 256.
+     6:15    10                                      Reserved, must be 0.
+             bits
+     16      1 bit   TG_SPLIT                        - If 0 the waves of a work-group are
+                                                       launched in the same CU.
+                                                     - If 1 the waves of a work-group can be
+                                                       launched in 
diff erent CUs. The waves
+                                                       cannot use S_BARRIER or LDS.
+     17:31   15                                      Reserved, must be 0.
+             bits
+     32      **Total size 4 bytes.**
+     ======= ===================================================================================================================
+
 ..
 
   .. table:: compute_pgm_rsrc3 for GFX10
@@ -4295,11 +4344,19 @@ for enabled registers are dense starting at VGPR0: the first enabled register is
 VGPR0, the next enabled register is VGPR1 etc.; disabled registers do not have a
 VGPR number.
 
-VGPR register initial state is defined in
-:ref:`amdgpu-amdhsa-vgpr-register-set-up-order-table`.
+There are 
diff erent methods used for the VGPR initial state:
 
-  .. table:: VGPR Register Set Up Order
-     :name: amdgpu-amdhsa-vgpr-register-set-up-order-table
+* Unless the *Target Properties* column of :ref:`amdgpu-processor-table`
+  specifies otherwise, a separate VGPR register is used per work-item ID. The
+  VGPR register initial state for this method is defined in
+  :ref:`amdgpu-amdhsa-vgpr-register-set-up-order-for-unpacked-work-item-id-method-table`.
+* If *Target Properties* column of :ref:`amdgpu-processor-table`
+  specifies *Packed work-item IDs*, the initial value of VGPR0 register is used
+  for all work-item IDs. The register layout for this method is defined in
+  :ref:`amdgpu-amdhsa-register-layout-for-packed-work-item-id-method-table`.
+
+  .. table:: VGPR Register Set Up Order for Unpacked Work-Item ID Method
+     :name: amdgpu-amdhsa-vgpr-register-set-up-order-for-unpacked-work-item-id-method-table
 
      ========== ========================== ====== ==============================
      VGPR Order Name                       Number Description
@@ -4317,6 +4374,35 @@ VGPR register initial state is defined in
                 > 1)                              wavefront lane.
      ========== ========================== ====== ==============================
 
+..
+
+  .. table:: Register Layout for Packed Work-Item ID Method
+     :name: amdgpu-amdhsa-register-layout-for-packed-work-item-id-method-table
+
+     ======= ======= ================ =========================================
+     Bits    Size    Field Name       Description
+     ======= ======= ================ =========================================
+     0:9     10 bits Work-Item Id X   Work-item id in X
+                                      dimension of work-group for
+                                      wavefront lane.
+
+                                      Always initialized.
+
+     10:19   10 bits Work-Item Id Y   Work-item id in Y
+                                      dimension of work-group for
+                                      wavefront lane.
+
+                                      Initialized if enable_vgpr_workitem_id >
+                                      0, otherwise set to 0.
+     20:29   10 bits Work-Item Id Z   Work-item id in Z
+                                      dimension of work-group for
+                                      wavefront lane.
+
+                                      Initialized if enable_vgpr_workitem_id >
+                                      1, otherwise set to 0.
+     30:31   2 bits                   Reserved, set to 0.
+     ======= ======= ================ =========================================
+
 The setting of registers is done by GPU CP/ADC/SPI hardware as follows:
 
 1. SGPRs before the Work-Group Ids are set by CP using the 16 User Data
@@ -4651,6 +4737,7 @@ The code sequences used to implement the memory model are defined in the
 following sections:
 
 * :ref:`amdgpu-amdhsa-memory-model-gfx6-gfx9`
+* :ref:`amdgpu-amdhsa-memory-model-gfx90a`
 * :ref:`amdgpu-amdhsa-memory-model-gfx10`
 
 .. _amdgpu-amdhsa-memory-model-gfx6-gfx9:
@@ -5914,81 +6001,93 @@ in table :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx6-gfx9-table`.
                                - system                  for OpenCL.*
      ============ ============ ============== ========== ================================
 
-.. _amdgpu-amdhsa-memory-model-gfx10:
+.. _amdgpu-amdhsa-memory-model-gfx90a:
 
-Memory Model GFX10
-++++++++++++++++++
+Memory Model GFX90A
++++++++++++++++++++
 
-For GFX10:
+For GFX90A:
 
 * Each agent has multiple shader arrays (SA).
-* Each SA has multiple work-group processors (WGP).
-* Each WGP has multiple compute units (CU).
+* Each SA has multiple compute units (CU).
 * Each CU has multiple SIMDs that execute wavefronts.
-* The wavefronts for a single work-group are executed in the same
-  WGP. In CU wavefront execution mode the wavefronts may be executed by
-  
diff erent SIMDs in the same CU. In WGP wavefront execution mode the
-  wavefronts may be executed by 
diff erent SIMDs in 
diff erent CUs in the same
-  WGP.
-* Each WGP has a single LDS memory shared by the wavefronts of the work-groups
-  executing on it.
-* All LDS operations of a WGP are performed as wavefront wide operations in a
+* The wavefronts for a single work-group are executed in the same CU but may be
+  executed by 
diff erent SIMDs. The exception is when in tgsplit execution mode
+  when the wavefronts may be executed by 
diff erent SIMDs in 
diff erent CUs.
+* Each CU has a single LDS memory shared by the wavefronts of the work-groups
+  executing on it. The exception is when in tgsplit execution mode when no LDS
+  is allocated as wavefronts of the same work-group can be in 
diff erent CUs.
+* All LDS operations of a CU are performed as wavefront wide operations in a
   global order and involve no caching. Completion is reported to a wavefront in
   execution order.
 * The LDS memory has multiple request queues shared by the SIMDs of a
-  WGP. Therefore, the LDS operations performed by 
diff erent wavefronts of a
+  CU. Therefore, the LDS operations performed by 
diff erent wavefronts of a
   work-group can be reordered relative to each other, which can result in
   reordering the visibility of vector memory operations with respect to LDS
   operations of other wavefronts in the same work-group. A ``s_waitcnt
   lgkmcnt(0)`` is required to ensure synchronization between LDS operations and
   vector memory operations between wavefronts of a work-group, but not between
   operations performed by the same wavefront.
-* The vector memory operations are performed as wavefront wide operations.
-  Completion of load/store/sample operations are reported to a wavefront in
-  execution order of other load/store/sample operations performed by that
-  wavefront.
-* The vector memory operations access a vector L0 cache. There is a single L0
-  cache per CU. Each SIMD of a CU accesses the same L0 cache. Therefore, no
-  special action is required for coherence between the lanes of a single
-  wavefront. However, a ``buffer_gl0_inv`` is required for coherence between
-  wavefronts executing in the same work-group as they may be executing on SIMDs
-  of 
diff erent CUs that access 
diff erent L0s. A ``buffer_gl0_inv`` is also
-  required for coherence between wavefronts executing in 
diff erent work-groups
-  as they may be executing on 
diff erent WGPs.
-* The scalar memory operations access a scalar L0 cache shared by all wavefronts
-  on a WGP. The scalar and vector L0 caches are not coherent. However, scalar
-  operations are used in a restricted way so do not impact the memory model. See
-  :ref:`amdgpu-amdhsa-memory-spaces`.
-* The vector and scalar memory L0 caches use an L1 cache shared by all WGPs on
-  the same SA. Therefore, no special action is required for coherence between
-  the wavefronts of a single work-group. However, a ``buffer_gl1_inv`` is
-  required for coherence between wavefronts executing in 
diff erent work-groups
-  as they may be executing on 
diff erent SAs that access 
diff erent L1s.
-* The L1 caches have independent quadrants to service disjoint ranges of virtual
-  addresses.
-* Each L0 cache has a separate request queue per L1 quadrant. Therefore, the
-  vector and scalar memory operations performed by 
diff erent wavefronts, whether
-  executing in the same or 
diff erent work-groups (which may be executing on
-  
diff erent CUs accessing 
diff erent L0s), can be reordered relative to each
-  other. A ``s_waitcnt vmcnt(0) & vscnt(0)`` is required to ensure
-  synchronization between vector memory operations of 
diff erent wavefronts. It
-  ensures a previous vector memory operation has completed before executing a
-  subsequent vector memory or LDS operation and so can be used to meet the
-  requirements of acquire, release and sequential consistency.
-* The L1 caches use an L2 cache shared by all SAs on the same agent.
-* The L2 cache has independent channels to service disjoint ranges of virtual
-  addresses.
-* Each L1 quadrant of a single SA accesses a 
diff erent L2 channel. Each L1
-  quadrant has a separate request queue per L2 channel. Therefore, the vector
-  and scalar memory operations performed by wavefronts executing in 
diff erent
-  work-groups (which may be executing on 
diff erent SAs) of an agent can be
-  reordered relative to each other. A ``s_waitcnt vmcnt(0) & vscnt(0)`` is
-  required to ensure synchronization between vector memory operations of
-  
diff erent SAs. It ensures a previous vector memory operation has completed
-  before executing a subsequent vector memory and so can be used to meet the
-  requirements of acquire, release and sequential consistency.
-* The L2 cache can be kept coherent with other agents on some targets, or ranges
-  of virtual addresses can be set up to bypass it to ensure system coherence.
+* The vector memory operations are performed as wavefront wide operations and
+  completion is reported to a wavefront in execution order. The exception is
+  that ``flat_load/store/atomic`` instructions can report out of vector memory
+  order if they access LDS memory, and out of LDS operation order if they access
+  global memory.
+* The vector memory operations access a single vector L1 cache shared by all
+  SIMDs a CU. Therefore:
+
+  * No special action is required for coherence between the lanes of a single
+    wavefront.
+
+  * No special action is required for coherence between wavefronts in the same
+    work-group since they exeute on the same CU. The exception is when in
+    tgsplit execution mode as wavefronts of the same work-group can be in
+    
diff erent CUs and so a ``buffer_wbinvl1_vol`` is required as described in
+    the following item.
+
+  * A ``buffer_wbinvl1_vol`` is required for coherence between wavefronts
+    executing in 
diff erent work-groups as they may be executing on 
diff erent
+    CUs.
+
+* The scalar memory operations access a scalar L1 cache shared by all wavefronts
+  on a group of CUs. The scalar and vector L1 caches are not coherent. However,
+  scalar operations are used in a restricted way so do not impact the memory
+  model. See :ref:`amdgpu-amdhsa-memory-spaces`.
+* The vector and scalar memory operations use an L2 cache shared by all CUs on
+  the same agent.
+
+  * The L2 cache has independent channels to service disjoint ranges of virtual
+    addresses.
+  * Each CU has a separate request queue per channel. Therefore, the vector and
+    scalar memory operations performed by wavefronts executing in 
diff erent
+    work-groups (which may be executing on 
diff erent CUs), or the same
+    work-group if executing in tgsplit mode, of an agent can be reordered
+    relative to each other. A ``s_waitcnt vmcnt(0)`` is required to ensure
+    synchronization between vector memory operations of 
diff erent CUs. It
+    ensures a previous vector memory operation has completed before executing a
+    subsequent vector memory or LDS operation and so can be used to meet the
+    requirements of acquire and release.
+  * The L2 cache of one agent can be kept coherent with other agents by using
+    the MTYPE RW (read-write) for memory local to the L2, and MTYPE NC
+    (non-coherent) with the PTE C-bit set for memory not local to the L2.
+
+    * Any local memory cache lines will be automatically invalidated by writes
+      from CUs associated with other L2 caches, or writes from the CPU, due to
+      the cache probe caused by the PTE C-bit.
+    * XGMI accesses from the CPU to local memory may be cached on the CPU.
+      Subsequent access from the GPU will automatically invalidate or writeback
+      the CPU cache due to the L2 probe filter.
+    * Since all work-groups on the same agent share the same L2, no L2
+      invalidation or writeback is required for coherence.
+    * To ensure coherence of local memory writes of work-groups in 
diff erent
+      agents a ``buffer_wbl2`` is required. It will writeback dirty L2 cache
+      lines.
+    * To ensure coherence of local memory reads of work-groups in 
diff erent
+      agents a ``buffer_invl2`` is required. It will invalidate non-local L2
+      cache lines.
+
+  * PCIe access from the GPU to the CPU memory can be kept coherent by using the
+    MTYPE UC (uncached) which bypasses the L2.
 
 Scalar memory operations are only used to access memory that is proven to not
 change during the execution of the kernel dispatch. This includes constant
@@ -6010,49 +6109,29 @@ creates a frame at the same address, respectively. There is no need for a
 
 For kernarg backing memory:
 
-* CP invalidates the L0 and L1 caches at the start of each kernel dispatch.
-* On dGPU the kernarg backing memory is accessed as MTYPE UC (uncached) to avoid
-  needing to invalidate the L2 cache.
+* CP invalidates the L1 cache at the start of each kernel dispatch.
+* On dGPU over XGMI or PCIe the kernarg backing memory is allocated in host
+  memory accessed as MTYPE UC (uncached) to avoid needing to invalidate the L2
+  cache. This also causes it to be treated as non-volatile and so is not
+  invalidated by ``*_vol``.
 * On APU the kernarg backing memory is accessed as MTYPE CC (cache coherent) and
   so the L2 cache will be coherent with the CPU and other agents.
 
 Scratch backing memory (which is used for the private address space) is accessed
-with MTYPE NC (non-coherent). Since the private address space is only accessed
-by a single thread, and is always write-before-read, there is never a need to
-invalidate these entries from the L0 or L1 caches.
-
-Wavefronts are executed in native mode with in-order reporting of loads and
-sample instructions. In this mode vmcnt reports completion of load, atomic with
-return and sample instructions in order, and the vscnt reports the completion of
-store and atomic without return in order. See ``MEM_ORDERED`` field in
-:ref:`amdgpu-amdhsa-compute_pgm_rsrc1-gfx6-gfx10-table`.
-
-Wavefronts can be executed in WGP or CU wavefront execution mode:
-
-* In WGP wavefront execution mode the wavefronts of a work-group are executed
-  on the SIMDs of both CUs of the WGP. Therefore, explicit management of the per
-  CU L0 caches is required for work-group synchronization. Also accesses to L1
-  at work-group scope need to be explicitly ordered as the accesses from
-  
diff erent CUs are not ordered.
-* In CU wavefront execution mode the wavefronts of a work-group are executed on
-  the SIMDs of a single CU of the WGP. Therefore, all global memory access by
-  the work-group access the same L0 which in turn ensures L1 accesses are
-  ordered and so do not require explicit management of the caches for
-  work-group synchronization.
-
-See ``WGP_MODE`` field in
-:ref:`amdgpu-amdhsa-compute_pgm_rsrc1-gfx6-gfx10-table` and
-:ref:`amdgpu-target-features`.
+with MTYPE NC_NV (non-coherent non-volatile). Since the private address space is
+only accessed by a single thread, and is always write-before-read, there is
+never a need to invalidate these entries from the L1 cache. Hence all cache
+invalidates are done as ``*_vol`` to only invalidate the volatile cache lines.
 
-The code sequences used to implement the memory model for GFX10 are defined in
-table :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx10-table`.
+The code sequences used to implement the memory model for GFX90A are defined
+in table :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx90a-table`.
 
-  .. table:: AMDHSA Memory Model Code Sequences GFX10
-     :name: amdgpu-amdhsa-memory-model-code-sequences-gfx10-table
+  .. table:: AMDHSA Memory Model Code Sequences GFX90A
+     :name: amdgpu-amdhsa-memory-model-code-sequences-gfx90a-table
 
      ============ ============ ============== ========== ================================
      LLVM Instr   LLVM Memory  LLVM Memory    AMDGPU     AMDGPU Machine Code
-                  Ordering     Sync Scope     Address    GFX10
+                  Ordering     Sync Scope     Address    GFX90A
                                               Space
      ============ ============ ============== ========== ================================
      **Non-Atomic**
@@ -6064,12 +6143,12 @@ table :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx10-table`.
                                                          - !volatile & nontemporal
 
                                                            1. buffer/global/flat_load
-                                                              slc=1
+                                                              glc=1 slc=1
 
                                                          - volatile
 
                                                            1. buffer/global/flat_load
-                                                              glc=1 dlc=1
+                                                              glc=1 scc=1
                                                            2. s_waitcnt vmcnt(0)
 
                                                             - Must happen before
@@ -6091,13 +6170,14 @@ table :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx10-table`.
                                               - constant
                                                          - !volatile & nontemporal
 
-                                                            1. buffer/global/flat_store
-                                                               slc=1
+                                                           1. buffer/global/flat_store
+                                                              glc=1 slc=1
 
                                                          - volatile
 
                                                            1. buffer/global/flat_store
-                                                           2. s_waitcnt vscnt(0)
+                                                              scc=1
+                                                           2. s_waitcnt vmcnt(0)
 
                                                             - Must happen before
                                                               any following volatile
@@ -6124,30 +6204,40 @@ table :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx10-table`.
      load atomic  monotonic    - workgroup    - global   1. buffer/global/flat_load
                                               - generic     glc=1
 
-                                                           - If CU wavefront execution
+                                                           - If not TgSplit execution
                                                              mode, omit glc=1.
 
-     load atomic  monotonic    - singlethread - local    1. ds_load
-                               - wavefront
-                               - workgroup
+     load atomic  monotonic    - singlethread - local    *If TgSplit execution mode,
+                               - wavefront               local address space cannot
+                               - workgroup               be used.*
+
+                                                         1. ds_load
      load atomic  monotonic    - agent        - global   1. buffer/global/flat_load
-                               - system       - generic     glc=1 dlc=1
+                                              - generic     glc=1
+     load atomic  monotonic    - system       - global   1. buffer/global/flat_load
+                                              - generic     glc=1 scc=1
      store atomic monotonic    - singlethread - global   1. buffer/global/flat_store
                                - wavefront    - generic
                                - workgroup
                                - agent
-                               - system
-     store atomic monotonic    - singlethread - local    1. ds_store
-                               - wavefront
-                               - workgroup
+     store atomic monotonic    - system       - global   1. buffer/global/flat_store
+                                              - generic     scc=1
+     store atomic monotonic    - singlethread - local    *If TgSplit execution mode,
+                               - wavefront               local address space cannot
+                               - workgroup               be used.*
+
+                                                         1. ds_store
      atomicrmw    monotonic    - singlethread - global   1. buffer/global/flat_atomic
                                - wavefront    - generic
                                - workgroup
                                - agent
-                               - system
-     atomicrmw    monotonic    - singlethread - local    1. ds_atomic
-                               - wavefront
-                               - workgroup
+     atomicrmw    monotonic    - system       - global   1. buffer/global/flat_atomic
+                                              - generic     scc=1
+     atomicrmw    monotonic    - singlethread - local    *If TgSplit execution mode,
+                               - wavefront               local address space cannot
+                               - workgroup               be used.*
+
+                                                         1. ds_atomic
      **Acquire Atomic**
      ------------------------------------------------------------------------------------
      load atomic  acquire      - singlethread - global   1. buffer/global/ds/flat_load
@@ -6155,38 +6245,43 @@ table :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx10-table`.
                                               - generic
      load atomic  acquire      - workgroup    - global   1. buffer/global_load glc=1
 
-                                                           - If CU wavefront execution
+                                                           - If not TgSplit execution
                                                              mode, omit glc=1.
 
                                                          2. s_waitcnt vmcnt(0)
 
-                                                           - If CU wavefront execution
+                                                           - If not TgSplit execution
+                                                             mode, omit.
+                                                           - Must happen before the
+                                                             following buffer_wbinvl1_vol.
+
+                                                         3. buffer_wbinvl1_vol
+
+                                                           - If not TgSplit execution
                                                              mode, omit.
                                                            - Must happen before
-                                                             the following buffer_gl0_inv
-                                                             and before any following
+                                                             any following
                                                              global/generic
                                                              load/load
                                                              atomic/store/store
                                                              atomic/atomicrmw.
-
-                                                         3. buffer_gl0_inv
-
-                                                           - If CU wavefront execution
-                                                             mode, omit.
                                                            - Ensures that
                                                              following
                                                              loads will not see
                                                              stale data.
 
-     load atomic  acquire      - workgroup    - local    1. ds_load
+     load atomic  acquire      - workgroup    - local    *If TgSplit execution mode,
+                                                         local address space cannot
+                                                         be used.*
+
+                                                         1. ds_load
                                                          2. s_waitcnt lgkmcnt(0)
 
                                                            - If OpenCL, omit.
                                                            - Must happen before
-                                                             the following buffer_gl0_inv
-                                                             and before any following
-                                                             global/generic load/load
+                                                             any following
+                                                             global/generic
+                                                             load/load
                                                              atomic/store/store
                                                              atomic/atomicrmw.
                                                            - Ensures any
@@ -6196,31 +6291,21 @@ table :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx10-table`.
                                                              atomic value being
                                                              acquired.
 
-                                                         3. buffer_gl0_inv
-
-                                                           - If CU wavefront execution
-                                                             mode, omit.
-                                                           - If OpenCL, omit.
-                                                           - Ensures that
-                                                             following
-                                                             loads will not see
-                                                             stale data.
-
      load atomic  acquire      - workgroup    - generic  1. flat_load glc=1
 
-                                                           - If CU wavefront execution
+                                                           - If not TgSplit execution
                                                              mode, omit glc=1.
 
-                                                         2. s_waitcnt lgkmcnt(0) &
-                                                            vmcnt(0)
+                                                         2. s_waitcnt lgkm/vmcnt(0)
 
-                                                           - If CU wavefront execution
-                                                             mode, omit vmcnt(0).
-                                                           - If OpenCL, omit
-                                                             lgkmcnt(0).
+                                                           - Use lgkmcnt(0) if not
+                                                             TgSplit execution mode
+                                                             and vmcnt(0) if TgSplit
+                                                             execution mode.
+                                                           - If OpenCL, omit lgkmcnt(0).
                                                            - Must happen before
                                                              the following
-                                                             buffer_gl0_inv and any
+                                                             buffer_wbinvl1_vol and any
                                                              following global/generic
                                                              load/load
                                                              atomic/store/store
@@ -6232,9 +6317,9 @@ table :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx10-table`.
                                                              atomic value being
                                                              acquired.
 
-                                                         3. buffer_gl0_inv
+                                                         3. buffer_wbinvl1_vol
 
-                                                           - If CU wavefront execution
+                                                           - If not TgSplit execution
                                                              mode, omit.
                                                            - Ensures that
                                                              following
@@ -6242,19 +6327,18 @@ table :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx10-table`.
                                                              stale data.
 
      load atomic  acquire      - agent        - global   1. buffer/global_load
-                               - system                     glc=1 dlc=1
+                                                            glc=1
                                                          2. s_waitcnt vmcnt(0)
 
                                                            - Must happen before
                                                              following
-                                                             buffer_gl*_inv.
+                                                             buffer_wbinvl1_vol.
                                                            - Ensures the load
                                                              has completed
                                                              before invalidating
-                                                             the caches.
+                                                             the cache.
 
-                                                         3. buffer_gl0_inv;
-                                                            buffer_gl1_inv
+                                                         3. buffer_wbinvl1_vol
 
                                                            - Must happen before
                                                              any following
@@ -6266,22 +6350,51 @@ table :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx10-table`.
                                                              loads will not see
                                                              stale global data.
 
-     load atomic  acquire      - agent        - generic  1. flat_load glc=1 dlc=1
-                               - system                  2. s_waitcnt vmcnt(0) &
+     load atomic  acquire      - system       - global   1. buffer/global/flat_load
+                                                            glc=1 scc=1
+                                                         2. s_waitcnt vmcnt(0)
+
+                                                           - Must happen before
+                                                             following buffer_invl2 and
+                                                             buffer_wbinvl1_vol.
+                                                           - Ensures the load
+                                                             has completed
+                                                             before invalidating
+                                                             the cache.
+
+                                                         3. buffer_invl2;
+                                                            buffer_wbinvl1_vol
+
+                                                           - Must happen before
+                                                             any following
+                                                             global/generic
+                                                             load/load
+                                                             atomic/atomicrmw.
+                                                           - Ensures that
+                                                             following
+                                                             loads will not see
+                                                             stale MTYPE NC global data.
+                                                             MTYPE RW and CC memory will
+                                                             never be stale due to the
+                                                             memory probes.
+
+     load atomic  acquire      - agent        - generic  1. flat_load glc=1
+                                                         2. s_waitcnt vmcnt(0) &
                                                             lgkmcnt(0)
 
+                                                           - If TgSplit execution mode,
+                                                             omit lgkmcnt(0).
                                                            - If OpenCL omit
                                                              lgkmcnt(0).
                                                            - Must happen before
                                                              following
-                                                             buffer_gl*_invl.
+                                                             buffer_wbinvl1_vol.
                                                            - Ensures the flat_load
                                                              has completed
                                                              before invalidating
-                                                             the caches.
+                                                             the cache.
 
-                                                         3. buffer_gl0_inv;
-                                                            buffer_gl1_inv
+                                                         3. buffer_wbinvl1_vol
 
                                                            - Must happen before
                                                              any following
@@ -6293,8 +6406,2345 @@ table :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx10-table`.
                                                              will not see stale
                                                              global data.
 
-     atomicrmw    acquire      - singlethread - global   1. buffer/global/ds/flat_atomic
-                               - wavefront    - local
+     load atomic  acquire      - system       - generic  1. flat_load glc=1 scc=1
+                                                         2. s_waitcnt vmcnt(0) &
+                                                            lgkmcnt(0)
+
+                                                           - If TgSplit execution mode,
+                                                             omit lgkmcnt(0).
+                                                           - If OpenCL omit
+                                                             lgkmcnt(0).
+                                                           - Must happen before
+                                                             following
+                                                             buffer_invl2 and
+                                                             buffer_wbinvl1_vol.
+                                                           - Ensures the flat_load
+                                                             has completed
+                                                             before invalidating
+                                                             the caches.
+
+                                                         3. buffer_invl2;
+                                                            buffer_wbinvl1_vol
+
+                                                           - Must happen before
+                                                             any following
+                                                             global/generic
+                                                             load/load
+                                                             atomic/atomicrmw.
+                                                           - Ensures that
+                                                             following
+                                                             loads will not see
+                                                             stale MTYPE NC global data.
+                                                             MTYPE RW and CC memory will
+                                                             never be stale due to the
+                                                             memory probes.
+
+     atomicrmw    acquire      - singlethread - global   1. buffer/global/flat_atomic
+                               - wavefront    - generic
+     atomicrmw    acquire      - singlethread - local    *If TgSplit execution mode,
+                               - wavefront               local address space cannot
+                                                         be used.*
+
+                                                         1. ds_atomic
+     atomicrmw    acquire      - workgroup    - global   1. buffer/global_atomic
+                                                         2. s_waitcnt vmcnt(0)
+
+                                                           - If not TgSplit execution
+                                                             mode, omit.
+                                                           - Must happen before the
+                                                             following buffer_wbinvl1_vol.
+                                                           - Ensures the atomicrmw
+                                                             has completed
+                                                             before invalidating
+                                                             the cache.
+
+                                                         3. buffer_wbinvl1_vol
+
+                                                           - If not TgSplit execution
+                                                             mode, omit.
+                                                           - Must happen before
+                                                             any following
+                                                             global/generic
+                                                             load/load
+                                                             atomic/atomicrmw.
+                                                           - Ensures that
+                                                             following loads
+                                                             will not see stale
+                                                             global data.
+
+     atomicrmw    acquire      - workgroup    - local    *If TgSplit execution mode,
+                                                         local address space cannot
+                                                         be used.*
+
+                                                         1. ds_atomic
+                                                         2. s_waitcnt lgkmcnt(0)
+
+                                                           - If OpenCL, omit.
+                                                           - Must happen before
+                                                             any following
+                                                             global/generic
+                                                             load/load
+                                                             atomic/store/store
+                                                             atomic/atomicrmw.
+                                                           - Ensures any
+                                                             following global
+                                                             data read is no
+                                                             older than the local
+                                                             atomicrmw value
+                                                             being acquired.
+
+     atomicrmw    acquire      - workgroup    - generic  1. flat_atomic
+                                                         2. s_waitcnt lgkm/vmcnt(0)
+
+                                                           - Use lgkmcnt(0) if not
+                                                             TgSplit execution mode
+                                                             and vmcnt(0) if TgSplit
+                                                             execution mode.
+                                                           - If OpenCL, omit lgkmcnt(0).
+                                                           - Must happen before
+                                                             the following
+                                                             buffer_wbinvl1_vol and
+                                                             any following
+                                                             global/generic
+                                                             load/load
+                                                             atomic/store/store
+                                                             atomic/atomicrmw.
+                                                           - Ensures any
+                                                             following global
+                                                             data read is no
+                                                             older than a local
+                                                             atomicrmw value
+                                                             being acquired.
+
+                                                         3. buffer_wbinvl1_vol
+
+                                                           - If not TgSplit execution
+                                                             mode, omit.
+                                                           - Ensures that
+                                                             following
+                                                             loads will not see
+                                                             stale data.
+
+     atomicrmw    acquire      - agent        - global   1. buffer/global_atomic
+                                                         2. s_waitcnt vmcnt(0)
+
+                                                           - Must happen before
+                                                             following
+                                                             buffer_wbinvl1_vol.
+                                                           - Ensures the
+                                                             atomicrmw has
+                                                             completed before
+                                                             invalidating the
+                                                             cache.
+
+                                                         3. buffer_wbinvl1_vol
+
+                                                           - Must happen before
+                                                             any following
+                                                             global/generic
+                                                             load/load
+                                                             atomic/atomicrmw.
+                                                           - Ensures that
+                                                             following loads
+                                                             will not see stale
+                                                             global data.
+
+     atomicrmw    acquire      - system       - global   1. buffer/global_atomic
+                                                            scc=1
+                                                         2. s_waitcnt vmcnt(0)
+
+                                                           - Must happen before
+                                                             following buffer_invl2 and
+                                                             buffer_wbinvl1_vol.
+                                                           - Ensures the
+                                                             atomicrmw has
+                                                             completed before
+                                                             invalidating the
+                                                             caches.
+
+                                                         3. buffer_invl2;
+                                                            buffer_wbinvl1_vol
+
+                                                           - Must happen before
+                                                             any following
+                                                             global/generic
+                                                             load/load
+                                                             atomic/atomicrmw.
+                                                           - Ensures that
+                                                             following
+                                                             loads will not see
+                                                             stale MTYPE NC global data.
+                                                             MTYPE RW and CC memory will
+                                                             never be stale due to the
+                                                             memory probes.
+
+     atomicrmw    acquire      - agent        - generic  1. flat_atomic
+                                                         2. s_waitcnt vmcnt(0) &
+                                                            lgkmcnt(0)
+
+                                                           - If TgSplit execution mode,
+                                                             omit lgkmcnt(0).
+                                                           - If OpenCL, omit
+                                                             lgkmcnt(0).
+                                                           - Must happen before
+                                                             following
+                                                             buffer_wbinvl1_vol.
+                                                           - Ensures the
+                                                             atomicrmw has
+                                                             completed before
+                                                             invalidating the
+                                                             cache.
+
+                                                         3. buffer_wbinvl1_vol
+
+                                                           - Must happen before
+                                                             any following
+                                                             global/generic
+                                                             load/load
+                                                             atomic/atomicrmw.
+                                                           - Ensures that
+                                                             following loads
+                                                             will not see stale
+                                                             global data.
+
+     atomicrmw    acquire      - system       - generic  1. flat_atomic scc=1
+                                                         2. s_waitcnt vmcnt(0) &
+                                                            lgkmcnt(0)
+
+                                                           - If TgSplit execution mode,
+                                                             omit lgkmcnt(0).
+                                                           - If OpenCL, omit
+                                                             lgkmcnt(0).
+                                                           - Must happen before
+                                                             following
+                                                             buffer_invl2 and
+                                                             buffer_wbinvl1_vol.
+                                                           - Ensures the
+                                                             atomicrmw has
+                                                             completed before
+                                                             invalidating the
+                                                             caches.
+
+                                                         3. buffer_invl2;
+                                                            buffer_wbinvl1_vol
+
+                                                           - Must happen before
+                                                             any following
+                                                             global/generic
+                                                             load/load
+                                                             atomic/atomicrmw.
+                                                           - Ensures that
+                                                             following
+                                                             loads will not see
+                                                             stale MTYPE NC global data.
+                                                             MTYPE RW and CC memory will
+                                                             never be stale due to the
+                                                             memory probes.
+
+     fence        acquire      - singlethread *none*     *none*
+                               - wavefront
+     fence        acquire      - workgroup    *none*     1. s_waitcnt lgkm/vmcnt(0)
+
+                                                           - Use lgkmcnt(0) if not
+                                                             TgSplit execution mode
+                                                             and vmcnt(0) if TgSplit
+                                                             execution mode.
+                                                           - If OpenCL and
+                                                             address space is
+                                                             not generic, omit
+                                                             lgkmcnt(0).
+                                                           - If OpenCL and
+                                                             address space is
+                                                             local, omit
+                                                             vmcnt(0).
+                                                           - However, since LLVM
+                                                             currently has no
+                                                             address space on
+                                                             the fence need to
+                                                             conservatively
+                                                             always generate. If
+                                                             fence had an
+                                                             address space then
+                                                             set to address
+                                                             space of OpenCL
+                                                             fence flag, or to
+                                                             generic if both
+                                                             local and global
+                                                             flags are
+                                                             specified.
+                                                           - s_waitcnt vmcnt(0)
+                                                             must happen after
+                                                             any preceding
+                                                             global/generic load
+                                                             atomic/
+                                                             atomicrmw
+                                                             with an equal or
+                                                             wider sync scope
+                                                             and memory ordering
+                                                             stronger than
+                                                             unordered (this is
+                                                             termed the
+                                                             fence-paired-atomic).
+                                                           - s_waitcnt lgkmcnt(0)
+                                                             must happen after
+                                                             any preceding
+                                                             local/generic load
+                                                             atomic/atomicrmw
+                                                             with an equal or
+                                                             wider sync scope
+                                                             and memory ordering
+                                                             stronger than
+                                                             unordered (this is
+                                                             termed the
+                                                             fence-paired-atomic).
+                                                           - Must happen before
+                                                             the following
+                                                             buffer_wbinvl1_vol and
+                                                             any following
+                                                             global/generic
+                                                             load/load
+                                                             atomic/store/store
+                                                             atomic/atomicrmw.
+                                                           - Ensures any
+                                                             following global
+                                                             data read is no
+                                                             older than the
+                                                             value read by the
+                                                             fence-paired-atomic.
+
+                                                         3. buffer_wbinvl1_vol
+
+                                                           - If not TgSplit execution
+                                                             mode, omit.
+                                                           - Ensures that
+                                                             following
+                                                             loads will not see
+                                                             stale data.
+
+     fence        acquire      - agent        *none*     1. s_waitcnt lgkmcnt(0) &
+                                                            vmcnt(0)
+
+                                                           - If TgSplit execution mode,
+                                                             omit lgkmcnt(0).
+                                                           - If OpenCL and
+                                                             address space is
+                                                             not generic, omit
+                                                             lgkmcnt(0).
+                                                           - However, since LLVM
+                                                             currently has no
+                                                             address space on
+                                                             the fence need to
+                                                             conservatively
+                                                             always generate
+                                                             (see comment for
+                                                             previous fence).
+                                                           - Could be split into
+                                                             separate s_waitcnt
+                                                             vmcnt(0) and
+                                                             s_waitcnt
+                                                             lgkmcnt(0) to allow
+                                                             them to be
+                                                             independently moved
+                                                             according to the
+                                                             following rules.
+                                                           - s_waitcnt vmcnt(0)
+                                                             must happen after
+                                                             any preceding
+                                                             global/generic load
+                                                             atomic/atomicrmw
+                                                             with an equal or
+                                                             wider sync scope
+                                                             and memory ordering
+                                                             stronger than
+                                                             unordered (this is
+                                                             termed the
+                                                             fence-paired-atomic).
+                                                           - s_waitcnt lgkmcnt(0)
+                                                             must happen after
+                                                             any preceding
+                                                             local/generic load
+                                                             atomic/atomicrmw
+                                                             with an equal or
+                                                             wider sync scope
+                                                             and memory ordering
+                                                             stronger than
+                                                             unordered (this is
+                                                             termed the
+                                                             fence-paired-atomic).
+                                                           - Must happen before
+                                                             the following
+                                                             buffer_wbinvl1_vol.
+                                                           - Ensures that the
+                                                             fence-paired atomic
+                                                             has completed
+                                                             before invalidating
+                                                             the
+                                                             cache. Therefore
+                                                             any following
+                                                             locations read must
+                                                             be no older than
+                                                             the value read by
+                                                             the
+                                                             fence-paired-atomic.
+
+                                                         2. buffer_wbinvl1_vol
+
+                                                           - Must happen before any
+                                                             following global/generic
+                                                             load/load
+                                                             atomic/store/store
+                                                             atomic/atomicrmw.
+                                                           - Ensures that
+                                                             following loads
+                                                             will not see stale
+                                                             global data.
+
+     fence        acquire      - system       *none*     1. s_waitcnt lgkmcnt(0) &
+                                                            vmcnt(0)
+
+                                                           - If TgSplit execution mode,
+                                                             omit lgkmcnt(0).
+                                                           - If OpenCL and
+                                                             address space is
+                                                             not generic, omit
+                                                             lgkmcnt(0).
+                                                           - However, since LLVM
+                                                             currently has no
+                                                             address space on
+                                                             the fence need to
+                                                             conservatively
+                                                             always generate
+                                                             (see comment for
+                                                             previous fence).
+                                                           - Could be split into
+                                                             separate s_waitcnt
+                                                             vmcnt(0) and
+                                                             s_waitcnt
+                                                             lgkmcnt(0) to allow
+                                                             them to be
+                                                             independently moved
+                                                             according to the
+                                                             following rules.
+                                                           - s_waitcnt vmcnt(0)
+                                                             must happen after
+                                                             any preceding
+                                                             global/generic load
+                                                             atomic/atomicrmw
+                                                             with an equal or
+                                                             wider sync scope
+                                                             and memory ordering
+                                                             stronger than
+                                                             unordered (this is
+                                                             termed the
+                                                             fence-paired-atomic).
+                                                           - s_waitcnt lgkmcnt(0)
+                                                             must happen after
+                                                             any preceding
+                                                             local/generic load
+                                                             atomic/atomicrmw
+                                                             with an equal or
+                                                             wider sync scope
+                                                             and memory ordering
+                                                             stronger than
+                                                             unordered (this is
+                                                             termed the
+                                                             fence-paired-atomic).
+                                                           - Must happen before
+                                                             the following buffer_invl2 and
+                                                             buffer_wbinvl1_vol.
+                                                           - Ensures that the
+                                                             fence-paired atomic
+                                                             has completed
+                                                             before invalidating
+                                                             the
+                                                             cache. Therefore
+                                                             any following
+                                                             locations read must
+                                                             be no older than
+                                                             the value read by
+                                                             the
+                                                             fence-paired-atomic.
+
+                                                         2. buffer_invl2;
+                                                            buffer_wbinvl1_vol
+
+                                                           - Must happen before any
+                                                             following global/generic
+                                                             load/load
+                                                             atomic/store/store
+                                                             atomic/atomicrmw.
+                                                           - Ensures that
+                                                             following loads
+                                                             will not see stale
+                                                             global data.
+
+     **Release Atomic**
+     ------------------------------------------------------------------------------------
+     store atomic release      - singlethread - global   1. buffer/global/flat_store
+                               - wavefront    - generic
+     store atomic release      - singlethread - local    *If TgSplit execution mode,
+                               - wavefront               local address space cannot
+                                                         be used.*
+
+                                                         1. ds_store
+     store atomic release      - workgroup    - global   1. s_waitcnt lgkm/vmcnt(0)
+                                              - generic
+                                                           - Use lgkmcnt(0) if not
+                                                             TgSplit execution mode
+                                                             and vmcnt(0) if TgSplit
+                                                             execution mode.
+                                                           - If OpenCL, omit lgkmcnt(0).
+                                                           - s_waitcnt vmcnt(0)
+                                                             must happen after
+                                                             any preceding
+                                                             global/generic load/store/
+                                                             load atomic/store atomic/
+                                                             atomicrmw.
+                                                           - s_waitcnt lgkmcnt(0)
+                                                             must happen after
+                                                             any preceding
+                                                             local/generic
+                                                             load/store/load
+                                                             atomic/store
+                                                             atomic/atomicrmw.
+                                                           - Must happen before
+                                                             the following
+                                                             store.
+                                                           - Ensures that all
+                                                             memory operations
+                                                             have
+                                                             completed before
+                                                             performing the
+                                                             store that is being
+                                                             released.
+
+                                                         2. buffer/global/flat_store
+     store atomic release      - workgroup    - local    *If TgSplit execution mode,
+                                                         local address space cannot
+                                                         be used.*
+
+                                                         1. ds_store
+     store atomic release      - agent        - global   1. s_waitcnt lgkmcnt(0) &
+                                              - generic     vmcnt(0)
+
+                                                           - If TgSplit execution mode,
+                                                             omit lgkmcnt(0).
+                                                           - If OpenCL and
+                                                             address space is
+                                                             not generic, omit
+                                                             lgkmcnt(0).
+                                                           - Could be split into
+                                                             separate s_waitcnt
+                                                             vmcnt(0) and
+                                                             s_waitcnt
+                                                             lgkmcnt(0) to allow
+                                                             them to be
+                                                             independently moved
+                                                             according to the
+                                                             following rules.
+                                                           - s_waitcnt vmcnt(0)
+                                                             must happen after
+                                                             any preceding
+                                                             global/generic
+                                                             load/store/load
+                                                             atomic/store
+                                                             atomic/atomicrmw.
+                                                           - s_waitcnt lgkmcnt(0)
+                                                             must happen after
+                                                             any preceding
+                                                             local/generic
+                                                             load/store/load
+                                                             atomic/store
+                                                             atomic/atomicrmw.
+                                                           - Must happen before
+                                                             the following
+                                                             store.
+                                                           - Ensures that all
+                                                             memory operations
+                                                             to memory have
+                                                             completed before
+                                                             performing the
+                                                             store that is being
+                                                             released.
+
+                                                         2. buffer/global/flat_store
+     store atomic release      - system       - global   1. buffer_wbl2
+                                              - generic
+                                                           - Must happen before
+                                                             following s_waitcnt.
+                                                           - Performs L2 writeback to
+                                                             ensure previous
+                                                             global/generic
+                                                             store/atomicrmw are
+                                                             visible at system scope.
+
+                                                         2. s_waitcnt lgkmcnt(0) &
+                                                            vmcnt(0)
+
+                                                           - If TgSplit execution mode,
+                                                             omit lgkmcnt(0).
+                                                           - If OpenCL and
+                                                             address space is
+                                                             not generic, omit
+                                                             lgkmcnt(0).
+                                                           - Could be split into
+                                                             separate s_waitcnt
+                                                             vmcnt(0) and
+                                                             s_waitcnt
+                                                             lgkmcnt(0) to allow
+                                                             them to be
+                                                             independently moved
+                                                             according to the
+                                                             following rules.
+                                                           - s_waitcnt vmcnt(0)
+                                                             must happen after any
+                                                             preceding
+                                                             global/generic
+                                                             load/store/load
+                                                             atomic/store
+                                                             atomic/atomicrmw.
+                                                           - s_waitcnt lgkmcnt(0)
+                                                             must happen after any
+                                                             preceding
+                                                             local/generic
+                                                             load/store/load
+                                                             atomic/store
+                                                             atomic/atomicrmw.
+                                                           - Must happen before
+                                                             the following
+                                                             store.
+                                                           - Ensures that all
+                                                             memory operations
+                                                             to memory and the L2
+                                                             writeback have
+                                                             completed before
+                                                             performing the
+                                                             store that is being
+                                                             released.
+
+                                                         2. buffer/global/flat_store
+                                                            scc=1
+     atomicrmw    release      - singlethread - global   1. buffer/global/flat_atomic
+                               - wavefront    - generic
+     atomicrmw    release      - singlethread - local    *If TgSplit execution mode,
+                               - wavefront               local address space cannot
+                                                         be used.*
+
+                                                         1. ds_atomic
+     atomicrmw    release      - workgroup    - global   1. s_waitcnt lgkm/vmcnt(0)
+                                              - generic
+                                                           - Use lgkmcnt(0) if not
+                                                             TgSplit execution mode
+                                                             and vmcnt(0) if TgSplit
+                                                             execution mode.
+                                                           - If OpenCL, omit
+                                                             lgkmcnt(0).
+                                                           - s_waitcnt vmcnt(0)
+                                                             must happen after
+                                                             any preceding
+                                                             global/generic load/store/
+                                                             load atomic/store atomic/
+                                                             atomicrmw.
+                                                           - s_waitcnt lgkmcnt(0)
+                                                             must happen after
+                                                             any preceding
+                                                             local/generic
+                                                             load/store/load
+                                                             atomic/store
+                                                             atomic/atomicrmw.
+                                                           - Must happen before
+                                                             the following
+                                                             atomicrmw.
+                                                           - Ensures that all
+                                                             memory operations
+                                                             have
+                                                             completed before
+                                                             performing the
+                                                             atomicrmw that is
+                                                             being released.
+
+                                                         2. buffer/global/flat_atomic
+     atomicrmw    release      - workgroup    - local    *If TgSplit execution mode,
+                                                         local address space cannot
+                                                         be used.*
+
+                                                         1. ds_atomic
+     atomicrmw    release      - agent        - global   1. s_waitcnt lgkmcnt(0) &
+                                              - generic     vmcnt(0)
+
+                                                           - If TgSplit execution mode,
+                                                             omit lgkmcnt(0).
+                                                           - If OpenCL, omit
+                                                             lgkmcnt(0).
+                                                           - Could be split into
+                                                             separate s_waitcnt
+                                                             vmcnt(0) and
+                                                             s_waitcnt
+                                                             lgkmcnt(0) to allow
+                                                             them to be
+                                                             independently moved
+                                                             according to the
+                                                             following rules.
+                                                           - s_waitcnt vmcnt(0)
+                                                             must happen after
+                                                             any preceding
+                                                             global/generic
+                                                             load/store/load
+                                                             atomic/store
+                                                             atomic/atomicrmw.
+                                                           - s_waitcnt lgkmcnt(0)
+                                                             must happen after
+                                                             any preceding
+                                                             local/generic
+                                                             load/store/load
+                                                             atomic/store
+                                                             atomic/atomicrmw.
+                                                           - Must happen before
+                                                             the following
+                                                             atomicrmw.
+                                                           - Ensures that all
+                                                             memory operations
+                                                             to global and local
+                                                             have completed
+                                                             before performing
+                                                             the atomicrmw that
+                                                             is being released.
+
+                                                         2. buffer/global/flat_atomic
+     atomicrmw    release      - system       - global   1. buffer_wbl2
+                                              - generic
+                                                           - Must happen before
+                                                             following s_waitcnt.
+                                                           - Performs L2 writeback to
+                                                             ensure previous
+                                                             global/generic
+                                                             store/atomicrmw are
+                                                             visible at system scope.
+
+                                                         2. s_waitcnt lgkmcnt(0) &
+                                                            vmcnt(0)
+
+                                                           - If TgSplit execution mode,
+                                                             omit lgkmcnt(0).
+                                                           - If OpenCL, omit
+                                                             lgkmcnt(0).
+                                                           - Could be split into
+                                                             separate s_waitcnt
+                                                             vmcnt(0) and
+                                                             s_waitcnt
+                                                             lgkmcnt(0) to allow
+                                                             them to be
+                                                             independently moved
+                                                             according to the
+                                                             following rules.
+                                                           - s_waitcnt vmcnt(0)
+                                                             must happen after
+                                                             any preceding
+                                                             global/generic
+                                                             load/store/load
+                                                             atomic/store
+                                                             atomic/atomicrmw.
+                                                           - s_waitcnt lgkmcnt(0)
+                                                             must happen after
+                                                             any preceding
+                                                             local/generic
+                                                             load/store/load
+                                                             atomic/store
+                                                             atomic/atomicrmw.
+                                                           - Must happen before
+                                                             the following
+                                                             atomicrmw.
+                                                           - Ensures that all
+                                                             memory operations
+                                                             to memory and the L2
+                                                             writeback have
+                                                             completed before
+                                                             performing the
+                                                             store that is being
+                                                             released.
+
+                                                         3. buffer/global/flat_atomic
+                                                            scc=1
+     fence        release      - singlethread *none*     *none*
+                               - wavefront
+     fence        release      - workgroup    *none*     1. s_waitcnt lgkm/vmcnt(0)
+
+                                                           - Use lgkmcnt(0) if not
+                                                             TgSplit execution mode
+                                                             and vmcnt(0) if TgSplit
+                                                             execution mode.
+                                                           - If OpenCL and
+                                                             address space is
+                                                             not generic, omit
+                                                             lgkmcnt(0).
+                                                           - If OpenCL and
+                                                             address space is
+                                                             local, omit
+                                                             vmcnt(0).
+                                                           - However, since LLVM
+                                                             currently has no
+                                                             address space on
+                                                             the fence need to
+                                                             conservatively
+                                                             always generate. If
+                                                             fence had an
+                                                             address space then
+                                                             set to address
+                                                             space of OpenCL
+                                                             fence flag, or to
+                                                             generic if both
+                                                             local and global
+                                                             flags are
+                                                             specified.
+                                                           - s_waitcnt vmcnt(0)
+                                                             must happen after
+                                                             any preceding
+                                                             global/generic
+                                                             load/store/
+                                                             load atomic/store atomic/
+                                                             atomicrmw.
+                                                           - s_waitcnt lgkmcnt(0)
+                                                             must happen after
+                                                             any preceding
+                                                             local/generic
+                                                             load/load
+                                                             atomic/store/store
+                                                             atomic/atomicrmw.
+                                                           - Must happen before
+                                                             any following store
+                                                             atomic/atomicrmw
+                                                             with an equal or
+                                                             wider sync scope
+                                                             and memory ordering
+                                                             stronger than
+                                                             unordered (this is
+                                                             termed the
+                                                             fence-paired-atomic).
+                                                           - Ensures that all
+                                                             memory operations
+                                                             have
+                                                             completed before
+                                                             performing the
+                                                             following
+                                                             fence-paired-atomic.
+
+     fence        release      - agent        *none*     1. s_waitcnt lgkmcnt(0) &
+                                                            vmcnt(0)
+
+                                                           - If TgSplit execution mode,
+                                                             omit lgkmcnt(0).
+                                                           - If OpenCL and
+                                                             address space is
+                                                             not generic, omit
+                                                             lgkmcnt(0).
+                                                           - If OpenCL and
+                                                             address space is
+                                                             local, omit
+                                                             vmcnt(0).
+                                                           - However, since LLVM
+                                                             currently has no
+                                                             address space on
+                                                             the fence need to
+                                                             conservatively
+                                                             always generate. If
+                                                             fence had an
+                                                             address space then
+                                                             set to address
+                                                             space of OpenCL
+                                                             fence flag, or to
+                                                             generic if both
+                                                             local and global
+                                                             flags are
+                                                             specified.
+                                                           - Could be split into
+                                                             separate s_waitcnt
+                                                             vmcnt(0) and
+                                                             s_waitcnt
+                                                             lgkmcnt(0) to allow
+                                                             them to be
+                                                             independently moved
+                                                             according to the
+                                                             following rules.
+                                                           - s_waitcnt vmcnt(0)
+                                                             must happen after
+                                                             any preceding
+                                                             global/generic
+                                                             load/store/load
+                                                             atomic/store
+                                                             atomic/atomicrmw.
+                                                           - s_waitcnt lgkmcnt(0)
+                                                             must happen after
+                                                             any preceding
+                                                             local/generic
+                                                             load/store/load
+                                                             atomic/store
+                                                             atomic/atomicrmw.
+                                                           - Must happen before
+                                                             any following store
+                                                             atomic/atomicrmw
+                                                             with an equal or
+                                                             wider sync scope
+                                                             and memory ordering
+                                                             stronger than
+                                                             unordered (this is
+                                                             termed the
+                                                             fence-paired-atomic).
+                                                           - Ensures that all
+                                                             memory operations
+                                                             have
+                                                             completed before
+                                                             performing the
+                                                             following
+                                                             fence-paired-atomic.
+
+     fence        release      - system       *none*     1. buffer_wbl2
+
+                                                           - If OpenCL and
+                                                             address space is
+                                                             local, omit.
+                                                           - Must happen before
+                                                             following s_waitcnt.
+                                                           - Performs L2 writeback to
+                                                             ensure previous
+                                                             global/generic
+                                                             store/atomicrmw are
+                                                             visible at system scope.
+
+                                                         2. s_waitcnt lgkmcnt(0) &
+                                                            vmcnt(0)
+
+                                                           - If TgSplit execution mode,
+                                                             omit lgkmcnt(0).
+                                                           - If OpenCL and
+                                                             address space is
+                                                             not generic, omit
+                                                             lgkmcnt(0).
+                                                           - If OpenCL and
+                                                             address space is
+                                                             local, omit
+                                                             vmcnt(0).
+                                                           - However, since LLVM
+                                                             currently has no
+                                                             address space on
+                                                             the fence need to
+                                                             conservatively
+                                                             always generate. If
+                                                             fence had an
+                                                             address space then
+                                                             set to address
+                                                             space of OpenCL
+                                                             fence flag, or to
+                                                             generic if both
+                                                             local and global
+                                                             flags are
+                                                             specified.
+                                                           - Could be split into
+                                                             separate s_waitcnt
+                                                             vmcnt(0) and
+                                                             s_waitcnt
+                                                             lgkmcnt(0) to allow
+                                                             them to be
+                                                             independently moved
+                                                             according to the
+                                                             following rules.
+                                                           - s_waitcnt vmcnt(0)
+                                                             must happen after
+                                                             any preceding
+                                                             global/generic
+                                                             load/store/load
+                                                             atomic/store
+                                                             atomic/atomicrmw.
+                                                           - s_waitcnt lgkmcnt(0)
+                                                             must happen after
+                                                             any preceding
+                                                             local/generic
+                                                             load/store/load
+                                                             atomic/store
+                                                             atomic/atomicrmw.
+                                                           - Must happen before
+                                                             any following store
+                                                             atomic/atomicrmw
+                                                             with an equal or
+                                                             wider sync scope
+                                                             and memory ordering
+                                                             stronger than
+                                                             unordered (this is
+                                                             termed the
+                                                             fence-paired-atomic).
+                                                           - Ensures that all
+                                                             memory operations
+                                                             have
+                                                             completed before
+                                                             performing the
+                                                             following
+                                                             fence-paired-atomic.
+
+     **Acquire-Release Atomic**
+     ------------------------------------------------------------------------------------
+     atomicrmw    acq_rel      - singlethread - global   1. buffer/global/flat_atomic
+                               - wavefront    - generic
+     atomicrmw    acq_rel      - singlethread - local    *If TgSplit execution mode,
+                               - wavefront               local address space cannot
+                                                         be used.*
+
+                                                         1. ds_atomic
+     atomicrmw    acq_rel      - workgroup    - global   1. s_waitcnt lgkm/vmcnt(0)
+
+                                                           - Use lgkmcnt(0) if not
+                                                             TgSplit execution mode
+                                                             and vmcnt(0) if TgSplit
+                                                             execution mode.
+                                                           - If OpenCL, omit
+                                                             lgkmcnt(0).
+                                                           - Must happen after
+                                                             any preceding
+                                                             local/generic
+                                                             load/store/load
+                                                             atomic/store
+                                                             atomic/atomicrmw.
+                                                           - s_waitcnt vmcnt(0)
+                                                             must happen after
+                                                             any preceding
+                                                             global/generic load/store/
+                                                             load atomic/store atomic/
+                                                             atomicrmw.
+                                                           - s_waitcnt lgkmcnt(0)
+                                                             must happen after
+                                                             any preceding
+                                                             local/generic
+                                                             load/store/load
+                                                             atomic/store
+                                                             atomic/atomicrmw.
+                                                           - Must happen before
+                                                             the following
+                                                             atomicrmw.
+                                                           - Ensures that all
+                                                             memory operations
+                                                             have
+                                                             completed before
+                                                             performing the
+                                                             atomicrmw that is
+                                                             being released.
+
+                                                         2. buffer/global_atomic
+                                                         3. s_waitcnt vmcnt(0)
+
+                                                           - If not TgSplit execution
+                                                             mode, omit.
+                                                           - Must happen before
+                                                             the following
+                                                             buffer_wbinvl1_vol.
+                                                           - Ensures any
+                                                             following global
+                                                             data read is no
+                                                             older than the
+                                                             atomicrmw value
+                                                             being acquired.
+
+                                                         4. buffer_wbinvl1_vol
+
+                                                           - If not TgSplit execution
+                                                             mode, omit.
+                                                           - Ensures that
+                                                             following
+                                                             loads will not see
+                                                             stale data.
+
+     atomicrmw    acq_rel      - workgroup    - local    *If TgSplit execution mode,
+                                                         local address space cannot
+                                                         be used.*
+
+                                                         1. ds_atomic
+                                                         2. s_waitcnt lgkmcnt(0)
+
+                                                           - If OpenCL, omit.
+                                                           - Must happen before
+                                                             any following
+                                                             global/generic
+                                                             load/load
+                                                             atomic/store/store
+                                                             atomic/atomicrmw.
+                                                           - Ensures any
+                                                             following global
+                                                             data read is no
+                                                             older than the local load
+                                                             atomic value being
+                                                             acquired.
+
+     atomicrmw    acq_rel      - workgroup    - generic  1. s_waitcnt lgkm/vmcnt(0)
+
+                                                           - Use lgkmcnt(0) if not
+                                                             TgSplit execution mode
+                                                             and vmcnt(0) if TgSplit
+                                                             execution mode.
+                                                           - If OpenCL, omit
+                                                             lgkmcnt(0).
+                                                           - s_waitcnt vmcnt(0)
+                                                             must happen after
+                                                             any preceding
+                                                             global/generic load/store/
+                                                             load atomic/store atomic/
+                                                             atomicrmw.
+                                                           - s_waitcnt lgkmcnt(0)
+                                                             must happen after
+                                                             any preceding
+                                                             local/generic
+                                                             load/store/load
+                                                             atomic/store
+                                                             atomic/atomicrmw.
+                                                           - Must happen before
+                                                             the following
+                                                             atomicrmw.
+                                                           - Ensures that all
+                                                             memory operations
+                                                             have
+                                                             completed before
+                                                             performing the
+                                                             atomicrmw that is
+                                                             being released.
+
+                                                         2. flat_atomic
+                                                         3. s_waitcnt lgkmcnt(0) &
+                                                            vmcnt(0)
+
+                                                           - If not TgSplit execution
+                                                             mode, omit vmcnt(0).
+                                                           - If OpenCL, omit
+                                                             lgkmcnt(0).
+                                                           - Must happen before
+                                                             the following
+                                                             buffer_wbinvl1_vol and
+                                                             any following
+                                                             global/generic
+                                                             load/load
+                                                             atomic/store/store
+                                                             atomic/atomicrmw.
+                                                           - Ensures any
+                                                             following global
+                                                             data read is no
+                                                             older than a local load
+                                                             atomic value being
+                                                             acquired.
+
+                                                         3. buffer_wbinvl1_vol
+
+                                                           - If not TgSplit execution
+                                                             mode, omit.
+                                                           - Ensures that
+                                                             following
+                                                             loads will not see
+                                                             stale data.
+
+     atomicrmw    acq_rel      - agent        - global   1. s_waitcnt lgkmcnt(0) &
+                                                            vmcnt(0)
+
+                                                           - If TgSplit execution mode,
+                                                             omit lgkmcnt(0).
+                                                           - If OpenCL, omit
+                                                             lgkmcnt(0).
+                                                           - Could be split into
+                                                             separate s_waitcnt
+                                                             vmcnt(0) and
+                                                             s_waitcnt
+                                                             lgkmcnt(0) to allow
+                                                             them to be
+                                                             independently moved
+                                                             according to the
+                                                             following rules.
+                                                           - s_waitcnt vmcnt(0)
+                                                             must happen after
+                                                             any preceding
+                                                             global/generic
+                                                             load/store/load
+                                                             atomic/store
+                                                             atomic/atomicrmw.
+                                                           - s_waitcnt lgkmcnt(0)
+                                                             must happen after
+                                                             any preceding
+                                                             local/generic
+                                                             load/store/load
+                                                             atomic/store
+                                                             atomic/atomicrmw.
+                                                           - Must happen before
+                                                             the following
+                                                             atomicrmw.
+                                                           - Ensures that all
+                                                             memory operations
+                                                             to global have
+                                                             completed before
+                                                             performing the
+                                                             atomicrmw that is
+                                                             being released.
+
+                                                         2. buffer/global_atomic
+                                                         3. s_waitcnt vmcnt(0)
+
+                                                           - Must happen before
+                                                             following
+                                                             buffer_wbinvl1_vol.
+                                                           - Ensures the
+                                                             atomicrmw has
+                                                             completed before
+                                                             invalidating the
+                                                             cache.
+
+                                                         4. buffer_wbinvl1_vol
+
+                                                           - Must happen before
+                                                             any following
+                                                             global/generic
+                                                             load/load
+                                                             atomic/atomicrmw.
+                                                           - Ensures that
+                                                             following loads
+                                                             will not see stale
+                                                             global data.
+
+     atomicrmw    acq_rel      - system       - global   1. buffer_wbl2
+
+                                                           - Must happen before
+                                                             following s_waitcnt.
+                                                           - Performs L2 writeback to
+                                                             ensure previous
+                                                             global/generic
+                                                             store/atomicrmw are
+                                                             visible at system scope.
+
+                                                         2. s_waitcnt lgkmcnt(0) &
+                                                            vmcnt(0)
+
+                                                           - If TgSplit execution mode,
+                                                             omit lgkmcnt(0).
+                                                           - If OpenCL, omit
+                                                             lgkmcnt(0).
+                                                           - Could be split into
+                                                             separate s_waitcnt
+                                                             vmcnt(0) and
+                                                             s_waitcnt
+                                                             lgkmcnt(0) to allow
+                                                             them to be
+                                                             independently moved
+                                                             according to the
+                                                             following rules.
+                                                           - s_waitcnt vmcnt(0)
+                                                             must happen after
+                                                             any preceding
+                                                             global/generic
+                                                             load/store/load
+                                                             atomic/store
+                                                             atomic/atomicrmw.
+                                                           - s_waitcnt lgkmcnt(0)
+                                                             must happen after
+                                                             any preceding
+                                                             local/generic
+                                                             load/store/load
+                                                             atomic/store
+                                                             atomic/atomicrmw.
+                                                           - Must happen before
+                                                             the following
+                                                             atomicrmw.
+                                                           - Ensures that all
+                                                             memory operations
+                                                             to global and L2 writeback
+                                                             have completed before
+                                                             performing the
+                                                             atomicrmw that is
+                                                             being released.
+
+                                                         3. buffer/global_atomic
+                                                            scc=1
+                                                         4. s_waitcnt vmcnt(0)
+
+                                                           - Must happen before
+                                                             following buffer_invl2 and
+                                                             buffer_wbinvl1_vol.
+                                                           - Ensures the
+                                                             atomicrmw has
+                                                             completed before
+                                                             invalidating the
+                                                             caches.
+
+                                                         5. buffer_invl2;
+                                                            buffer_wbinvl1_vol
+
+                                                           - Must happen before
+                                                             any following
+                                                             global/generic
+                                                             load/load
+                                                             atomic/atomicrmw.
+                                                           - Ensures that
+                                                             following loads
+                                                             will not see stale
+                                                             MTYPE NC global data.
+                                                             MTYPE RW and CC memory will
+                                                             never be stale due to the
+                                                             memory probes.
+
+     atomicrmw    acq_rel      - agent        - generic  1. s_waitcnt lgkmcnt(0) &
+                                                            vmcnt(0)
+
+                                                           - If TgSplit execution mode,
+                                                             omit lgkmcnt(0).
+                                                           - If OpenCL, omit
+                                                             lgkmcnt(0).
+                                                           - Could be split into
+                                                             separate s_waitcnt
+                                                             vmcnt(0) and
+                                                             s_waitcnt
+                                                             lgkmcnt(0) to allow
+                                                             them to be
+                                                             independently moved
+                                                             according to the
+                                                             following rules.
+                                                           - s_waitcnt vmcnt(0)
+                                                             must happen after
+                                                             any preceding
+                                                             global/generic
+                                                             load/store/load
+                                                             atomic/store
+                                                             atomic/atomicrmw.
+                                                           - s_waitcnt lgkmcnt(0)
+                                                             must happen after
+                                                             any preceding
+                                                             local/generic
+                                                             load/store/load
+                                                             atomic/store
+                                                             atomic/atomicrmw.
+                                                           - Must happen before
+                                                             the following
+                                                             atomicrmw.
+                                                           - Ensures that all
+                                                             memory operations
+                                                             to global have
+                                                             completed before
+                                                             performing the
+                                                             atomicrmw that is
+                                                             being released.
+
+                                                         2. flat_atomic
+                                                         3. s_waitcnt vmcnt(0) &
+                                                            lgkmcnt(0)
+
+                                                           - If TgSplit execution mode,
+                                                             omit lgkmcnt(0).
+                                                           - If OpenCL, omit
+                                                             lgkmcnt(0).
+                                                           - Must happen before
+                                                             following
+                                                             buffer_wbinvl1_vol.
+                                                           - Ensures the
+                                                             atomicrmw has
+                                                             completed before
+                                                             invalidating the
+                                                             cache.
+
+                                                         4. buffer_wbinvl1_vol
+
+                                                           - Must happen before
+                                                             any following
+                                                             global/generic
+                                                             load/load
+                                                             atomic/atomicrmw.
+                                                           - Ensures that
+                                                             following loads
+                                                             will not see stale
+                                                             global data.
+
+     atomicrmw    acq_rel      - system       - generic  1. buffer_wbl2
+
+                                                           - Must happen before
+                                                             following s_waitcnt.
+                                                           - Performs L2 writeback to
+                                                             ensure previous
+                                                             global/generic
+                                                             store/atomicrmw are
+                                                             visible at system scope.
+
+                                                         2. s_waitcnt lgkmcnt(0) &
+                                                            vmcnt(0)
+
+                                                           - If TgSplit execution mode,
+                                                             omit lgkmcnt(0).
+                                                           - If OpenCL, omit
+                                                             lgkmcnt(0).
+                                                           - Could be split into
+                                                             separate s_waitcnt
+                                                             vmcnt(0) and
+                                                             s_waitcnt
+                                                             lgkmcnt(0) to allow
+                                                             them to be
+                                                             independently moved
+                                                             according to the
+                                                             following rules.
+                                                           - s_waitcnt vmcnt(0)
+                                                             must happen after
+                                                             any preceding
+                                                             global/generic
+                                                             load/store/load
+                                                             atomic/store
+                                                             atomic/atomicrmw.
+                                                           - s_waitcnt lgkmcnt(0)
+                                                             must happen after
+                                                             any preceding
+                                                             local/generic
+                                                             load/store/load
+                                                             atomic/store
+                                                             atomic/atomicrmw.
+                                                           - Must happen before
+                                                             the following
+                                                             atomicrmw.
+                                                           - Ensures that all
+                                                             memory operations
+                                                             to global and L2 writeback
+                                                             have completed before
+                                                             performing the
+                                                             atomicrmw that is
+                                                             being released.
+
+                                                         3. flat_atomic scc=1
+                                                         4. s_waitcnt vmcnt(0) &
+                                                            lgkmcnt(0)
+
+                                                           - If TgSplit execution mode,
+                                                             omit lgkmcnt(0).
+                                                           - If OpenCL, omit
+                                                             lgkmcnt(0).
+                                                           - Must happen before
+                                                             following buffer_invl2 and
+                                                             buffer_wbinvl1_vol.
+                                                           - Ensures the
+                                                             atomicrmw has
+                                                             completed before
+                                                             invalidating the
+                                                             caches.
+
+                                                         5. buffer_invl2;
+                                                            buffer_wbinvl1_vol
+
+                                                           - Must happen before
+                                                             any following
+                                                             global/generic
+                                                             load/load
+                                                             atomic/atomicrmw.
+                                                           - Ensures that
+                                                             following loads
+                                                             will not see stale
+                                                             MTYPE NC global data.
+                                                             MTYPE RW and CC memory will
+                                                             never be stale due to the
+                                                             memory probes.
+
+     fence        acq_rel      - singlethread *none*     *none*
+                               - wavefront
+     fence        acq_rel      - workgroup    *none*     1. s_waitcnt lgkm/vmcnt(0)
+
+                                                           - Use lgkmcnt(0) if not
+                                                             TgSplit execution mode
+                                                             and vmcnt(0) if TgSplit
+                                                             execution mode.
+                                                           - If OpenCL and
+                                                             address space is
+                                                             not generic, omit
+                                                             lgkmcnt(0).
+                                                           - If OpenCL and
+                                                             address space is
+                                                             local, omit
+                                                             vmcnt(0).
+                                                           - However,
+                                                             since LLVM
+                                                             currently has no
+                                                             address space on
+                                                             the fence need to
+                                                             conservatively
+                                                             always generate
+                                                             (see comment for
+                                                             previous fence).
+                                                           - s_waitcnt vmcnt(0)
+                                                             must happen after
+                                                             any preceding
+                                                             global/generic
+                                                             load/store/
+                                                             load atomic/store atomic/
+                                                             atomicrmw.
+                                                           - s_waitcnt lgkmcnt(0)
+                                                             must happen after
+                                                             any preceding
+                                                             local/generic
+                                                             load/load
+                                                             atomic/store/store
+                                                             atomic/atomicrmw.
+                                                           - Must happen before
+                                                             any following
+                                                             global/generic
+                                                             load/load
+                                                             atomic/store/store
+                                                             atomic/atomicrmw.
+                                                           - Ensures that all
+                                                             memory operations
+                                                             have
+                                                             completed before
+                                                             performing any
+                                                             following global
+                                                             memory operations.
+                                                           - Ensures that the
+                                                             preceding
+                                                             local/generic load
+                                                             atomic/atomicrmw
+                                                             with an equal or
+                                                             wider sync scope
+                                                             and memory ordering
+                                                             stronger than
+                                                             unordered (this is
+                                                             termed the
+                                                             acquire-fence-paired-atomic)
+                                                             has completed
+                                                             before following
+                                                             global memory
+                                                             operations. This
+                                                             satisfies the
+                                                             requirements of
+                                                             acquire.
+                                                           - Ensures that all
+                                                             previous memory
+                                                             operations have
+                                                             completed before a
+                                                             following
+                                                             local/generic store
+                                                             atomic/atomicrmw
+                                                             with an equal or
+                                                             wider sync scope
+                                                             and memory ordering
+                                                             stronger than
+                                                             unordered (this is
+                                                             termed the
+                                                             release-fence-paired-atomic).
+                                                             This satisfies the
+                                                             requirements of
+                                                             release.
+                                                           - Must happen before
+                                                             the following
+                                                             buffer_wbinvl1_vol.
+                                                           - Ensures that the
+                                                             acquire-fence-paired
+                                                             atomic has completed
+                                                             before invalidating
+                                                             the
+                                                             cache. Therefore
+                                                             any following
+                                                             locations read must
+                                                             be no older than
+                                                             the value read by
+                                                             the
+                                                             acquire-fence-paired-atomic.
+
+                                                         3. buffer_wbinvl1_vol
+
+                                                           - If not TgSplit execution
+                                                             mode, omit.
+                                                           - Ensures that
+                                                             following
+                                                             loads will not see
+                                                             stale data.
+
+     fence        acq_rel      - agent        *none*     1. s_waitcnt lgkmcnt(0) &
+                                                            vmcnt(0)
+
+                                                           - If TgSplit execution mode,
+                                                             omit lgkmcnt(0).
+                                                           - If OpenCL and
+                                                             address space is
+                                                             not generic, omit
+                                                             lgkmcnt(0).
+                                                           - However, since LLVM
+                                                             currently has no
+                                                             address space on
+                                                             the fence need to
+                                                             conservatively
+                                                             always generate
+                                                             (see comment for
+                                                             previous fence).
+                                                           - Could be split into
+                                                             separate s_waitcnt
+                                                             vmcnt(0) and
+                                                             s_waitcnt
+                                                             lgkmcnt(0) to allow
+                                                             them to be
+                                                             independently moved
+                                                             according to the
+                                                             following rules.
+                                                           - s_waitcnt vmcnt(0)
+                                                             must happen after
+                                                             any preceding
+                                                             global/generic
+                                                             load/store/load
+                                                             atomic/store
+                                                             atomic/atomicrmw.
+                                                           - s_waitcnt lgkmcnt(0)
+                                                             must happen after
+                                                             any preceding
+                                                             local/generic
+                                                             load/store/load
+                                                             atomic/store
+                                                             atomic/atomicrmw.
+                                                           - Must happen before
+                                                             the following
+                                                             buffer_wbinvl1_vol.
+                                                           - Ensures that the
+                                                             preceding
+                                                             global/local/generic
+                                                             load
+                                                             atomic/atomicrmw
+                                                             with an equal or
+                                                             wider sync scope
+                                                             and memory ordering
+                                                             stronger than
+                                                             unordered (this is
+                                                             termed the
+                                                             acquire-fence-paired-atomic)
+                                                             has completed
+                                                             before invalidating
+                                                             the cache. This
+                                                             satisfies the
+                                                             requirements of
+                                                             acquire.
+                                                           - Ensures that all
+                                                             previous memory
+                                                             operations have
+                                                             completed before a
+                                                             following
+                                                             global/local/generic
+                                                             store
+                                                             atomic/atomicrmw
+                                                             with an equal or
+                                                             wider sync scope
+                                                             and memory ordering
+                                                             stronger than
+                                                             unordered (this is
+                                                             termed the
+                                                             release-fence-paired-atomic).
+                                                             This satisfies the
+                                                             requirements of
+                                                             release.
+
+                                                         2. buffer_wbinvl1_vol
+
+                                                           - Must happen before
+                                                             any following
+                                                             global/generic
+                                                             load/load
+                                                             atomic/store/store
+                                                             atomic/atomicrmw.
+                                                           - Ensures that
+                                                             following loads
+                                                             will not see stale
+                                                             global data. This
+                                                             satisfies the
+                                                             requirements of
+                                                             acquire.
+
+     fence        acq_rel      - system       *none*     1. buffer_wbl2
+
+                                                           - If OpenCL and
+                                                             address space is
+                                                             local, omit.
+                                                           - Must happen before
+                                                             following s_waitcnt.
+                                                           - Performs L2 writeback to
+                                                             ensure previous
+                                                             global/generic
+                                                             store/atomicrmw are
+                                                             visible at system scope.
+
+                                                         2. s_waitcnt lgkmcnt(0) &
+                                                            vmcnt(0)
+
+                                                           - If TgSplit execution mode,
+                                                             omit lgkmcnt(0).
+                                                           - If OpenCL and
+                                                             address space is
+                                                             not generic, omit
+                                                             lgkmcnt(0).
+                                                           - However, since LLVM
+                                                             currently has no
+                                                             address space on
+                                                             the fence need to
+                                                             conservatively
+                                                             always generate
+                                                             (see comment for
+                                                             previous fence).
+                                                           - Could be split into
+                                                             separate s_waitcnt
+                                                             vmcnt(0) and
+                                                             s_waitcnt
+                                                             lgkmcnt(0) to allow
+                                                             them to be
+                                                             independently moved
+                                                             according to the
+                                                             following rules.
+                                                           - s_waitcnt vmcnt(0)
+                                                             must happen after
+                                                             any preceding
+                                                             global/generic
+                                                             load/store/load
+                                                             atomic/store
+                                                             atomic/atomicrmw.
+                                                           - s_waitcnt lgkmcnt(0)
+                                                             must happen after
+                                                             any preceding
+                                                             local/generic
+                                                             load/store/load
+                                                             atomic/store
+                                                             atomic/atomicrmw.
+                                                           - Must happen before
+                                                             the following buffer_invl2 and
+                                                             buffer_wbinvl1_vol.
+                                                           - Ensures that the
+                                                             preceding
+                                                             global/local/generic
+                                                             load
+                                                             atomic/atomicrmw
+                                                             with an equal or
+                                                             wider sync scope
+                                                             and memory ordering
+                                                             stronger than
+                                                             unordered (this is
+                                                             termed the
+                                                             acquire-fence-paired-atomic)
+                                                             has completed
+                                                             before invalidating
+                                                             the cache. This
+                                                             satisfies the
+                                                             requirements of
+                                                             acquire.
+                                                           - Ensures that all
+                                                             previous memory
+                                                             operations have
+                                                             completed before a
+                                                             following
+                                                             global/local/generic
+                                                             store
+                                                             atomic/atomicrmw
+                                                             with an equal or
+                                                             wider sync scope
+                                                             and memory ordering
+                                                             stronger than
+                                                             unordered (this is
+                                                             termed the
+                                                             release-fence-paired-atomic).
+                                                             This satisfies the
+                                                             requirements of
+                                                             release.
+
+                                                         3.  buffer_invl2;
+                                                             buffer_wbinvl1_vol
+
+                                                           - Must happen before
+                                                             any following
+                                                             global/generic
+                                                             load/load
+                                                             atomic/store/store
+                                                             atomic/atomicrmw.
+                                                           - Ensures that
+                                                             following loads
+                                                             will not see stale
+                                                             MTYPE NC global data.
+                                                             MTYPE RW and CC memory will
+                                                             never be stale due to the
+                                                             memory probes.
+
+     **Sequential Consistent Atomic**
+     ------------------------------------------------------------------------------------
+     load atomic  seq_cst      - singlethread - global   *Same as corresponding
+                               - wavefront    - local    load atomic acquire,
+                                              - generic  except must generated
+                                                         all instructions even
+                                                         for OpenCL.*
+     load atomic  seq_cst      - workgroup    - global   1. s_waitcnt lgkm/vmcnt(0)
+                                              - generic
+                                                           - Use lgkmcnt(0) if not
+                                                             TgSplit execution mode
+                                                             and vmcnt(0) if TgSplit
+                                                             execution mode.
+                                                           - s_waitcnt lgkmcnt(0) must
+                                                             happen after
+                                                             preceding
+                                                             local/generic load
+                                                             atomic/store
+                                                             atomic/atomicrmw
+                                                             with memory
+                                                             ordering of seq_cst
+                                                             and with equal or
+                                                             wider sync scope.
+                                                             (Note that seq_cst
+                                                             fences have their
+                                                             own s_waitcnt
+                                                             lgkmcnt(0) and so do
+                                                             not need to be
+                                                             considered.)
+                                                           - s_waitcnt vmcnt(0)
+                                                             must happen after
+                                                             preceding
+                                                             global/generic load
+                                                             atomic/store
+                                                             atomic/atomicrmw
+                                                             with memory
+                                                             ordering of seq_cst
+                                                             and with equal or
+                                                             wider sync scope.
+                                                             (Note that seq_cst
+                                                             fences have their
+                                                             own s_waitcnt
+                                                             vmcnt(0) and so do
+                                                             not need to be
+                                                             considered.)
+                                                           - Ensures any
+                                                             preceding
+                                                             sequential
+                                                             consistent global/local
+                                                             memory instructions
+                                                             have completed
+                                                             before executing
+                                                             this sequentially
+                                                             consistent
+                                                             instruction. This
+                                                             prevents reordering
+                                                             a seq_cst store
+                                                             followed by a
+                                                             seq_cst load. (Note
+                                                             that seq_cst is
+                                                             stronger than
+                                                             acquire/release as
+                                                             the reordering of
+                                                             load acquire
+                                                             followed by a store
+                                                             release is
+                                                             prevented by the
+                                                             s_waitcnt of
+                                                             the release, but
+                                                             there is nothing
+                                                             preventing a store
+                                                             release followed by
+                                                             load acquire from
+                                                             completing out of
+                                                             order. The s_waitcnt
+                                                             could be placed after
+                                                             seq_store or before
+                                                             the seq_load. We
+                                                             choose the load to
+                                                             make the s_waitcnt be
+                                                             as late as possible
+                                                             so that the store
+                                                             may have already
+                                                             completed.)
+
+                                                         2. *Following
+                                                            instructions same as
+                                                            corresponding load
+                                                            atomic acquire,
+                                                            except must generated
+                                                            all instructions even
+                                                            for OpenCL.*
+     load atomic  seq_cst      - workgroup    - local    *If TgSplit execution mode,
+                                                         local address space cannot
+                                                         be used.*
+
+                                                         *Same as corresponding
+                                                         load atomic acquire,
+                                                         except must generated
+                                                         all instructions even
+                                                         for OpenCL.*
+
+     load atomic  seq_cst      - agent        - global   1. s_waitcnt lgkmcnt(0) &
+                               - system       - generic     vmcnt(0)
+
+                                                           - If TgSplit execution mode,
+                                                             omit lgkmcnt(0).
+                                                           - Could be split into
+                                                             separate s_waitcnt
+                                                             vmcnt(0)
+                                                             and s_waitcnt
+                                                             lgkmcnt(0) to allow
+                                                             them to be
+                                                             independently moved
+                                                             according to the
+                                                             following rules.
+                                                           - s_waitcnt lgkmcnt(0)
+                                                             must happen after
+                                                             preceding
+                                                             global/generic load
+                                                             atomic/store
+                                                             atomic/atomicrmw
+                                                             with memory
+                                                             ordering of seq_cst
+                                                             and with equal or
+                                                             wider sync scope.
+                                                             (Note that seq_cst
+                                                             fences have their
+                                                             own s_waitcnt
+                                                             lgkmcnt(0) and so do
+                                                             not need to be
+                                                             considered.)
+                                                           - s_waitcnt vmcnt(0)
+                                                             must happen after
+                                                             preceding
+                                                             global/generic load
+                                                             atomic/store
+                                                             atomic/atomicrmw
+                                                             with memory
+                                                             ordering of seq_cst
+                                                             and with equal or
+                                                             wider sync scope.
+                                                             (Note that seq_cst
+                                                             fences have their
+                                                             own s_waitcnt
+                                                             vmcnt(0) and so do
+                                                             not need to be
+                                                             considered.)
+                                                           - Ensures any
+                                                             preceding
+                                                             sequential
+                                                             consistent global
+                                                             memory instructions
+                                                             have completed
+                                                             before executing
+                                                             this sequentially
+                                                             consistent
+                                                             instruction. This
+                                                             prevents reordering
+                                                             a seq_cst store
+                                                             followed by a
+                                                             seq_cst load. (Note
+                                                             that seq_cst is
+                                                             stronger than
+                                                             acquire/release as
+                                                             the reordering of
+                                                             load acquire
+                                                             followed by a store
+                                                             release is
+                                                             prevented by the
+                                                             s_waitcnt of
+                                                             the release, but
+                                                             there is nothing
+                                                             preventing a store
+                                                             release followed by
+                                                             load acquire from
+                                                             completing out of
+                                                             order. The s_waitcnt
+                                                             could be placed after
+                                                             seq_store or before
+                                                             the seq_load. We
+                                                             choose the load to
+                                                             make the s_waitcnt be
+                                                             as late as possible
+                                                             so that the store
+                                                             may have already
+                                                             completed.)
+
+                                                         2. *Following
+                                                            instructions same as
+                                                            corresponding load
+                                                            atomic acquire,
+                                                            except must generated
+                                                            all instructions even
+                                                            for OpenCL.*
+     store atomic seq_cst      - singlethread - global   *Same as corresponding
+                               - wavefront    - local    store atomic release,
+                               - workgroup    - generic  except must generated
+                               - agent                   all instructions even
+                               - system                  for OpenCL.*
+     atomicrmw    seq_cst      - singlethread - global   *Same as corresponding
+                               - wavefront    - local    atomicrmw acq_rel,
+                               - workgroup    - generic  except must generated
+                               - agent                   all instructions even
+                               - system                  for OpenCL.*
+     fence        seq_cst      - singlethread *none*     *Same as corresponding
+                               - wavefront               fence acq_rel,
+                               - workgroup               except must generated
+                               - agent                   all instructions even
+                               - system                  for OpenCL.*
+     ============ ============ ============== ========== ================================
+
+.. _amdgpu-amdhsa-memory-model-gfx10:
+
+Memory Model GFX10
+++++++++++++++++++
+
+For GFX10:
+
+* Each agent has multiple shader arrays (SA).
+* Each SA has multiple work-group processors (WGP).
+* Each WGP has multiple compute units (CU).
+* Each CU has multiple SIMDs that execute wavefronts.
+* The wavefronts for a single work-group are executed in the same
+  WGP. In CU wavefront execution mode the wavefronts may be executed by
+  
diff erent SIMDs in the same CU. In WGP wavefront execution mode the
+  wavefronts may be executed by 
diff erent SIMDs in 
diff erent CUs in the same
+  WGP.
+* Each WGP has a single LDS memory shared by the wavefronts of the work-groups
+  executing on it.
+* All LDS operations of a WGP are performed as wavefront wide operations in a
+  global order and involve no caching. Completion is reported to a wavefront in
+  execution order.
+* The LDS memory has multiple request queues shared by the SIMDs of a
+  WGP. Therefore, the LDS operations performed by 
diff erent wavefronts of a
+  work-group can be reordered relative to each other, which can result in
+  reordering the visibility of vector memory operations with respect to LDS
+  operations of other wavefronts in the same work-group. A ``s_waitcnt
+  lgkmcnt(0)`` is required to ensure synchronization between LDS operations and
+  vector memory operations between wavefronts of a work-group, but not between
+  operations performed by the same wavefront.
+* The vector memory operations are performed as wavefront wide operations.
+  Completion of load/store/sample operations are reported to a wavefront in
+  execution order of other load/store/sample operations performed by that
+  wavefront.
+* The vector memory operations access a vector L0 cache. There is a single L0
+  cache per CU. Each SIMD of a CU accesses the same L0 cache. Therefore, no
+  special action is required for coherence between the lanes of a single
+  wavefront. However, a ``buffer_gl0_inv`` is required for coherence between
+  wavefronts executing in the same work-group as they may be executing on SIMDs
+  of 
diff erent CUs that access 
diff erent L0s. A ``buffer_gl0_inv`` is also
+  required for coherence between wavefronts executing in 
diff erent work-groups
+  as they may be executing on 
diff erent WGPs.
+* The scalar memory operations access a scalar L0 cache shared by all wavefronts
+  on a WGP. The scalar and vector L0 caches are not coherent. However, scalar
+  operations are used in a restricted way so do not impact the memory model. See
+  :ref:`amdgpu-amdhsa-memory-spaces`.
+* The vector and scalar memory L0 caches use an L1 cache shared by all WGPs on
+  the same SA. Therefore, no special action is required for coherence between
+  the wavefronts of a single work-group. However, a ``buffer_gl1_inv`` is
+  required for coherence between wavefronts executing in 
diff erent work-groups
+  as they may be executing on 
diff erent SAs that access 
diff erent L1s.
+* The L1 caches have independent quadrants to service disjoint ranges of virtual
+  addresses.
+* Each L0 cache has a separate request queue per L1 quadrant. Therefore, the
+  vector and scalar memory operations performed by 
diff erent wavefronts, whether
+  executing in the same or 
diff erent work-groups (which may be executing on
+  
diff erent CUs accessing 
diff erent L0s), can be reordered relative to each
+  other. A ``s_waitcnt vmcnt(0) & vscnt(0)`` is required to ensure
+  synchronization between vector memory operations of 
diff erent wavefronts. It
+  ensures a previous vector memory operation has completed before executing a
+  subsequent vector memory or LDS operation and so can be used to meet the
+  requirements of acquire, release and sequential consistency.
+* The L1 caches use an L2 cache shared by all SAs on the same agent.
+* The L2 cache has independent channels to service disjoint ranges of virtual
+  addresses.
+* Each L1 quadrant of a single SA accesses a 
diff erent L2 channel. Each L1
+  quadrant has a separate request queue per L2 channel. Therefore, the vector
+  and scalar memory operations performed by wavefronts executing in 
diff erent
+  work-groups (which may be executing on 
diff erent SAs) of an agent can be
+  reordered relative to each other. A ``s_waitcnt vmcnt(0) & vscnt(0)`` is
+  required to ensure synchronization between vector memory operations of
+  
diff erent SAs. It ensures a previous vector memory operation has completed
+  before executing a subsequent vector memory and so can be used to meet the
+  requirements of acquire, release and sequential consistency.
+* The L2 cache can be kept coherent with other agents on some targets, or ranges
+  of virtual addresses can be set up to bypass it to ensure system coherence.
+
+Scalar memory operations are only used to access memory that is proven to not
+change during the execution of the kernel dispatch. This includes constant
+address space and global address space for program scope ``const`` variables.
+Therefore, the kernel machine code does not have to maintain the scalar cache to
+ensure it is coherent with the vector caches. The scalar and vector caches are
+invalidated between kernel dispatches by CP since constant address space data
+may change between kernel dispatch executions. See
+:ref:`amdgpu-amdhsa-memory-spaces`.
+
+The one exception is if scalar writes are used to spill SGPR registers. In this
+case the AMDGPU backend ensures the memory location used to spill is never
+accessed by vector memory operations at the same time. If scalar writes are used
+then a ``s_dcache_wb`` is inserted before the ``s_endpgm`` and before a function
+return since the locations may be used for vector memory instructions by a
+future wavefront that uses the same scratch area, or a function call that
+creates a frame at the same address, respectively. There is no need for a
+``s_dcache_inv`` as all scalar writes are write-before-read in the same thread.
+
+For kernarg backing memory:
+
+* CP invalidates the L0 and L1 caches at the start of each kernel dispatch.
+* On dGPU the kernarg backing memory is accessed as MTYPE UC (uncached) to avoid
+  needing to invalidate the L2 cache.
+* On APU the kernarg backing memory is accessed as MTYPE CC (cache coherent) and
+  so the L2 cache will be coherent with the CPU and other agents.
+
+Scratch backing memory (which is used for the private address space) is accessed
+with MTYPE NC (non-coherent). Since the private address space is only accessed
+by a single thread, and is always write-before-read, there is never a need to
+invalidate these entries from the L0 or L1 caches.
+
+Wavefronts are executed in native mode with in-order reporting of loads and
+sample instructions. In this mode vmcnt reports completion of load, atomic with
+return and sample instructions in order, and the vscnt reports the completion of
+store and atomic without return in order. See ``MEM_ORDERED`` field in
+:ref:`amdgpu-amdhsa-compute_pgm_rsrc1-gfx6-gfx10-table`.
+
+Wavefronts can be executed in WGP or CU wavefront execution mode:
+
+* In WGP wavefront execution mode the wavefronts of a work-group are executed
+  on the SIMDs of both CUs of the WGP. Therefore, explicit management of the per
+  CU L0 caches is required for work-group synchronization. Also accesses to L1
+  at work-group scope need to be explicitly ordered as the accesses from
+  
diff erent CUs are not ordered.
+* In CU wavefront execution mode the wavefronts of a work-group are executed on
+  the SIMDs of a single CU of the WGP. Therefore, all global memory access by
+  the work-group access the same L0 which in turn ensures L1 accesses are
+  ordered and so do not require explicit management of the caches for
+  work-group synchronization.
+
+See ``WGP_MODE`` field in
+:ref:`amdgpu-amdhsa-compute_pgm_rsrc1-gfx6-gfx10-table` and
+:ref:`amdgpu-target-features`.
+
+The code sequences used to implement the memory model for GFX10 are defined in
+table :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx10-table`.
+
+  .. table:: AMDHSA Memory Model Code Sequences GFX10
+     :name: amdgpu-amdhsa-memory-model-code-sequences-gfx10-table
+
+     ============ ============ ============== ========== ================================
+     LLVM Instr   LLVM Memory  LLVM Memory    AMDGPU     AMDGPU Machine Code
+                  Ordering     Sync Scope     Address    GFX10
+                                              Space
+     ============ ============ ============== ========== ================================
+     **Non-Atomic**
+     ------------------------------------------------------------------------------------
+     load         *none*       *none*         - global   - !volatile & !nontemporal
+                                              - generic
+                                              - private    1. buffer/global/flat_load
+                                              - constant
+                                                         - !volatile & nontemporal
+
+                                                           1. buffer/global/flat_load
+                                                              slc=1
+
+                                                         - volatile
+
+                                                           1. buffer/global/flat_load
+                                                              glc=1 dlc=1
+                                                           2. s_waitcnt vmcnt(0)
+
+                                                            - Must happen before
+                                                              any following volatile
+                                                              global/generic
+                                                              load/store.
+                                                            - Ensures that
+                                                              volatile
+                                                              operations to
+                                                              
diff erent
+                                                              addresses will not
+                                                              be reordered by
+                                                              hardware.
+
+     load         *none*       *none*         - local    1. ds_load
+     store        *none*       *none*         - global   - !volatile & !nontemporal
+                                              - generic
+                                              - private    1. buffer/global/flat_store
+                                              - constant
+                                                         - !volatile & nontemporal
+
+                                                            1. buffer/global/flat_store
+                                                               slc=1
+
+                                                         - volatile
+
+                                                            1. buffer/global/flat_store
+                                                            2. s_waitcnt vscnt(0)
+
+                                                            - Must happen before
+                                                              any following volatile
+                                                              global/generic
+                                                              load/store.
+                                                            - Ensures that
+                                                              volatile
+                                                              operations to
+                                                              
diff erent
+                                                              addresses will not
+                                                              be reordered by
+                                                              hardware.
+
+     store        *none*       *none*         - local    1. ds_store
+     **Unordered Atomic**
+     ------------------------------------------------------------------------------------
+     load atomic  unordered    *any*          *any*      *Same as non-atomic*.
+     store atomic unordered    *any*          *any*      *Same as non-atomic*.
+     atomicrmw    unordered    *any*          *any*      *Same as monotonic atomic*.
+     **Monotonic Atomic**
+     ------------------------------------------------------------------------------------
+     load atomic  monotonic    - singlethread - global   1. buffer/global/flat_load
+                               - wavefront    - generic
+     load atomic  monotonic    - workgroup    - global   1. buffer/global/flat_load
+                                              - generic     glc=1
+
+                                                           - If CU wavefront execution
+                                                             mode, omit glc=1.
+
+     load atomic  monotonic    - singlethread - local    1. ds_load
+                               - wavefront
+                               - workgroup
+     load atomic  monotonic    - agent        - global   1. buffer/global/flat_load
+                               - system       - generic     glc=1 dlc=1
+     store atomic monotonic    - singlethread - global   1. buffer/global/flat_store
+                               - wavefront    - generic
+                               - workgroup
+                               - agent
+                               - system
+     store atomic monotonic    - singlethread - local    1. ds_store
+                               - wavefront
+                               - workgroup
+     atomicrmw    monotonic    - singlethread - global   1. buffer/global/flat_atomic
+                               - wavefront    - generic
+                               - workgroup
+                               - agent
+                               - system
+     atomicrmw    monotonic    - singlethread - local    1. ds_atomic
+                               - wavefront
+                               - workgroup
+     **Acquire Atomic**
+     ------------------------------------------------------------------------------------
+     load atomic  acquire      - singlethread - global   1. buffer/global/ds/flat_load
+                               - wavefront    - local
+                                              - generic
+     load atomic  acquire      - workgroup    - global   1. buffer/global_load glc=1
+
+                                                           - If CU wavefront execution
+                                                             mode, omit glc=1.
+
+                                                         2. s_waitcnt vmcnt(0)
+
+                                                           - If CU wavefront execution
+                                                             mode, omit.
+                                                           - Must happen before
+                                                             the following buffer_gl0_inv
+                                                             and before any following
+                                                             global/generic
+                                                             load/load
+                                                             atomic/store/store
+                                                             atomic/atomicrmw.
+
+                                                         3. buffer_gl0_inv
+
+                                                           - If CU wavefront execution
+                                                             mode, omit.
+                                                           - Ensures that
+                                                             following
+                                                             loads will not see
+                                                             stale data.
+
+     load atomic  acquire      - workgroup    - local    1. ds_load
+                                                         2. s_waitcnt lgkmcnt(0)
+
+                                                           - If OpenCL, omit.
+                                                           - Must happen before
+                                                             the following buffer_gl0_inv
+                                                             and before any following
+                                                             global/generic load/load
+                                                             atomic/store/store
+                                                             atomic/atomicrmw.
+                                                           - Ensures any
+                                                             following global
+                                                             data read is no
+                                                             older than the local load
+                                                             atomic value being
+                                                             acquired.
+
+                                                         3. buffer_gl0_inv
+
+                                                           - If CU wavefront execution
+                                                             mode, omit.
+                                                           - If OpenCL, omit.
+                                                           - Ensures that
+                                                             following
+                                                             loads will not see
+                                                             stale data.
+
+     load atomic  acquire      - workgroup    - generic  1. flat_load glc=1
+
+                                                           - If CU wavefront execution
+                                                             mode, omit glc=1.
+
+                                                         2. s_waitcnt lgkmcnt(0) &
+                                                            vmcnt(0)
+
+                                                           - If CU wavefront execution
+                                                             mode, omit vmcnt(0).
+                                                           - If OpenCL, omit
+                                                             lgkmcnt(0).
+                                                           - Must happen before
+                                                             the following
+                                                             buffer_gl0_inv and any
+                                                             following global/generic
+                                                             load/load
+                                                             atomic/store/store
+                                                             atomic/atomicrmw.
+                                                           - Ensures any
+                                                             following global
+                                                             data read is no
+                                                             older than a local load
+                                                             atomic value being
+                                                             acquired.
+
+                                                         3. buffer_gl0_inv
+
+                                                           - If CU wavefront execution
+                                                             mode, omit.
+                                                           - Ensures that
+                                                             following
+                                                             loads will not see
+                                                             stale data.
+
+     load atomic  acquire      - agent        - global   1. buffer/global_load
+                               - system                     glc=1 dlc=1
+                                                         2. s_waitcnt vmcnt(0)
+
+                                                           - Must happen before
+                                                             following
+                                                             buffer_gl*_inv.
+                                                           - Ensures the load
+                                                             has completed
+                                                             before invalidating
+                                                             the caches.
+
+                                                         3. buffer_gl0_inv;
+                                                            buffer_gl1_inv
+
+                                                           - Must happen before
+                                                             any following
+                                                             global/generic
+                                                             load/load
+                                                             atomic/atomicrmw.
+                                                           - Ensures that
+                                                             following
+                                                             loads will not see
+                                                             stale global data.
+
+     load atomic  acquire      - agent        - generic  1. flat_load glc=1 dlc=1
+                               - system                  2. s_waitcnt vmcnt(0) &
+                                                            lgkmcnt(0)
+
+                                                           - If OpenCL omit
+                                                             lgkmcnt(0).
+                                                           - Must happen before
+                                                             following
+                                                             buffer_gl*_invl.
+                                                           - Ensures the flat_load
+                                                             has completed
+                                                             before invalidating
+                                                             the caches.
+
+                                                         3. buffer_gl0_inv;
+                                                            buffer_gl1_inv
+
+                                                           - Must happen before
+                                                             any following
+                                                             global/generic
+                                                             load/load
+                                                             atomic/atomicrmw.
+                                                           - Ensures that
+                                                             following loads
+                                                             will not see stale
+                                                             global data.
+
+     atomicrmw    acquire      - singlethread - global   1. buffer/global/ds/flat_atomic
+                               - wavefront    - local
                                               - generic
      atomicrmw    acquire      - workgroup    - global   1. buffer/global_atomic
                                                          2. s_waitcnt vm/vscnt(0)
@@ -9099,6 +11549,8 @@ default value for all keys is 0, with the following exceptions:
 - *kernarg_segment_alignment*, *group_segment_alignment*, and
   *private_segment_alignment* default to 4. Note that alignments are specified
   as a power of 2, so a value of **n** means an alignment of 2^ **n**.
+- *enable_tg_split* defaults to 1 if target feature ``tgsplit`` is enabled for
+  GFX90A onwards.
 - *enable_wgp_mode* defaults to 1 if target feature ``cumode`` is disabled for
   GFX10 onwards.
 - *enable_mem_ordered* defaults to 1 for GFX10 onwards.
@@ -9312,6 +11764,9 @@ terminated by an ``.end_amdhsa_kernel`` directive.
      ``.amdhsa_next_free_sgpr``                               Required            GFX6-GFX10   Maximum SGPR number explicitly referenced, plus one.
                                                                                                Used to calculate GRANULATED_WAVEFRONT_SGPR_COUNT in
                                                                                                :ref:`amdgpu-amdhsa-compute_pgm_rsrc1-gfx6-gfx10-table`.
+     ``.amdhsa_accum_offset``                                 Required            GFX90A       Offset of a first AccVGPR in the unified register file.
+                                                                                               Used to calculate ACCUM_OFFSET in
+                                                                                               :ref:`amdgpu-amdhsa-compute_pgm_rsrc3-gfx90a-table`.
      ``.amdhsa_reserve_vcc``                                  1                   GFX6-GFX10   Whether the kernel may use the special VCC SGPR.
                                                                                                Used to calculate GRANULATED_WAVEFRONT_SGPR_COUNT in
                                                                                                :ref:`amdgpu-amdhsa-compute_pgm_rsrc1-gfx6-gfx10-table`.
@@ -9345,6 +11800,10 @@ terminated by an ``.end_amdhsa_kernel`` directive.
                                                                                                :ref:`amdgpu-amdhsa-compute_pgm_rsrc1-gfx6-gfx10-table`.
      ``.amdhsa_fp16_overflow``                                0                   GFX9-GFX10   Controls FP16_OVFL in
                                                                                                :ref:`amdgpu-amdhsa-compute_pgm_rsrc1-gfx6-gfx10-table`.
+     ``.amdhsa_tg_split``                                     Target              GFX90A       Controls TG_SPLIT in
+                                                              Feature                          :ref:`amdgpu-amdhsa-compute_pgm_rsrc3-gfx90a-table`.
+                                                              Specific
+                                                              (tgsplit)
      ``.amdhsa_workgroup_processor_mode``                     Target              GFX10        Controls ENABLE_WGP_MODE in
                                                               Feature                          :ref:`amdgpu-amdhsa-kernel-descriptor-v3-table`.
                                                               Specific

diff  --git a/llvm/include/llvm/BinaryFormat/ELF.h b/llvm/include/llvm/BinaryFormat/ELF.h
index a33b6f37e62a..e6bf57f35eaa 100644
--- a/llvm/include/llvm/BinaryFormat/ELF.h
+++ b/llvm/include/llvm/BinaryFormat/ELF.h
@@ -719,6 +719,7 @@ enum : unsigned {
   EF_AMDGPU_MACH_AMDGCN_GFX906        = 0x02f,
   EF_AMDGPU_MACH_AMDGCN_GFX908        = 0x030,
   EF_AMDGPU_MACH_AMDGCN_GFX909        = 0x031,
+  EF_AMDGPU_MACH_AMDGCN_GFX90A        = 0x03f,
   EF_AMDGPU_MACH_AMDGCN_GFX90C        = 0x032,
   EF_AMDGPU_MACH_AMDGCN_GFX1010       = 0x033,
   EF_AMDGPU_MACH_AMDGCN_GFX1011       = 0x034,
@@ -733,7 +734,7 @@ enum : unsigned {
 
   // First/last AMDGCN-based processors.
   EF_AMDGPU_MACH_AMDGCN_FIRST = EF_AMDGPU_MACH_AMDGCN_GFX600,
-  EF_AMDGPU_MACH_AMDGCN_LAST = EF_AMDGPU_MACH_AMDGCN_GFX805,
+  EF_AMDGPU_MACH_AMDGCN_LAST = EF_AMDGPU_MACH_AMDGCN_GFX90A,
 
   // Indicates if the "xnack" target feature is enabled for all code contained
   // in the object.

diff  --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 30bdef28230b..088aadcfad25 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -1029,6 +1029,10 @@ def int_amdgcn_raw_buffer_atomic_cmpswap : Intrinsic<
 // gfx908 intrinsic
 def int_amdgcn_raw_buffer_atomic_fadd : AMDGPURawBufferAtomic<llvm_anyfloat_ty>;
 
+// gfx90a intrinsics
+def int_amdgcn_raw_buffer_atomic_fmin : AMDGPURawBufferAtomic<llvm_anyfloat_ty>;
+def int_amdgcn_raw_buffer_atomic_fmax : AMDGPURawBufferAtomic<llvm_anyfloat_ty>;
+
 class AMDGPUStructBufferAtomic<LLVMType data_ty = llvm_any_ty, bit NoRtn = false> : Intrinsic <
   !if(NoRtn, [], [data_ty]),
   [!if(NoRtn, data_ty, LLVMMatchType<0>),  // vdata(VGPR)
@@ -1066,6 +1070,10 @@ def int_amdgcn_struct_buffer_atomic_cmpswap : Intrinsic<
 // gfx908 intrinsic
 def int_amdgcn_struct_buffer_atomic_fadd : AMDGPUStructBufferAtomic<llvm_anyfloat_ty>;
 
+// gfx90a intrinsics
+def int_amdgcn_struct_buffer_atomic_fmin : AMDGPUStructBufferAtomic<llvm_anyfloat_ty>;
+def int_amdgcn_struct_buffer_atomic_fmax : AMDGPUStructBufferAtomic<llvm_anyfloat_ty>;
+
 
 // Obsolescent tbuffer intrinsics.
 def int_amdgcn_tbuffer_load : Intrinsic <
@@ -1995,6 +2003,65 @@ def int_amdgcn_mfma_f32_16x16x8bf16 : GCCBuiltin<"__builtin_amdgcn_mfma_f32_16x1
             [IntrConvergent, IntrNoMem, IntrWillReturn,
              ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>]>;
 
+//===----------------------------------------------------------------------===//
+// gfx90a intrinsics
+// ===----------------------------------------------------------------------===//
+
+def int_amdgcn_global_atomic_fmin : AMDGPUGlobalAtomicRtn<llvm_anyfloat_ty>;
+def int_amdgcn_global_atomic_fmax : AMDGPUGlobalAtomicRtn<llvm_anyfloat_ty>;
+def int_amdgcn_flat_atomic_fadd   : AMDGPUGlobalAtomicRtn<llvm_anyfloat_ty>;
+def int_amdgcn_flat_atomic_fmin   : AMDGPUGlobalAtomicRtn<llvm_anyfloat_ty>;
+def int_amdgcn_flat_atomic_fmax   : AMDGPUGlobalAtomicRtn<llvm_anyfloat_ty>;
+
+def int_amdgcn_mfma_f32_32x32x4bf16_1k : GCCBuiltin<"__builtin_amdgcn_mfma_f32_32x32x4bf16_1k">,
+  Intrinsic<[llvm_v32f32_ty],
+            [llvm_v4i16_ty, llvm_v4i16_ty, llvm_v32f32_ty,
+            llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+            [IntrConvergent, IntrNoMem,
+             ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>]>;
+
+def int_amdgcn_mfma_f32_16x16x4bf16_1k : GCCBuiltin<"__builtin_amdgcn_mfma_f32_16x16x4bf16_1k">,
+  Intrinsic<[llvm_v16f32_ty],
+            [llvm_v4i16_ty, llvm_v4i16_ty, llvm_v16f32_ty,
+            llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+            [IntrConvergent, IntrNoMem,
+             ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>]>;
+
+def int_amdgcn_mfma_f32_4x4x4bf16_1k : GCCBuiltin<"__builtin_amdgcn_mfma_f32_4x4x4bf16_1k">,
+  Intrinsic<[llvm_v4f32_ty],
+            [llvm_v4i16_ty, llvm_v4i16_ty, llvm_v4f32_ty,
+            llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+            [IntrConvergent, IntrNoMem,
+             ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>]>;
+
+def int_amdgcn_mfma_f32_32x32x8bf16_1k : GCCBuiltin<"__builtin_amdgcn_mfma_f32_32x32x8bf16_1k">,
+  Intrinsic<[llvm_v16f32_ty],
+            [llvm_v4i16_ty, llvm_v4i16_ty, llvm_v16f32_ty,
+            llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+            [IntrConvergent, IntrNoMem,
+             ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>]>;
+
+def int_amdgcn_mfma_f32_16x16x16bf16_1k : GCCBuiltin<"__builtin_amdgcn_mfma_f32_16x16x16bf16_1k">,
+  Intrinsic<[llvm_v4f32_ty],
+            [llvm_v4i16_ty, llvm_v4i16_ty, llvm_v4f32_ty,
+            llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+            [IntrConvergent, IntrNoMem,
+             ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>]>;
+
+def int_amdgcn_mfma_f64_16x16x4f64 : GCCBuiltin<"__builtin_amdgcn_mfma_f64_16x16x4f64">,
+  Intrinsic<[llvm_v4f64_ty],
+            [llvm_double_ty, llvm_double_ty, llvm_v4f64_ty,
+            llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+            [IntrConvergent, IntrNoMem,
+             ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>]>;
+
+def int_amdgcn_mfma_f64_4x4x4f64 : GCCBuiltin<"__builtin_amdgcn_mfma_f64_4x4x4f64">,
+  Intrinsic<[llvm_double_ty],
+            [llvm_double_ty, llvm_double_ty, llvm_double_ty,
+            llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+            [IntrConvergent, IntrNoMem,
+             ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>]>;
+
 //===----------------------------------------------------------------------===//
 // Special Intrinsics for backend internal use only. No frontend
 // should emit calls to these.

diff  --git a/llvm/include/llvm/Support/AMDHSAKernelDescriptor.h b/llvm/include/llvm/Support/AMDHSAKernelDescriptor.h
index bd84da43dff7..7177557b6ff0 100644
--- a/llvm/include/llvm/Support/AMDHSAKernelDescriptor.h
+++ b/llvm/include/llvm/Support/AMDHSAKernelDescriptor.h
@@ -122,14 +122,27 @@ enum : int32_t {
 };
 #undef COMPUTE_PGM_RSRC2
 
-// Compute program resource register 3. Must match hardware definition.
-#define COMPUTE_PGM_RSRC3(NAME, SHIFT, WIDTH) \
-  AMDHSA_BITS_ENUM_ENTRY(COMPUTE_PGM_RSRC3_ ## NAME, SHIFT, WIDTH)
+// Compute program resource register 3 for GFX90A+. Must match hardware
+// definition.
+#define COMPUTE_PGM_RSRC3_GFX90A(NAME, SHIFT, WIDTH) \
+  AMDHSA_BITS_ENUM_ENTRY(COMPUTE_PGM_RSRC3_GFX90A_ ## NAME, SHIFT, WIDTH)
 enum : int32_t {
-  COMPUTE_PGM_RSRC3(SHARED_VGPR_COUNT, 0, 4), // GFX10+
-  COMPUTE_PGM_RSRC3(RESERVED0, 4, 28),
+  COMPUTE_PGM_RSRC3_GFX90A(ACCUM_OFFSET, 0, 6),
+  COMPUTE_PGM_RSRC3_GFX90A(RESERVED0, 6, 10),
+  COMPUTE_PGM_RSRC3_GFX90A(TG_SPLIT, 16, 1),
+  COMPUTE_PGM_RSRC3_GFX90A(RESERVED1, 17, 15),
 };
-#undef COMPUTE_PGM_RSRC3
+#undef COMPUTE_PGM_RSRC3_GFX90A
+
+// Compute program resource register 3 for GFX10+. Must match hardware
+// definition.
+#define COMPUTE_PGM_RSRC3_GFX10(NAME, SHIFT, WIDTH) \
+  AMDHSA_BITS_ENUM_ENTRY(COMPUTE_PGM_RSRC3_GFX10_ ## NAME, SHIFT, WIDTH)
+enum : int32_t {
+  COMPUTE_PGM_RSRC3_GFX10(SHARED_VGPR_COUNT, 0, 4), // GFX10+
+  COMPUTE_PGM_RSRC3_GFX10(RESERVED0, 4, 28),
+};
+#undef COMPUTE_PGM_RSRC3_GFX10
 
 // Kernel code properties. Must be kept backwards compatible.
 #define KERNEL_CODE_PROPERTY(NAME, SHIFT, WIDTH) \
@@ -155,7 +168,7 @@ struct kernel_descriptor_t {
   uint8_t reserved0[8];
   int64_t kernel_code_entry_byte_offset;
   uint8_t reserved1[20];
-  uint32_t compute_pgm_rsrc3; // GFX10+
+  uint32_t compute_pgm_rsrc3; // GFX10+ and GFX90A+
   uint32_t compute_pgm_rsrc1;
   uint32_t compute_pgm_rsrc2;
   uint16_t kernel_code_properties;

diff  --git a/llvm/include/llvm/Support/TargetParser.h b/llvm/include/llvm/Support/TargetParser.h
index 450e713f27f2..03d4e65d9562 100644
--- a/llvm/include/llvm/Support/TargetParser.h
+++ b/llvm/include/llvm/Support/TargetParser.h
@@ -83,7 +83,8 @@ enum GPUKind : uint32_t {
   GK_GFX906 = 63,
   GK_GFX908 = 64,
   GK_GFX909 = 65,
-  GK_GFX90C = 66,
+  GK_GFX90A = 66,
+  GK_GFX90C = 67,
 
   GK_GFX1010 = 71,
   GK_GFX1011 = 72,

diff  --git a/llvm/lib/Object/ELFObjectFile.cpp b/llvm/lib/Object/ELFObjectFile.cpp
index faf614a1a97a..8285e3d9e659 100644
--- a/llvm/lib/Object/ELFObjectFile.cpp
+++ b/llvm/lib/Object/ELFObjectFile.cpp
@@ -457,6 +457,8 @@ StringRef ELFObjectFileBase::getAMDGPUCPUName() const {
     return "gfx908";
   case ELF::EF_AMDGPU_MACH_AMDGCN_GFX909:
     return "gfx909";
+  case ELF::EF_AMDGPU_MACH_AMDGCN_GFX90A:
+    return "gfx90a";
   case ELF::EF_AMDGPU_MACH_AMDGCN_GFX90C:
     return "gfx90c";
 

diff  --git a/llvm/lib/ObjectYAML/ELFYAML.cpp b/llvm/lib/ObjectYAML/ELFYAML.cpp
index 1eedc646e820..8267ac9c596d 100644
--- a/llvm/lib/ObjectYAML/ELFYAML.cpp
+++ b/llvm/lib/ObjectYAML/ELFYAML.cpp
@@ -535,6 +535,7 @@ void ScalarBitSetTraits<ELFYAML::ELF_EF>::bitset(IO &IO,
     BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX906, EF_AMDGPU_MACH);
     BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX908, EF_AMDGPU_MACH);
     BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX909, EF_AMDGPU_MACH);
+    BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX90A, EF_AMDGPU_MACH);
     BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX90C, EF_AMDGPU_MACH);
     BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX1010, EF_AMDGPU_MACH);
     BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX1011, EF_AMDGPU_MACH);

diff  --git a/llvm/lib/Support/TargetParser.cpp b/llvm/lib/Support/TargetParser.cpp
index 3ccdc55b305d..abc2f68a60a3 100644
--- a/llvm/lib/Support/TargetParser.cpp
+++ b/llvm/lib/Support/TargetParser.cpp
@@ -104,6 +104,7 @@ constexpr GPUInfo AMDGCNGPUs[] = {
   {{"gfx906"},    {"gfx906"},  GK_GFX906,  FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_XNACK|FEATURE_SRAMECC},
   {{"gfx908"},    {"gfx908"},  GK_GFX908,  FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_XNACK|FEATURE_SRAMECC},
   {{"gfx909"},    {"gfx909"},  GK_GFX909,  FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_XNACK},
+  {{"gfx90a"},    {"gfx90a"},  GK_GFX90A,  FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_XNACK|FEATURE_SRAMECC},
   {{"gfx90c"},    {"gfx90c"},  GK_GFX90C,  FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_XNACK},
   {{"gfx1010"},   {"gfx1010"}, GK_GFX1010, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32|FEATURE_XNACK},
   {{"gfx1011"},   {"gfx1011"}, GK_GFX1011, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32|FEATURE_XNACK},
@@ -213,6 +214,7 @@ AMDGPU::IsaVersion AMDGPU::getIsaVersion(StringRef GPU) {
   case GK_GFX906:  return {9, 0, 6};
   case GK_GFX908:  return {9, 0, 8};
   case GK_GFX909:  return {9, 0, 9};
+  case GK_GFX90A:  return {9, 0, 10};
   case GK_GFX90C:  return {9, 0, 12};
   case GK_GFX1010: return {10, 1, 0};
   case GK_GFX1011: return {10, 1, 1};

diff  --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index c352c0097c5c..8cc0d2b4e78b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -51,6 +51,12 @@ def HalfRate64Ops : SubtargetFeature<"half-rate-64-ops",
   "Most fp64 instructions are half rate instead of quarter"
 >;
 
+def FullRate64Ops : SubtargetFeature<"full-rate-64-ops",
+  "FullRate64Ops",
+  "true",
+  "Most fp64 instructions are full rate"
+>;
+
 def FeatureFlatAddressSpace : SubtargetFeature<"flat-address-space",
   "FlatAddressSpace",
   "true",
@@ -148,6 +154,12 @@ def FeatureXNACK : SubtargetFeature<"xnack",
   "Enable XNACK support"
 >;
 
+def FeatureTgSplit : SubtargetFeature<"tgsplit",
+  "EnableTgSplit",
+  "true",
+  "Enable threadgroup split execution"
+>;
+
 def FeatureCuMode : SubtargetFeature<"cumode",
   "EnableCuMode",
   "true",
@@ -272,6 +284,12 @@ def FeatureGFX9Insts : SubtargetFeature<"gfx9-insts",
   "Additional instructions for GFX9+"
 >;
 
+def FeatureGFX90AInsts : SubtargetFeature<"gfx90a-insts",
+  "GFX90AInsts",
+  "true",
+  "Additional instructions for GFX90A+"
+>;
+
 def FeatureGFX10Insts : SubtargetFeature<"gfx10-insts",
   "GFX10Insts",
   "true",
@@ -387,6 +405,18 @@ def FeatureDPP8 : SubtargetFeature<"dpp8",
   "Support DPP8 (Data Parallel Primitives) extension"
 >;
 
+def Feature64BitDPP : SubtargetFeature<"dpp-64bit",
+  "Has64BitDPP",
+  "true",
+  "Support DPP (Data Parallel Primitives) extension"
+>;
+
+def FeaturePackedFP32Ops : SubtargetFeature<"packed-fp32-ops",
+  "HasPackedFP32Ops",
+  "true",
+  "Support packed fp32 instructions"
+>;
+
 def FeatureR128A16 : SubtargetFeature<"r128-a16",
   "HasR128A16",
   "true",
@@ -411,6 +441,12 @@ def FeatureNSAEncoding : SubtargetFeature<"nsa-encoding",
   "Support NSA encoding for image instructions"
 >;
 
+def FeatureExtendedImageInsts : SubtargetFeature<"extended-image-insts",
+  "HasExtendedImageInsts",
+  "true",
+  "Support mips != 0, lod != 0, gather4, and get_lod"
+>;
+
 def FeatureGFX10_BEncoding : SubtargetFeature<"gfx10_b-encoding",
   "GFX10_BEncoding",
   "true",
@@ -659,6 +695,12 @@ def FeatureUnalignedAccessMode : SubtargetFeature<"unaligned-access-mode",
   " supports it"
 >;
 
+def FeaturePackedTID : SubtargetFeature<"packed-tid",
+  "HasPackedTID",
+  "true",
+  "Workitem IDs are packed into v0 at kernel launch"
+>;
+
 // Dummy feature used to disable assembler instructions.
 def FeatureDisable : SubtargetFeature<"",
   "FeatureDisable","true",
@@ -675,7 +717,8 @@ def FeatureSouthernIslands : GCNSubtargetFeatureGeneration<"SOUTHERN_ISLANDS",
   [FeatureFP64, FeatureLocalMemorySize32768, FeatureMIMG_R128,
   FeatureWavefrontSize64, FeatureSMemTimeInst, FeatureMadMacF32Insts,
   FeatureDsSrc2Insts, FeatureLDSBankCount32, FeatureMovrel,
-  FeatureTrigReducedRange]
+  FeatureTrigReducedRange, FeatureExtendedImageInsts
+  ]
 >;
 
 def FeatureSeaIslands : GCNSubtargetFeatureGeneration<"SEA_ISLANDS",
@@ -684,7 +727,8 @@ def FeatureSeaIslands : GCNSubtargetFeatureGeneration<"SEA_ISLANDS",
   FeatureWavefrontSize64, FeatureFlatAddressSpace,
   FeatureCIInsts, FeatureMovrel, FeatureTrigReducedRange,
   FeatureGFX7GFX8GFX9Insts, FeatureSMemTimeInst, FeatureMadMacF32Insts,
-  FeatureDsSrc2Insts, FeatureUnalignedBufferAccess]
+  FeatureDsSrc2Insts, FeatureExtendedImageInsts, FeatureUnalignedBufferAccess
+  ]
 >;
 
 def FeatureVolcanicIslands : GCNSubtargetFeatureGeneration<"VOLCANIC_ISLANDS",
@@ -697,7 +741,9 @@ def FeatureVolcanicIslands : GCNSubtargetFeatureGeneration<"VOLCANIC_ISLANDS",
    FeatureSDWA, FeatureSDWAOutModsVOPC, FeatureSDWAMac, FeatureDPP,
    FeatureIntClamp, FeatureTrigReducedRange, FeatureGFX8Insts,
    FeatureGFX7GFX8GFX9Insts, FeatureSMemTimeInst, FeatureMadMacF32Insts,
-   FeatureDsSrc2Insts, FeatureFastDenormalF32, FeatureUnalignedBufferAccess]
+   FeatureDsSrc2Insts, FeatureExtendedImageInsts, FeatureFastDenormalF32,
+   FeatureUnalignedBufferAccess
+  ]
 >;
 
 def FeatureGFX9 : GCNSubtargetFeatureGeneration<"GFX9",
@@ -712,9 +758,9 @@ def FeatureGFX9 : GCNSubtargetFeatureGeneration<"GFX9",
    FeatureFlatInstOffsets, FeatureFlatGlobalInsts, FeatureFlatScratchInsts,
    FeatureAddNoCarryInsts, FeatureGFX8Insts, FeatureGFX7GFX8GFX9Insts,
    FeatureScalarFlatScratchInsts, FeatureScalarAtomics, FeatureR128A16,
-   FeatureSMemTimeInst, FeatureMadMacF32Insts, FeatureDsSrc2Insts,
-   FeatureFastDenormalF32, FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess,
-   FeatureSupportsXNACK]
+   FeatureSMemTimeInst, FeatureFastDenormalF32, FeatureSupportsXNACK,
+   FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess
+  ]
 >;
 
 def FeatureGFX10 : GCNSubtargetFeatureGeneration<"GFX10",
@@ -729,7 +775,7 @@ def FeatureGFX10 : GCNSubtargetFeatureGeneration<"GFX10",
    FeatureFlatInstOffsets, FeatureFlatGlobalInsts, FeatureFlatScratchInsts,
    FeatureAddNoCarryInsts, FeatureFmaMixInsts, FeatureGFX8Insts,
    FeatureNoSdstCMPX, FeatureVscnt, FeatureRegisterBanking,
-   FeatureVOP3Literal, FeatureDPP8,
+   FeatureVOP3Literal, FeatureDPP8, FeatureExtendedImageInsts,
    FeatureNoDataDepHazard, FeaturePkFmacF16Inst,
    FeatureGFX10A16, FeatureFastDenormalF32, FeatureG16,
    FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess
@@ -816,17 +862,26 @@ def FeatureISAVersion9_0_0 : FeatureSet<
   [FeatureGFX9,
    FeatureMadMixInsts,
    FeatureLDSBankCount32,
+   FeatureDsSrc2Insts,
+   FeatureExtendedImageInsts,
+   FeatureMadMacF32Insts,
    FeatureImageGather4D16Bug]>;
 
 def FeatureISAVersion9_0_2 : FeatureSet<
   [FeatureGFX9,
    FeatureMadMixInsts,
    FeatureLDSBankCount32,
+   FeatureDsSrc2Insts,
+   FeatureExtendedImageInsts,
+   FeatureMadMacF32Insts,
    FeatureImageGather4D16Bug]>;
 
 def FeatureISAVersion9_0_4 : FeatureSet<
   [FeatureGFX9,
    FeatureLDSBankCount32,
+   FeatureDsSrc2Insts,
+   FeatureExtendedImageInsts,
+   FeatureMadMacF32Insts,
    FeatureFmaMixInsts,
    FeatureImageGather4D16Bug]>;
 
@@ -835,6 +890,9 @@ def FeatureISAVersion9_0_6 : FeatureSet<
    HalfRate64Ops,
    FeatureFmaMixInsts,
    FeatureLDSBankCount32,
+   FeatureDsSrc2Insts,
+   FeatureExtendedImageInsts,
+   FeatureMadMacF32Insts,
    FeatureDLInsts,
    FeatureDot1Insts,
    FeatureDot2Insts,
@@ -846,6 +904,9 @@ def FeatureISAVersion9_0_8 : FeatureSet<
    HalfRate64Ops,
    FeatureFmaMixInsts,
    FeatureLDSBankCount32,
+   FeatureDsSrc2Insts,
+   FeatureExtendedImageInsts,
+   FeatureMadMacF32Insts,
    FeatureDLInsts,
    FeatureDot1Insts,
    FeatureDot2Insts,
@@ -864,13 +925,40 @@ def FeatureISAVersion9_0_9 : FeatureSet<
   [FeatureGFX9,
    FeatureMadMixInsts,
    FeatureLDSBankCount32,
+   FeatureDsSrc2Insts,
+   FeatureExtendedImageInsts,
+   FeatureMadMacF32Insts,
    FeatureImageGather4D16Bug]>;
 
+def FeatureISAVersion9_0_A : FeatureSet<
+  [FeatureGFX9,
+   FeatureGFX90AInsts,
+   FeatureFmaMixInsts,
+   FeatureLDSBankCount32,
+   FeatureDLInsts,
+   FeatureDot1Insts,
+   FeatureDot2Insts,
+   FeatureDot3Insts,
+   FeatureDot4Insts,
+   FeatureDot5Insts,
+   FeatureDot6Insts,
+   Feature64BitDPP,
+   FeaturePackedFP32Ops,
+   FeatureMAIInsts,
+   FeaturePkFmacF16Inst,
+   FeatureAtomicFaddInsts,
+   FeatureMadMacF32Insts,
+   FeatureSupportsSRAMECC,
+   FeaturePackedTID,
+   FullRate64Ops]>;
+
 def FeatureISAVersion9_0_C : FeatureSet<
   [FeatureGFX9,
    FeatureMadMixInsts,
    FeatureLDSBankCount32,
    FeatureXNACK,
+   FeatureDsSrc2Insts,
+   FeatureExtendedImageInsts,
    FeatureImageGather4D16Bug]>;
 
 // TODO: Organize more features into groups.
@@ -1077,6 +1165,14 @@ def isGFX6GFX7GFX8GFX9 :
             "Subtarget->getGeneration() == AMDGPUSubtarget::GFX9">,
   AssemblerPredicate<(all_of (not FeatureGFX10Insts))>;
 
+def isGFX6GFX7GFX8GFX9NotGFX90A :
+  Predicate<"!Subtarget->hasGFX90AInsts() &&"
+            "(Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS ||"
+            " Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS ||"
+            " Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS ||"
+            " Subtarget->getGeneration() == AMDGPUSubtarget::GFX9)">,
+  AssemblerPredicate<(all_of (not FeatureGFX10Insts), (not FeatureGFX90AInsts))>;
+
 def isGFX7Plus :
   Predicate<"Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS">,
   AssemblerPredicate<(all_of FeatureCIInsts)>;
@@ -1097,6 +1193,28 @@ def isGFX9Only : Predicate <
   "Subtarget->getGeneration() == AMDGPUSubtarget::GFX9">,
   AssemblerPredicate<(all_of FeatureGCN3Encoding, FeatureGFX9Insts)>;
 
+def isGCN3ExcludingGFX90A :
+  Predicate<"Subtarget->isGCN3Encoding() && !Subtarget->hasGFX90AInsts()">,
+  AssemblerPredicate<(all_of FeatureGCN3Encoding, (not FeatureGFX90AInsts))>;
+
+def isGFX90APlus :
+  Predicate<"Subtarget->hasGFX90AInsts()">,
+  AssemblerPredicate<(all_of FeatureGFX90AInsts)>;
+
+def isNotGFX90APlus :
+  Predicate<"!Subtarget->hasGFX90AInsts()">,
+  AssemblerPredicate<(all_of (not FeatureGFX90AInsts))>;
+
+def isGFX8GFX9NotGFX90A :
+  Predicate<"!Subtarget->hasGFX90AInsts() &&"
+            "(Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS ||"
+            " Subtarget->getGeneration() == AMDGPUSubtarget::GFX9)">,
+  AssemblerPredicate<(all_of FeatureGFX8Insts, FeatureGCN3Encoding, (not FeatureGFX90AInsts))>;
+
+def isGFX90AOnly :
+  Predicate<"Subtarget->hasGFX90AInsts()">,
+  AssemblerPredicate<(all_of FeatureGFX90AInsts)>;
+
 def isGFX8GFX9 :
   Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS ||"
             "Subtarget->getGeneration() == AMDGPUSubtarget::GFX9">,
@@ -1177,6 +1295,15 @@ def HasDPP : Predicate<"Subtarget->hasDPP()">,
 def HasDPP8 : Predicate<"Subtarget->hasDPP8()">,
   AssemblerPredicate<(all_of (not FeatureGCN3Encoding), FeatureGFX10Insts, FeatureDPP8)>;
 
+def Has64BitDPP : Predicate<"Subtarget->has64BitDPP()">,
+  AssemblerPredicate<(all_of Feature64BitDPP)>;
+
+def HasPackedFP32Ops : Predicate<"Subtarget->hasPackedFP32Ops()">,
+  AssemblerPredicate<(all_of FeaturePackedFP32Ops)>;
+
+def HasExtendedImageInsts : Predicate<"Subtarget->hasExtendedImageInsts()">,
+  AssemblerPredicate<(all_of FeatureExtendedImageInsts)>;
+
 def HasR128A16 : Predicate<"Subtarget->hasR128A16()">,
   AssemblerPredicate<(all_of FeatureR128A16)>;
 

diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h
index 139ac3bab14c..e9ed45d8cd14 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h
@@ -143,7 +143,8 @@ struct AMDGPUFunctionArgInfo {
   // Input registers for non-HSA ABI
   ArgDescriptor ImplicitBufferPtr;
 
-  // VGPRs inputs. These are always v0, v1 and v2 for entry functions.
+  // VGPRs inputs. For entry functions these are either v0, v1 and v2 or packed
+  // into v0, 10 bits per dimension if packed-tid is set.
   ArgDescriptor WorkItemIDX;
   ArgDescriptor WorkItemIDY;
   ArgDescriptor WorkItemIDZ;

diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index c655e5ec87b7..ed5ed98033a5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -328,11 +328,11 @@ bool AMDGPUAsmPrinter::doFinalization(Module &M) {
   // causing stale data in caches. Arguably this should be done by the linker,
   // which is why this isn't done for Mesa.
   const MCSubtargetInfo &STI = *getGlobalSTI();
-  if (AMDGPU::isGFX10Plus(STI) &&
+  if ((AMDGPU::isGFX10Plus(STI) || AMDGPU::isGFX90A(STI)) &&
       (STI.getTargetTriple().getOS() == Triple::AMDHSA ||
        STI.getTargetTriple().getOS() == Triple::AMDPAL)) {
     OutStreamer->SwitchSection(getObjFileLowering().getTextSection());
-    getTargetStreamer()->EmitCodeEnd();
+    getTargetStreamer()->EmitCodeEnd(STI);
   }
 
   return AsmPrinter::doFinalization(M);
@@ -400,6 +400,7 @@ uint16_t AMDGPUAsmPrinter::getAmdhsaKernelCodeProperties(
 amdhsa::kernel_descriptor_t AMDGPUAsmPrinter::getAmdhsaKernelDescriptor(
     const MachineFunction &MF,
     const SIProgramInfo &PI) const {
+  const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
   amdhsa::kernel_descriptor_t KernelDescriptor;
   memset(&KernelDescriptor, 0x0, sizeof(KernelDescriptor));
 
@@ -413,6 +414,11 @@ amdhsa::kernel_descriptor_t AMDGPUAsmPrinter::getAmdhsaKernelDescriptor(
   KernelDescriptor.compute_pgm_rsrc2 = PI.ComputePGMRSrc2;
   KernelDescriptor.kernel_code_properties = getAmdhsaKernelCodeProperties(MF);
 
+  assert(STM.hasGFX90AInsts() || CurrentProgramInfo.ComputePGMRSrc3GFX90A == 0);
+  if (STM.hasGFX90AInsts())
+    KernelDescriptor.compute_pgm_rsrc3 =
+      CurrentProgramInfo.ComputePGMRSrc3GFX90A;
+
   return KernelDescriptor;
 }
 
@@ -521,6 +527,11 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
       " NumVGPRsForWavesPerEU: " +
       Twine(CurrentProgramInfo.NumVGPRsForWavesPerEU), false);
 
+    if (STM.hasGFX90AInsts())
+      OutStreamer->emitRawComment(
+        " AccumOffset: " +
+        Twine((CurrentProgramInfo.AccumOffset + 1) * 4), false);
+
     OutStreamer->emitRawComment(
       " Occupancy: " +
       Twine(CurrentProgramInfo.Occupancy), false);
@@ -550,6 +561,21 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
       " COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: " +
       Twine(G_00B84C_TIDIG_COMP_CNT(CurrentProgramInfo.ComputePGMRSrc2)),
       false);
+
+    assert(STM.hasGFX90AInsts() ||
+           CurrentProgramInfo.ComputePGMRSrc3GFX90A == 0);
+    if (STM.hasGFX90AInsts()) {
+      OutStreamer->emitRawComment(
+        " COMPUTE_PGM_RSRC3_GFX90A:ACCUM_OFFSET: " +
+        Twine((AMDHSA_BITS_GET(CurrentProgramInfo.ComputePGMRSrc3GFX90A,
+                               amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET))),
+                               false);
+      OutStreamer->emitRawComment(
+        " COMPUTE_PGM_RSRC3_GFX90A:TG_SPLIT: " +
+        Twine((AMDHSA_BITS_GET(CurrentProgramInfo.ComputePGMRSrc3GFX90A,
+                               amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT))),
+                               false);
+    }
   }
 
   if (DumpCodeInstEmitter) {
@@ -612,6 +638,8 @@ int32_t AMDGPUAsmPrinter::SIFunctionResourceInfo::getTotalNumSGPRs(
 
 int32_t AMDGPUAsmPrinter::SIFunctionResourceInfo::getTotalNumVGPRs(
   const GCNSubtarget &ST) const {
+  if (ST.hasGFX90AInsts() && NumAGPR)
+    return alignTo(NumVGPR, 4) + NumAGPR;
   return std::max(NumVGPR, NumAGPR);
 }
 
@@ -985,6 +1013,8 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
   ProgInfo.NumArchVGPR = Info.NumVGPR;
   ProgInfo.NumAccVGPR = Info.NumAGPR;
   ProgInfo.NumVGPR = Info.getTotalNumVGPRs(STM);
+  ProgInfo.AccumOffset = alignTo(std::max(1, Info.NumVGPR), 4) / 4 - 1;
+  ProgInfo.TgSplit = STM.isTgSplitEnabled();
   ProgInfo.NumSGPR = Info.NumExplicitSGPR;
   ProgInfo.ScratchSize = Info.PrivateSegmentSize;
   ProgInfo.VCCUsed = Info.UsesVCC;
@@ -1163,6 +1193,15 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
       S_00B84C_LDS_SIZE(STM.isAmdHsaOS() ? 0 : ProgInfo.LDSBlocks) |
       S_00B84C_EXCP_EN(0);
 
+  if (STM.hasGFX90AInsts()) {
+    AMDHSA_BITS_SET(ProgInfo.ComputePGMRSrc3GFX90A,
+                    amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET,
+                    ProgInfo.AccumOffset);
+    AMDHSA_BITS_SET(ProgInfo.ComputePGMRSrc3GFX90A,
+                    amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT,
+                    ProgInfo.TgSplit);
+  }
+
   ProgInfo.Occupancy = STM.computeOccupancy(MF.getFunction(), ProgInfo.LDSSize,
                                             ProgInfo.NumSGPRsForWavesPerEU,
                                             ProgInfo.NumVGPRsForWavesPerEU);

diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
index 02944154924e..f25019b167ca 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
@@ -163,6 +163,10 @@ def CSR_AMDGPU_VGPRs : CalleeSavedRegs<
     (sequence "VGPR%u", 248, 255))
 >;
 
+def CSR_AMDGPU_AGPRs_32_255 : CalleeSavedRegs<
+  (sequence "AGPR%u", 32, 255)
+>;
+
 def CSR_AMDGPU_SGPRs_32_105 : CalleeSavedRegs<
   (sequence "SGPR%u", 32, 105)
 >;
@@ -172,6 +176,13 @@ def CSR_AMDGPU_AllVGPRs : CalleeSavedRegs<
   (sequence "VGPR%u", 0, 255)
 >;
 
+def CSR_AMDGPU_AllAGPRs : CalleeSavedRegs<
+  (sequence "AGPR%u", 0, 255)
+>;
+def CSR_AMDGPU_AllVectorRegs : CalleeSavedRegs<
+  (add CSR_AMDGPU_AllVGPRs, CSR_AMDGPU_AllAGPRs)
+>;
+
 // Just to get the regmask, not for calling convention purposes.
 def CSR_AMDGPU_AllAllocatableSRegs : CalleeSavedRegs<
   (add (sequence "SGPR%u", 0, 105), VCC_LO, VCC_HI)
@@ -181,6 +192,10 @@ def CSR_AMDGPU_HighRegs : CalleeSavedRegs<
   (add CSR_AMDGPU_VGPRs, CSR_AMDGPU_SGPRs_32_105)
 >;
 
+def CSR_AMDGPU_HighRegs_With_AGPRs : CalleeSavedRegs<
+  (add CSR_AMDGPU_HighRegs, CSR_AMDGPU_AGPRs_32_255)
+>;
+
 def CSR_AMDGPU_NoRegs : CalleeSavedRegs<(add)>;
 
 // Calling convention for leaf functions

diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
index 820aa42c65b7..38f3bb900c84 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
@@ -219,6 +219,8 @@ def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_XOR, SIbuffer_atomic_xor>;
 def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_INC, SIbuffer_atomic_inc>;
 def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_DEC, SIbuffer_atomic_dec>;
 def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_FADD, SIbuffer_atomic_fadd>;
+def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_FMIN, SIbuffer_atomic_fmin>;
+def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_FMAX, SIbuffer_atomic_fmax>;
 def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_CMPSWAP, SIbuffer_atomic_cmpswap>;
 def : GINodeEquiv<G_AMDGPU_S_BUFFER_LOAD, SIsbuffer_load>;
 
@@ -338,5 +340,8 @@ def gi_extract_dlc : GICustomOperandRenderer<"renderExtractDLC">,
 def gi_extract_swz : GICustomOperandRenderer<"renderExtractSWZ">,
   GISDNodeXFormEquiv<extract_swz>;
 
+def gi_extract_sccb : GICustomOperandRenderer<"renderExtractSCCB">,
+  GISDNodeXFormEquiv<extract_sccb>;
+
 def gi_frameindex_to_targetframeindex : GICustomOperandRenderer<"renderFrameIndex">,
   GISDNodeXFormEquiv<frameindex_to_targetframeindex>;

diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 340f4ac6f57a..c03cde0a6d54 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -189,11 +189,12 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {
   bool SelectMUBUF(SDValue Addr, SDValue &SRsrc, SDValue &VAddr,
                    SDValue &SOffset, SDValue &Offset, SDValue &Offen,
                    SDValue &Idxen, SDValue &Addr64, SDValue &GLC, SDValue &SLC,
-                   SDValue &TFE, SDValue &DLC, SDValue &SWZ) const;
+                   SDValue &TFE, SDValue &DLC, SDValue &SWZ,
+                   SDValue &SCCB) const;
   bool SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, SDValue &VAddr,
                          SDValue &SOffset, SDValue &Offset, SDValue &GLC,
                          SDValue &SLC, SDValue &TFE, SDValue &DLC,
-                         SDValue &SWZ) const;
+                         SDValue &SWZ, SDValue &SCCB) const;
   bool SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
                          SDValue &VAddr, SDValue &SOffset, SDValue &Offset,
                          SDValue &SLC) const;
@@ -206,7 +207,8 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {
 
   bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &SOffset,
                          SDValue &Offset, SDValue &GLC, SDValue &SLC,
-                         SDValue &TFE, SDValue &DLC, SDValue &SWZ) const;
+                         SDValue &TFE, SDValue &DLC, SDValue &SWZ,
+                         SDValue &SCCB) const;
   bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &Soffset,
                          SDValue &Offset, SDValue &SLC) const;
   bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &Soffset,
@@ -322,6 +324,16 @@ static SDValue stripBitcast(SDValue Val) {
 // Figure out if this is really an extract of the high 16-bits of a dword.
 static bool isExtractHiElt(SDValue In, SDValue &Out) {
   In = stripBitcast(In);
+
+  if (In.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
+    if (ConstantSDNode *Idx = dyn_cast<ConstantSDNode>(In.getOperand(1))) {
+      if (!Idx->isOne())
+        return false;
+      Out = In.getOperand(0);
+      return true;
+    }
+  }
+
   if (In.getOpcode() != ISD::TRUNCATE)
     return false;
 
@@ -341,6 +353,13 @@ static bool isExtractHiElt(SDValue In, SDValue &Out) {
 // Look through operations that obscure just looking at the low 16-bits of the
 // same register.
 static SDValue stripExtractLoElt(SDValue In) {
+  if (In.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
+    if (ConstantSDNode *Idx = dyn_cast<ConstantSDNode>(In.getOperand(1))) {
+      if (Idx->isNullValue() && In.getValueSizeInBits() <= 32)
+        return In.getOperand(0);
+    }
+  }
+
   if (In.getOpcode() == ISD::TRUNCATE) {
     SDValue Src = In.getOperand(0);
     if (Src.getValueType().getSizeInBits() == 32)
@@ -1380,7 +1399,7 @@ bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr,
                                      SDValue &Idxen, SDValue &Addr64,
                                      SDValue &GLC, SDValue &SLC,
                                      SDValue &TFE, SDValue &DLC,
-                                     SDValue &SWZ) const {
+                                     SDValue &SWZ, SDValue &SCCB) const {
   // Subtarget prefers to use flat instruction
   // FIXME: This should be a pattern predicate and not reach here
   if (Subtarget->useFlatForGlobal())
@@ -1395,6 +1414,7 @@ bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr,
   TFE = CurDAG->getTargetConstant(0, DL, MVT::i1);
   DLC = CurDAG->getTargetConstant(0, DL, MVT::i1);
   SWZ = CurDAG->getTargetConstant(0, DL, MVT::i1);
+  SCCB = CurDAG->getTargetConstant(0, DL, MVT::i1);
 
   Idxen = CurDAG->getTargetConstant(0, DL, MVT::i1);
   Offen = CurDAG->getTargetConstant(0, DL, MVT::i1);
@@ -1474,7 +1494,8 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
                                            SDValue &VAddr, SDValue &SOffset,
                                            SDValue &Offset, SDValue &GLC,
                                            SDValue &SLC, SDValue &TFE,
-                                           SDValue &DLC, SDValue &SWZ) const {
+                                           SDValue &DLC, SDValue &SWZ,
+                                           SDValue &SCCB) const {
   SDValue Ptr, Offen, Idxen, Addr64;
 
   // addr64 bit was removed for volcanic islands.
@@ -1483,7 +1504,7 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
     return false;
 
   if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64,
-              GLC, SLC, TFE, DLC, SWZ))
+              GLC, SLC, TFE, DLC, SWZ, SCCB))
     return false;
 
   ConstantSDNode *C = cast<ConstantSDNode>(Addr64);
@@ -1505,9 +1526,9 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
                                            SDValue &Offset,
                                            SDValue &SLC) const {
   SLC = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i1);
-  SDValue GLC, TFE, DLC, SWZ;
+  SDValue GLC, TFE, DLC, SWZ, SCCB;
 
-  return SelectMUBUFAddr64(Addr, SRsrc, VAddr, SOffset, Offset, GLC, SLC, TFE, DLC, SWZ);
+  return SelectMUBUFAddr64(Addr, SRsrc, VAddr, SOffset, Offset, GLC, SLC, TFE, DLC, SWZ, SCCB);
 }
 
 static bool isStackPtrRelative(const MachinePointerInfo &PtrInfo) {
@@ -1631,13 +1652,13 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc,
                                            SDValue &SOffset, SDValue &Offset,
                                            SDValue &GLC, SDValue &SLC,
                                            SDValue &TFE, SDValue &DLC,
-                                           SDValue &SWZ) const {
+                                           SDValue &SWZ, SDValue &SCCB) const {
   SDValue Ptr, VAddr, Offen, Idxen, Addr64;
   const SIInstrInfo *TII =
     static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
 
   if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64,
-              GLC, SLC, TFE, DLC, SWZ))
+              GLC, SLC, TFE, DLC, SWZ, SCCB))
     return false;
 
   if (!cast<ConstantSDNode>(Offen)->getSExtValue() &&
@@ -1659,16 +1680,16 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc,
 bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc,
                                            SDValue &Soffset, SDValue &Offset
                                            ) const {
-  SDValue GLC, SLC, TFE, DLC, SWZ;
+  SDValue GLC, SLC, TFE, DLC, SWZ, SCCB;
 
-  return SelectMUBUFOffset(Addr, SRsrc, Soffset, Offset, GLC, SLC, TFE, DLC, SWZ);
+  return SelectMUBUFOffset(Addr, SRsrc, Soffset, Offset, GLC, SLC, TFE, DLC, SWZ, SCCB);
 }
 bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc,
                                            SDValue &Soffset, SDValue &Offset,
                                            SDValue &SLC) const {
-  SDValue GLC, TFE, DLC, SWZ;
+  SDValue GLC, TFE, DLC, SWZ, SCCB;
 
-  return SelectMUBUFOffset(Addr, SRsrc, Soffset, Offset, GLC, SLC, TFE, DLC, SWZ);
+  return SelectMUBUFOffset(Addr, SRsrc, Soffset, Offset, GLC, SLC, TFE, DLC, SWZ, SCCB);
 }
 
 // Find a load or store from corresponding pattern root.
@@ -2773,18 +2794,62 @@ bool AMDGPUDAGToDAGISel::SelectVOP3PMods(SDValue In, SDValue &Src,
     if (isExtractHiElt(Hi, Hi))
       Mods |= SISrcMods::OP_SEL_1;
 
+    unsigned VecSize = Src.getValueSizeInBits();
     Lo = stripExtractLoElt(Lo);
     Hi = stripExtractLoElt(Hi);
 
+    if (Lo.getValueSizeInBits() > VecSize) {
+      Lo = CurDAG->getTargetExtractSubreg(
+        (VecSize > 32) ? AMDGPU::sub0_sub1 : AMDGPU::sub0, SDLoc(In),
+        MVT::getIntegerVT(VecSize), Lo);
+    }
+
+    if (Hi.getValueSizeInBits() > VecSize) {
+      Hi = CurDAG->getTargetExtractSubreg(
+        (VecSize > 32) ? AMDGPU::sub0_sub1 : AMDGPU::sub0, SDLoc(In),
+        MVT::getIntegerVT(VecSize), Hi);
+    }
+
+    assert(Lo.getValueSizeInBits() <= VecSize &&
+           Hi.getValueSizeInBits() <= VecSize);
+
     if (Lo == Hi && !isInlineImmediate(Lo.getNode())) {
       // Really a scalar input. Just select from the low half of the register to
       // avoid packing.
 
-      Src = Lo;
+      if (VecSize == 32 || VecSize == Lo.getValueSizeInBits()) {
+        Src = Lo;
+      } else {
+        assert(Lo.getValueSizeInBits() == 32 && VecSize == 64);
+
+        SDLoc SL(In);
+        SDValue Undef = SDValue(
+          CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, SL,
+                                 Lo.getValueType()), 0);
+        auto RC = Lo->isDivergent() ? AMDGPU::VReg_64RegClassID
+                                    : AMDGPU::SReg_64RegClassID;
+        const SDValue Ops[] = {
+          CurDAG->getTargetConstant(RC, SL, MVT::i32),
+          Lo, CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32),
+          Undef, CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32) };
+
+        Src = SDValue(CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, SL,
+                                             Src.getValueType(), Ops), 0);
+      }
       SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
       return true;
     }
 
+    if (VecSize == 64 && Lo == Hi && isa<ConstantFPSDNode>(Lo)) {
+      uint64_t Lit = cast<ConstantFPSDNode>(Lo)->getValueAPF()
+                      .bitcastToAPInt().getZExtValue();
+      if (AMDGPU::isInlinableLiteral32(Lit, Subtarget->hasInv2PiInlineImm())) {
+        Src = CurDAG->getTargetConstant(Lit, SDLoc(In), MVT::i64);;
+        SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
+        return true;
+      }
+    }
+
     Mods = VecMods;
   }
 

diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index afdca239b0ff..c7abfd5d50d6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -4345,6 +4345,8 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
   NODE_NAME_CASE(BUFFER_ATOMIC_CMPSWAP)
   NODE_NAME_CASE(BUFFER_ATOMIC_CSUB)
   NODE_NAME_CASE(BUFFER_ATOMIC_FADD)
+  NODE_NAME_CASE(BUFFER_ATOMIC_FMIN)
+  NODE_NAME_CASE(BUFFER_ATOMIC_FMAX)
 
   case AMDGPUISD::LAST_AMDGPU_ISD_NUMBER: break;
   }

diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
index ce3618f83130..25b20999e0f8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -523,6 +523,8 @@ enum NodeType : unsigned {
   BUFFER_ATOMIC_CMPSWAP,
   BUFFER_ATOMIC_CSUB,
   BUFFER_ATOMIC_FADD,
+  BUFFER_ATOMIC_FMIN,
+  BUFFER_ATOMIC_FMAX,
 
   LAST_AMDGPU_ISD_NUMBER
 };

diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index de92ce0e4dde..1ccef774107b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -1456,7 +1456,7 @@ static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE,
 }
 
 static bool parseCachePolicy(uint64_t Value,
-                             bool *GLC, bool *SLC, bool *DLC) {
+                             bool *GLC, bool *SLC, bool *DLC, bool *SCC) {
   if (GLC) {
     *GLC = (Value & 0x1) ? 1 : 0;
     Value &= ~(uint64_t)0x1;
@@ -1469,6 +1469,10 @@ static bool parseCachePolicy(uint64_t Value,
     *DLC = (Value & 0x4) ? 1 : 0;
     Value &= ~(uint64_t)0x4;
   }
+  if (SCC) {
+    *SCC = (Value & 0x10) ? 1 : 0;
+    Value &= ~(uint64_t)0x10;
+  }
 
   return Value == 0;
 }
@@ -1601,16 +1605,17 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic(
   bool GLC = false;
   bool SLC = false;
   bool DLC = false;
+  bool SCC = false;
   if (BaseOpcode->Atomic) {
     GLC = true; // TODO no-return optimization
     if (!parseCachePolicy(
             MI.getOperand(ArgOffset + Intr->CachePolicyIndex).getImm(), nullptr,
-            &SLC, IsGFX10Plus ? &DLC : nullptr))
+            &SLC, IsGFX10Plus ? &DLC : nullptr, &SCC))
       return false;
   } else {
     if (!parseCachePolicy(
             MI.getOperand(ArgOffset + Intr->CachePolicyIndex).getImm(), &GLC,
-            &SLC, IsGFX10Plus ? &DLC : nullptr))
+            &SLC, IsGFX10Plus ? &DLC : nullptr, &SCC))
       return false;
   }
 
@@ -1700,6 +1705,8 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic(
   MIB.addImm(Unorm);
   if (IsGFX10Plus)
     MIB.addImm(DLC);
+  else
+    MIB.addImm(SCC);
 
   MIB.addImm(GLC);
   MIB.addImm(SLC);
@@ -2904,6 +2911,8 @@ bool AMDGPUInstructionSelector::selectG_SHUFFLE_VECTOR(
 
 bool AMDGPUInstructionSelector::selectAMDGPU_BUFFER_ATOMIC_FADD(
   MachineInstr &MI) const {
+  if (STI.hasGFX90AInsts())
+    return selectImpl(MI, *CoverageInfo);
 
   MachineBasicBlock *MBB = MI.getParent();
   const DebugLoc &DL = MI.getDebugLoc();
@@ -2988,6 +2997,9 @@ bool AMDGPUInstructionSelector::selectAMDGPU_BUFFER_ATOMIC_FADD(
 bool AMDGPUInstructionSelector::selectGlobalAtomicFaddIntrinsic(
   MachineInstr &MI) const{
 
+  if (STI.hasGFX90AInsts())
+    return selectImpl(MI, *CoverageInfo);
+
   MachineBasicBlock *MBB = MI.getParent();
   const DebugLoc &DL = MI.getDebugLoc();
 
@@ -3013,6 +3025,7 @@ bool AMDGPUInstructionSelector::selectGlobalAtomicFaddIntrinsic(
     .addReg(Data)
     .addImm(Addr.second)
     .addImm(0) // SLC
+    .addImm(0) // SSCB
     .cloneMemRefs(MI);
 
   MI.eraseFromParent();
@@ -4143,7 +4156,8 @@ AMDGPUInstructionSelector::selectMUBUFAddr64(MachineOperand &Root) const {
       addZeroImm, //  slc
       addZeroImm, //  tfe
       addZeroImm, //  dlc
-      addZeroImm  //  swz
+      addZeroImm, //  swz
+      addZeroImm  //  scc
     }};
 }
 
@@ -4171,7 +4185,8 @@ AMDGPUInstructionSelector::selectMUBUFOffset(MachineOperand &Root) const {
       addZeroImm, //  slc
       addZeroImm, //  tfe
       addZeroImm, //  dlc
-      addZeroImm  //  swz
+      addZeroImm, //  swz
+      addZeroImm  //  scc
     }};
 }
 
@@ -4345,6 +4360,13 @@ void AMDGPUInstructionSelector::renderExtractSWZ(MachineInstrBuilder &MIB,
   MIB.addImm((MI.getOperand(OpIdx).getImm() >> 3) & 1);
 }
 
+void AMDGPUInstructionSelector::renderExtractSCCB(MachineInstrBuilder &MIB,
+                                                  const MachineInstr &MI,
+                                                  int OpIdx) const {
+  assert(OpIdx >= 0 && "expected to match an immediate operand");
+  MIB.addImm((MI.getOperand(OpIdx).getImm() >> 4) & 1);
+}
+
 void AMDGPUInstructionSelector::renderFrameIndex(MachineInstrBuilder &MIB,
                                                  const MachineInstr &MI,
                                                  int OpIdx) const {

diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
index ea561deb5f61..b604cadef8fe 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -291,6 +291,9 @@ class AMDGPUInstructionSelector final : public InstructionSelector {
                         int OpIdx) const;
   void renderExtractSWZ(MachineInstrBuilder &MIB, const MachineInstr &MI,
                         int OpIdx) const;
+  void renderExtractSCCB(MachineInstrBuilder &MIB, const MachineInstr &MI,
+                         int OpIdx) const;
+
   void renderFrameIndex(MachineInstrBuilder &MIB, const MachineInstr &MI,
                         int OpIdx) const;
 

diff  --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 322ad1798009..931ce9908823 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -1287,8 +1287,10 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
   }
 
   if (ST.hasLDSFPAtomics()) {
-    getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
+    auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
       .legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
+    if (ST.hasGFX90AInsts())
+      Atomic.legalFor({{S64, LocalPtr}});
   }
 
   // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
@@ -3903,9 +3905,16 @@ static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
+  case Intrinsic::amdgcn_buffer_atomic_fadd:
   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
+  case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
+  case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
+    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
+  case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
+  case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
+    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
   default:
     llvm_unreachable("unhandled atomic opcode");
   }
@@ -4815,6 +4824,11 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
+  case Intrinsic::amdgcn_buffer_atomic_fadd:
+  case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
+  case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
+  case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
+  case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
     return legalizeBufferAtomic(MI, B, IntrID);
   case Intrinsic::amdgcn_atomic_inc:
     return legalizeAtomicIncDec(MI, B, true);

diff  --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index bb3d0bf8304a..59279b6819fa 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -1714,6 +1714,15 @@ static unsigned extractDLC(unsigned CachePolicy) {
   return (CachePolicy >> 2) & 1;
 }
 
+static unsigned extractSWZ(unsigned CachePolicy) {
+  return (CachePolicy >> 3) & 1;
+}
+
+static unsigned extractSCCB(unsigned CachePolicy) {
+  return (CachePolicy >> 4) & 1;
+}
+
+
 MachineInstr *
 AMDGPURegisterBankInfo::selectStoreIntrinsic(MachineIRBuilder &B,
                                              MachineInstr &MI) const {
@@ -1782,6 +1791,8 @@ AMDGPURegisterBankInfo::selectStoreIntrinsic(MachineIRBuilder &B,
      .addImm(extractSLC(CachePolicy))
      .addImm(0) // tfe: FIXME: Remove from inst
      .addImm(extractDLC(CachePolicy))
+     .addImm(extractSWZ(CachePolicy))
+     .addImm(extractSCCB(CachePolicy))
      .cloneMemRefs(MI);
 
   // FIXME: We need a way to report failure from applyMappingImpl.
@@ -2839,7 +2850,9 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
     executeInWaterfallLoop(MI, MRI, {2, 5});
     return;
   }
-  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD: {
+  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD:
+  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN:
+  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: {
     applyDefaultMapping(OpdMapper);
     executeInWaterfallLoop(MI, MRI, {2, 5});
     return;
@@ -3807,7 +3820,9 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR:
   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC:
   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC:
-  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD: {
+  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD:
+  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN:
+  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: {
     // vdata_out
     OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
 
@@ -4064,7 +4079,14 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
     case Intrinsic::amdgcn_mfma_i32_32x32x4i8:
     case Intrinsic::amdgcn_mfma_i32_32x32x8i8:
     case Intrinsic::amdgcn_mfma_f32_32x32x2bf16:
-    case Intrinsic::amdgcn_mfma_f32_32x32x4bf16: {
+    case Intrinsic::amdgcn_mfma_f32_32x32x4bf16:
+    case Intrinsic::amdgcn_mfma_f32_32x32x4bf16_1k:
+    case Intrinsic::amdgcn_mfma_f32_16x16x4bf16_1k:
+    case Intrinsic::amdgcn_mfma_f32_4x4x4bf16_1k:
+    case Intrinsic::amdgcn_mfma_f32_32x32x8bf16_1k:
+    case Intrinsic::amdgcn_mfma_f32_16x16x16bf16_1k:
+    case Intrinsic::amdgcn_mfma_f64_16x16x4f64:
+    case Intrinsic::amdgcn_mfma_f64_4x4x4f64: {
       // Default for MAI intrinsics.
       // srcC can also be an immediate which can be folded later.
       // FIXME: Should we eventually add an alternative mapping with AGPR src
@@ -4138,6 +4160,11 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
     }
     case Intrinsic::amdgcn_global_atomic_fadd:
     case Intrinsic::amdgcn_global_atomic_csub:
+    case Intrinsic::amdgcn_global_atomic_fmin:
+    case Intrinsic::amdgcn_global_atomic_fmax:
+    case Intrinsic::amdgcn_flat_atomic_fadd:
+    case Intrinsic::amdgcn_flat_atomic_fmin:
+    case Intrinsic::amdgcn_flat_atomic_fmax:
       return getDefaultMappingAllVGPR(MI);
     case Intrinsic::amdgcn_ds_ordered_add:
     case Intrinsic::amdgcn_ds_ordered_swap: {

diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
index e63294b55ea8..afe016731395 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
@@ -199,6 +199,12 @@ def : SourceOfDivergence<int_r600_read_tidig_z>;
 def : SourceOfDivergence<int_amdgcn_atomic_inc>;
 def : SourceOfDivergence<int_amdgcn_atomic_dec>;
 def : SourceOfDivergence<int_amdgcn_global_atomic_csub>;
+def : SourceOfDivergence<int_amdgcn_global_atomic_fadd>;
+def : SourceOfDivergence<int_amdgcn_global_atomic_fmin>;
+def : SourceOfDivergence<int_amdgcn_global_atomic_fmax>;
+def : SourceOfDivergence<int_amdgcn_flat_atomic_fadd>;
+def : SourceOfDivergence<int_amdgcn_flat_atomic_fmin>;
+def : SourceOfDivergence<int_amdgcn_flat_atomic_fmax>;
 def : SourceOfDivergence<int_amdgcn_ds_fadd>;
 def : SourceOfDivergence<int_amdgcn_ds_fmin>;
 def : SourceOfDivergence<int_amdgcn_ds_fmax>;
@@ -226,6 +232,8 @@ def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_xor>;
 def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_inc>;
 def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_dec>;
 def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_fadd>;
+def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_fmin>;
+def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_fmax>;
 def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_cmpswap>;
 def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_swap>;
 def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_add>;
@@ -240,6 +248,8 @@ def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_xor>;
 def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_inc>;
 def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_dec>;
 def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_fadd>;
+def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_fmin>;
+def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_fmax>;
 def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_cmpswap>;
 def : SourceOfDivergence<int_amdgcn_buffer_atomic_csub>;
 def : SourceOfDivergence<int_amdgcn_ps_live>;
@@ -275,6 +285,13 @@ def : SourceOfDivergence<int_amdgcn_mfma_i32_32x32x4i8>;
 def : SourceOfDivergence<int_amdgcn_mfma_i32_32x32x8i8>;
 def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x2bf16>;
 def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x4bf16>;
+def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x4bf16_1k>;
+def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x4bf16_1k>;
+def : SourceOfDivergence<int_amdgcn_mfma_f32_4x4x4bf16_1k>;
+def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x8bf16_1k>;
+def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x16bf16_1k>;
+def : SourceOfDivergence<int_amdgcn_mfma_f64_16x16x4f64>;
+def : SourceOfDivergence<int_amdgcn_mfma_f64_4x4x4f64>;
 
 // The dummy boolean output is divergent from the IR's perspective,
 // but the mask results are uniform. These produce a divergent and

diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index f1a7d7463676..ec34237b201c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -176,6 +176,7 @@ GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
 
 AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
   TargetTriple(TT),
+  GCN3Encoding(false),
   Has16BitInsts(false),
   HasMadMixInsts(false),
   HasMadMacF32Insts(false),
@@ -207,6 +208,7 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
     FastFMAF32(false),
     FastDenormalF32(false),
     HalfRate64Ops(false),
+    FullRate64Ops(false),
 
     FlatForGlobal(false),
     AutoWaitcntBeforeBarrier(false),
@@ -216,6 +218,7 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
     HasApertureRegs(false),
     SupportsXNACK(false),
     EnableXNACK(false),
+    EnableTgSplit(false),
     EnableCuMode(false),
     TrapHandler(false),
 
@@ -227,10 +230,10 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
     DumpCode(false),
 
     FP64(false),
-    GCN3Encoding(false),
     CIInsts(false),
     GFX8Insts(false),
     GFX9Insts(false),
+    GFX90AInsts(false),
     GFX10Insts(false),
     GFX10_3Insts(false),
     GFX7GFX8GFX9Insts(false),
@@ -249,6 +252,9 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
     HasSDWAOutModsVOPC(false),
     HasDPP(false),
     HasDPP8(false),
+    Has64BitDPP(false),
+    HasPackedFP32Ops(false),
+    HasExtendedImageInsts(false),
     HasR128A16(false),
     HasGFX10A16(false),
     HasG16(false),
@@ -284,6 +290,7 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
     HasMFMAInlineLiteralBug(false),
     UnalignedBufferAccess(false),
     UnalignedDSAccess(false),
+    HasPackedTID(false),
 
     ScalarizeGlobal(false),
 
@@ -776,6 +783,9 @@ unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
     unsigned Requested = AMDGPU::getIntegerAttribute(
       F, "amdgpu-num-vgpr", MaxNumVGPRs);
 
+    if (hasGFX90AInsts())
+      Requested *= 2;
+
     // Make sure requested value is compatible with values implied by
     // default/requested minimum/maximum number of waves per execution unit.
     if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))

diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
index ba3a8acae551..08576356255f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -45,6 +45,7 @@ class AMDGPUSubtarget {
   Triple TargetTriple;
 
 protected:
+  bool GCN3Encoding;
   bool Has16BitInsts;
   bool HasMadMixInsts;
   bool HasMadMacF32Insts;
@@ -124,6 +125,10 @@ class AMDGPUSubtarget {
     return TargetTriple.getArch() == Triple::amdgcn;
   }
 
+  bool isGCN3Encoding() const {
+    return GCN3Encoding;
+  }
+
   bool has16BitInsts() const {
     return Has16BitInsts;
   }

diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 7b8a79640bb2..360a2c71160b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -311,7 +311,7 @@ unsigned GCNTTIImpl::getNumberOfRegisters(unsigned RCID) const {
 }
 
 unsigned GCNTTIImpl::getRegisterBitWidth(bool Vector) const {
-  return 32;
+  return (Vector && ST->hasPackedFP32Ops()) ? 64 : 32;
 }
 
 unsigned GCNTTIImpl::getMinVectorRegisterBitWidth() const {
@@ -321,7 +321,9 @@ unsigned GCNTTIImpl::getMinVectorRegisterBitWidth() const {
 unsigned GCNTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
   if (Opcode == Instruction::Load || Opcode == Instruction::Store)
     return 32 * 4 / ElemWidth;
-  return (ElemWidth == 16 && ST->has16BitInsts()) ? 2 : 1;
+  return (ElemWidth == 16 && ST->has16BitInsts()) ? 2
+       : (ElemWidth == 32 && ST->hasPackedFP32Ops()) ? 2
+       : 1;
 }
 
 unsigned GCNTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize,
@@ -628,6 +630,8 @@ int GCNTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
     LLVM_FALLTHROUGH;
   case ISD::FADD:
   case ISD::FSUB:
+    if (ST->hasPackedFP32Ops() && SLT == MVT::f32)
+      NElts = (NElts + 1) / 2;
     if (SLT == MVT::f64)
       return LT.first * NElts * get64BitInstrCost(CostKind);
 
@@ -779,7 +783,8 @@ int GCNTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
   if (SLT == MVT::f64)
     return LT.first * NElts * get64BitInstrCost(CostKind);
 
-  if (ST->has16BitInsts() && SLT == MVT::f16)
+  if ((ST->has16BitInsts() && SLT == MVT::f16) ||
+      (ST->hasPackedFP32Ops() && SLT == MVT::f32))
     NElts = (NElts + 1) / 2;
 
   // TODO: Get more refined intrinsic costs?
@@ -1192,8 +1197,10 @@ void GCNTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
 }
 
 int GCNTTIImpl::get64BitInstrCost(TTI::TargetCostKind CostKind) const {
-  return ST->hasHalfRate64Ops() ? getHalfRateInstrCost(CostKind)
-                                : getQuarterRateInstrCost(CostKind);
+  return ST->hasFullRate64Ops()
+             ? getFullRateInstrCost()
+             : ST->hasHalfRate64Ops() ? getHalfRateInstrCost(CostKind)
+                                      : getQuarterRateInstrCost(CostKind);
 }
 
 R600TTIImpl::R600TTIImpl(const AMDGPUTargetMachine *TM, const Function &F)

diff  --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index dda528ab4b3f..a0616922d317 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -11,6 +11,7 @@
 #include "MCTargetDesc/AMDGPUTargetStreamer.h"
 #include "SIDefines.h"
 #include "SIInstrInfo.h"
+#include "SIRegisterInfo.h"
 #include "TargetInfo/AMDGPUTargetInfo.h"
 #include "Utils/AMDGPUAsmUtils.h"
 #include "Utils/AMDGPUBaseInfo.h"
@@ -114,6 +115,7 @@ class AMDGPUOperand : public MCParsedAsmOperand {
     ImmTyOffset0,
     ImmTyOffset1,
     ImmTyDLC,
+    ImmTySCCB,
     ImmTyGLC,
     ImmTySLC,
     ImmTySWZ,
@@ -299,6 +301,8 @@ class AMDGPUOperand : public MCParsedAsmOperand {
     return isRegKind() && getReg() == AMDGPU::SGPR_NULL;
   }
 
+  bool isVRegWithInputMods() const;
+
   bool isSDWAOperand(MVT type) const;
   bool isSDWAFP16Operand() const;
   bool isSDWAFP32Operand() const;
@@ -337,6 +341,7 @@ class AMDGPUOperand : public MCParsedAsmOperand {
   bool isGDS() const { return isImmTy(ImmTyGDS); }
   bool isLDS() const { return isImmTy(ImmTyLDS); }
   bool isDLC() const { return isImmTy(ImmTyDLC); }
+  bool isSCCB() const { return isImmTy(ImmTySCCB); }
   bool isGLC() const { return isImmTy(ImmTyGLC); }
   // "GLC_1" is a MatchClass of the GLC_1 operand with the default and forced
   // value of the GLC operand.
@@ -449,6 +454,26 @@ class AMDGPUOperand : public MCParsedAsmOperand {
     return isSSrcF16();
   }
 
+  bool isSSrcV2FP32() const {
+    llvm_unreachable("cannot happen");
+    return isSSrcF32();
+  }
+
+  bool isSCSrcV2FP32() const {
+    llvm_unreachable("cannot happen");
+    return isSCSrcF32();
+  }
+
+  bool isSSrcV2INT32() const {
+    llvm_unreachable("cannot happen");
+    return isSSrcB32();
+  }
+
+  bool isSCSrcV2INT32() const {
+    llvm_unreachable("cannot happen");
+    return isSCSrcB32();
+  }
+
   bool isSSrcOrLdsB32() const {
     return isRegOrInlineNoMods(AMDGPU::SRegOrLds_32RegClassID, MVT::i32) ||
            isLiteralImm(MVT::i32) || isExpr();
@@ -502,6 +527,22 @@ class AMDGPUOperand : public MCParsedAsmOperand {
     return isVSrcB16() || isLiteralImm(MVT::v2i16);
   }
 
+  bool isVCSrcV2FP32() const {
+    return isVCSrcF64();
+  }
+
+  bool isVSrcV2FP32() const {
+    return isVSrcF64() || isLiteralImm(MVT::v2f32);
+  }
+
+  bool isVCSrcV2INT32() const {
+    return isVCSrcB64();
+  }
+
+  bool isVSrcV2INT32() const {
+    return isVSrcB64() || isLiteralImm(MVT::v2i32);
+  }
+
   bool isVSrcF32() const {
     return isVCSrcF32() || isLiteralImm(MVT::f32) || isExpr();
   }
@@ -542,6 +583,102 @@ class AMDGPUOperand : public MCParsedAsmOperand {
     return isVISrcF16() || isVISrcB32();
   }
 
+  bool isVISrc_64B64() const {
+    return isRegOrInlineNoMods(AMDGPU::VReg_64RegClassID, MVT::i64);
+  }
+
+  bool isVISrc_64F64() const {
+    return isRegOrInlineNoMods(AMDGPU::VReg_64RegClassID, MVT::f64);
+  }
+
+  bool isVISrc_64V2FP32() const {
+    return isRegOrInlineNoMods(AMDGPU::VReg_64RegClassID, MVT::f32);
+  }
+
+  bool isVISrc_64V2INT32() const {
+    return isRegOrInlineNoMods(AMDGPU::VReg_64RegClassID, MVT::i32);
+  }
+
+  bool isVISrc_256B64() const {
+    return isRegOrInlineNoMods(AMDGPU::VReg_256RegClassID, MVT::i64);
+  }
+
+  bool isVISrc_256F64() const {
+    return isRegOrInlineNoMods(AMDGPU::VReg_256RegClassID, MVT::f64);
+  }
+
+  bool isVISrc_128B16() const {
+    return isRegOrInlineNoMods(AMDGPU::VReg_128RegClassID, MVT::i16);
+  }
+
+  bool isVISrc_128V2B16() const {
+    return isVISrc_128B16();
+  }
+
+  bool isVISrc_128B32() const {
+    return isRegOrInlineNoMods(AMDGPU::VReg_128RegClassID, MVT::i32);
+  }
+
+  bool isVISrc_128F32() const {
+    return isRegOrInlineNoMods(AMDGPU::VReg_128RegClassID, MVT::f32);
+  }
+
+  bool isVISrc_256V2FP32() const {
+    return isRegOrInlineNoMods(AMDGPU::VReg_256RegClassID, MVT::f32);
+  }
+
+  bool isVISrc_256V2INT32() const {
+    return isRegOrInlineNoMods(AMDGPU::VReg_256RegClassID, MVT::i32);
+  }
+
+  bool isVISrc_512B32() const {
+    return isRegOrInlineNoMods(AMDGPU::VReg_512RegClassID, MVT::i32);
+  }
+
+  bool isVISrc_512B16() const {
+    return isRegOrInlineNoMods(AMDGPU::VReg_512RegClassID, MVT::i16);
+  }
+
+  bool isVISrc_512V2B16() const {
+    return isVISrc_512B16();
+  }
+
+  bool isVISrc_512F32() const {
+    return isRegOrInlineNoMods(AMDGPU::VReg_512RegClassID, MVT::f32);
+  }
+
+  bool isVISrc_512F16() const {
+    return isRegOrInlineNoMods(AMDGPU::VReg_512RegClassID, MVT::f16);
+  }
+
+  bool isVISrc_512V2F16() const {
+    return isVISrc_512F16() || isVISrc_512B32();
+  }
+
+  bool isVISrc_1024B32() const {
+    return isRegOrInlineNoMods(AMDGPU::VReg_1024RegClassID, MVT::i32);
+  }
+
+  bool isVISrc_1024B16() const {
+    return isRegOrInlineNoMods(AMDGPU::VReg_1024RegClassID, MVT::i16);
+  }
+
+  bool isVISrc_1024V2B16() const {
+    return isVISrc_1024B16();
+  }
+
+  bool isVISrc_1024F32() const {
+    return isRegOrInlineNoMods(AMDGPU::VReg_1024RegClassID, MVT::f32);
+  }
+
+  bool isVISrc_1024F16() const {
+    return isRegOrInlineNoMods(AMDGPU::VReg_1024RegClassID, MVT::f16);
+  }
+
+  bool isVISrc_1024V2F16() const {
+    return isVISrc_1024F16() || isVISrc_1024B32();
+  }
+
   bool isAISrcB32() const {
     return isRegOrInlineNoMods(AMDGPU::AGPR_32RegClassID, MVT::i32);
   }
@@ -566,6 +703,14 @@ class AMDGPUOperand : public MCParsedAsmOperand {
     return isAISrcF16() || isAISrcB32();
   }
 
+  bool isAISrc_64B64() const {
+    return isRegOrInlineNoMods(AMDGPU::AReg_64RegClassID, MVT::i64);
+  }
+
+  bool isAISrc_64F64() const {
+    return isRegOrInlineNoMods(AMDGPU::AReg_64RegClassID, MVT::f64);
+  }
+
   bool isAISrc_128B32() const {
     return isRegOrInlineNoMods(AMDGPU::AReg_128RegClassID, MVT::i32);
   }
@@ -590,6 +735,22 @@ class AMDGPUOperand : public MCParsedAsmOperand {
     return isAISrc_128F16() || isAISrc_128B32();
   }
 
+  bool isVISrc_128F16() const {
+    return isRegOrInlineNoMods(AMDGPU::VReg_128RegClassID, MVT::f16);
+  }
+
+  bool isVISrc_128V2F16() const {
+    return isVISrc_128F16() || isVISrc_128B32();
+  }
+
+  bool isAISrc_256B64() const {
+    return isRegOrInlineNoMods(AMDGPU::AReg_256RegClassID, MVT::i64);
+  }
+
+  bool isAISrc_256F64() const {
+    return isRegOrInlineNoMods(AMDGPU::AReg_256RegClassID, MVT::f64);
+  }
+
   bool isAISrc_512B32() const {
     return isRegOrInlineNoMods(AMDGPU::AReg_512RegClassID, MVT::i32);
   }
@@ -838,6 +999,7 @@ class AMDGPUOperand : public MCParsedAsmOperand {
     case ImmTyOffset0: OS << "Offset0"; break;
     case ImmTyOffset1: OS << "Offset1"; break;
     case ImmTyDLC: OS << "DLC"; break;
+    case ImmTySCCB: OS << "SCCB"; break;
     case ImmTyGLC: OS << "GLC"; break;
     case ImmTySLC: OS << "SLC"; break;
     case ImmTySWZ: OS << "SWZ"; break;
@@ -1197,6 +1359,10 @@ class AMDGPUAsmParser : public MCTargetAsmParser {
     return AMDGPU::isGFX9(getSTI());
   }
 
+  bool isGFX90A() const {
+    return AMDGPU::isGFX90A(getSTI());
+  }
+
   bool isGFX9Plus() const {
     return AMDGPU::isGFX9Plus(getSTI());
   }
@@ -1384,6 +1550,8 @@ class AMDGPUAsmParser : public MCTargetAsmParser {
   bool validateVccOperand(unsigned Reg) const;
   bool validateVOP3Literal(const MCInst &Inst, const OperandVector &Operands);
   bool validateMAIAccWrite(const MCInst &Inst, const OperandVector &Operands);
+  bool validateAGPRLdSt(const MCInst &Inst) const;
+  bool validateVGPRAlign(const MCInst &Inst) const;
   bool validateDivScale(const MCInst &Inst);
   bool validateCoherencyBits(const MCInst &Inst, const OperandVector &Operands,
                              const SMLoc &IDLoc);
@@ -1459,6 +1627,7 @@ class AMDGPUAsmParser : public MCTargetAsmParser {
   void cvtMtbuf(MCInst &Inst, const OperandVector &Operands);
 
   AMDGPUOperand::Ptr defaultDLC() const;
+  AMDGPUOperand::Ptr defaultSCCB() const;
   AMDGPUOperand::Ptr defaultGLC() const;
   AMDGPUOperand::Ptr defaultGLC_1() const;
   AMDGPUOperand::Ptr defaultSLC() const;
@@ -1553,11 +1722,16 @@ static const fltSemantics *getOpFltSemantics(uint8_t OperandType) {
   case AMDGPU::OPERAND_REG_INLINE_C_FP32:
   case AMDGPU::OPERAND_REG_INLINE_AC_INT32:
   case AMDGPU::OPERAND_REG_INLINE_AC_FP32:
+  case AMDGPU::OPERAND_REG_INLINE_C_V2FP32:
+  case AMDGPU::OPERAND_REG_IMM_V2FP32:
+  case AMDGPU::OPERAND_REG_INLINE_C_V2INT32:
+  case AMDGPU::OPERAND_REG_IMM_V2INT32:
     return &APFloat::IEEEsingle();
   case AMDGPU::OPERAND_REG_IMM_INT64:
   case AMDGPU::OPERAND_REG_IMM_FP64:
   case AMDGPU::OPERAND_REG_INLINE_C_INT64:
   case AMDGPU::OPERAND_REG_INLINE_C_FP64:
+  case AMDGPU::OPERAND_REG_INLINE_AC_FP64:
     return &APFloat::IEEEdouble();
   case AMDGPU::OPERAND_REG_IMM_INT16:
   case AMDGPU::OPERAND_REG_IMM_FP16:
@@ -1717,7 +1891,8 @@ bool AMDGPUOperand::isLiteralImm(MVT type) const {
   // literal goes into the lower half and the upper half is zero. We also
   // require that the literal may be losslesly converted to f16.
   MVT ExpectedType = (type == MVT::v2f16)? MVT::f16 :
-                     (type == MVT::v2i16)? MVT::i16 : type;
+                     (type == MVT::v2i16)? MVT::i16 :
+                     (type == MVT::v2f32)? MVT::f32 : type;
 
   APFloat FPLiteral(APFloat::IEEEdouble(), APInt(64, Imm.Val));
   return canLosslesslyConvertToFPType(FPLiteral, ExpectedType);
@@ -1727,6 +1902,13 @@ bool AMDGPUOperand::isRegClass(unsigned RCID) const {
   return isRegKind() && AsmParser->getMRI()->getRegClass(RCID).contains(getReg());
 }
 
+bool AMDGPUOperand::isVRegWithInputMods() const {
+  return isRegClass(AMDGPU::VGPR_32RegClassID) ||
+         // GFX90A allows DPP on 64-bit operands.
+         (isRegClass(AMDGPU::VReg_64RegClassID) &&
+          AsmParser->getFeatureBits()[AMDGPU::Feature64BitDPP]);
+}
+
 bool AMDGPUOperand::isSDWAOperand(MVT type) const {
   if (AsmParser->isVI())
     return isVReg32();
@@ -1808,6 +1990,7 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo
     case AMDGPU::OPERAND_REG_IMM_FP64:
     case AMDGPU::OPERAND_REG_INLINE_C_INT64:
     case AMDGPU::OPERAND_REG_INLINE_C_FP64:
+    case AMDGPU::OPERAND_REG_INLINE_AC_FP64:
       if (AMDGPU::isInlinableLiteral64(Literal.getZExtValue(),
                                        AsmParser->hasInv2PiInlineImm())) {
         Inst.addOperand(MCOperand::createImm(Literal.getZExtValue()));
@@ -1851,7 +2034,11 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo
     case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16:
     case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16:
     case AMDGPU::OPERAND_REG_IMM_V2INT16:
-    case AMDGPU::OPERAND_REG_IMM_V2FP16: {
+    case AMDGPU::OPERAND_REG_IMM_V2FP16:
+    case AMDGPU::OPERAND_REG_INLINE_C_V2FP32:
+    case AMDGPU::OPERAND_REG_IMM_V2FP32:
+    case AMDGPU::OPERAND_REG_INLINE_C_V2INT32:
+    case AMDGPU::OPERAND_REG_IMM_V2INT32: {
       bool lost;
       APFloat FPLiteral(APFloat::IEEEdouble(), Literal);
       // Convert literal to single precision
@@ -1883,6 +2070,10 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo
   case AMDGPU::OPERAND_REG_INLINE_AC_FP32:
   case AMDGPU::OPERAND_REG_IMM_V2INT16:
   case AMDGPU::OPERAND_REG_IMM_V2FP16:
+  case AMDGPU::OPERAND_REG_IMM_V2FP32:
+  case AMDGPU::OPERAND_REG_INLINE_C_V2FP32:
+  case AMDGPU::OPERAND_REG_IMM_V2INT32:
+  case AMDGPU::OPERAND_REG_INLINE_C_V2INT32:
     if (isSafeTruncation(Val, 32) &&
         AMDGPU::isInlinableLiteral32(static_cast<int32_t>(Val),
                                      AsmParser->hasInv2PiInlineImm())) {
@@ -1899,6 +2090,7 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo
   case AMDGPU::OPERAND_REG_IMM_FP64:
   case AMDGPU::OPERAND_REG_INLINE_C_INT64:
   case AMDGPU::OPERAND_REG_INLINE_C_FP64:
+  case AMDGPU::OPERAND_REG_INLINE_AC_FP64:
     if (AMDGPU::isInlinableLiteral64(Val, AsmParser->hasInv2PiInlineImm())) {
       Inst.addOperand(MCOperand::createImm(Val));
       setImmKindConst();
@@ -3202,7 +3394,7 @@ bool AMDGPUAsmParser::validateMIMGDataSize(const MCInst &Inst) {
     return true;
 
   unsigned VDataSize = AMDGPU::getRegOperandSize(getMRI(), Desc, VDataIdx);
-  unsigned TFESize = Inst.getOperand(TFEIdx).getImm()? 1 : 0;
+  unsigned TFESize = (TFEIdx != -1 && Inst.getOperand(TFEIdx).getImm()) ? 1 : 0;
   unsigned DMask = Inst.getOperand(DMaskIdx).getImm() & 0xf;
   if (DMask == 0)
     DMask = 1;
@@ -3804,6 +3996,79 @@ bool AMDGPUAsmParser::validateVOP3Literal(const MCInst &Inst,
   return true;
 }
 
+// Returns -1 if not a register, 0 if VGPR and 1 if AGPR.
+static int IsAGPROperand(const MCInst &Inst, uint16_t NameIdx,
+                         const MCRegisterInfo *MRI) {
+  int OpIdx = AMDGPU::getNamedOperandIdx(Inst.getOpcode(), NameIdx);
+  if (OpIdx < 0)
+    return -1;
+
+  const MCOperand &Op = Inst.getOperand(OpIdx);
+  if (!Op.isReg())
+    return -1;
+
+  unsigned Sub = MRI->getSubReg(Op.getReg(), AMDGPU::sub0);
+  auto Reg = Sub ? Sub : Op.getReg();
+  const MCRegisterClass &AGRP32 = MRI->getRegClass(AMDGPU::AGPR_32RegClassID);
+  return AGRP32.contains(Reg) ? 1 : 0;
+}
+
+bool AMDGPUAsmParser::validateAGPRLdSt(const MCInst &Inst) const {
+  uint64_t TSFlags = MII.get(Inst.getOpcode()).TSFlags;
+  if ((TSFlags & (SIInstrFlags::FLAT | SIInstrFlags::MUBUF |
+                  SIInstrFlags::MTBUF | SIInstrFlags::MIMG |
+                  SIInstrFlags::DS)) == 0)
+    return true;
+
+  uint16_t DataNameIdx = (TSFlags & SIInstrFlags::DS) ? AMDGPU::OpName::data0
+                                                      : AMDGPU::OpName::vdata;
+
+  const MCRegisterInfo *MRI = getMRI();
+  int DstAreg = IsAGPROperand(Inst, AMDGPU::OpName::vdst, MRI);
+  int DataAreg = IsAGPROperand(Inst, DataNameIdx, MRI);
+
+  if ((TSFlags & SIInstrFlags::DS) && DataAreg >= 0) {
+    int Data2Areg = IsAGPROperand(Inst, AMDGPU::OpName::data1, MRI);
+    if (Data2Areg >= 0 && Data2Areg != DataAreg)
+      return false;
+  }
+
+  auto FB = getFeatureBits();
+  if (FB[AMDGPU::FeatureGFX90AInsts]) {
+    if (DataAreg < 0 || DstAreg < 0)
+      return true;
+    return DstAreg == DataAreg;
+  }
+
+  return DstAreg < 1 && DataAreg < 1;
+}
+
+bool AMDGPUAsmParser::validateVGPRAlign(const MCInst &Inst) const {
+  auto FB = getFeatureBits();
+  if (!FB[AMDGPU::FeatureGFX90AInsts])
+    return true;
+
+  const MCRegisterInfo *MRI = getMRI();
+  const MCRegisterClass &VGRP32 = MRI->getRegClass(AMDGPU::VGPR_32RegClassID);
+  const MCRegisterClass &AGRP32 = MRI->getRegClass(AMDGPU::AGPR_32RegClassID);
+  for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
+    const MCOperand &Op = Inst.getOperand(I);
+    if (!Op.isReg())
+      continue;
+
+    unsigned Sub = MRI->getSubReg(Op.getReg(), AMDGPU::sub0);
+    if (!Sub)
+      continue;
+
+    if (VGRP32.contains(Sub) && ((Sub - AMDGPU::VGPR0) & 1))
+      return false;
+    if (AGRP32.contains(Sub) && ((Sub - AMDGPU::AGPR0) & 1))
+      return false;
+  }
+
+  return true;
+}
+
 bool AMDGPUAsmParser::validateCoherencyBits(const MCInst &Inst,
                                             const OperandVector &Operands,
                                             const SMLoc &IDLoc) {
@@ -3895,6 +4160,23 @@ bool AMDGPUAsmParser::validateInstruction(const MCInst &Inst,
   if (!validateMAIAccWrite(Inst, Operands)) {
     return false;
   }
+  if (!validateCoherencyBits(Inst, Operands, IDLoc)) {
+    return false;
+  }
+
+  if (!validateAGPRLdSt(Inst)) {
+    Error(IDLoc, getFeatureBits()[AMDGPU::FeatureGFX90AInsts]
+    ? "invalid register class: data and dst should be all VGPR or AGPR"
+    : "invalid register class: agpr loads and stores not supported on this GPU"
+    );
+    return false;
+  }
+  if (!validateVGPRAlign(Inst)) {
+    Error(IDLoc,
+      "invalid register class: vgpr tuples must be 64 bit aligned");
+    return false;
+  }
+
   if (!validateDivScale(Inst)) {
     Error(IDLoc, "ABS not allowed in VOP3B instructions");
     return false;
@@ -4145,6 +4427,7 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() {
 
   SMRange VGPRRange;
   uint64_t NextFreeVGPR = 0;
+  uint64_t AccumOffset = 0;
   SMRange SGPRRange;
   uint64_t NextFreeSGPR = 0;
   unsigned UserSGPRCount = 0;
@@ -4273,6 +4556,10 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() {
     } else if (ID == ".amdhsa_next_free_sgpr") {
       SGPRRange = ValRange;
       NextFreeSGPR = Val;
+    } else if (ID == ".amdhsa_accum_offset") {
+      if (!isGFX90A())
+        return Error(IDRange.Start, "directive requires gfx90a+", IDRange);
+      AccumOffset = Val;
     } else if (ID == ".amdhsa_reserve_vcc") {
       if (!isUInt<1>(Val))
         return OutOfRangeError(ValRange);
@@ -4313,6 +4600,11 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() {
         return Error(IDRange.Start, "directive requires gfx9+", IDRange);
       PARSE_BITS_ENTRY(KD.compute_pgm_rsrc1, COMPUTE_PGM_RSRC1_FP16_OVFL, Val,
                        ValRange);
+    } else if (ID == ".amdhsa_tg_split") {
+      if (!isGFX90A())
+        return Error(IDRange.Start, "directive requires gfx90a+", IDRange);
+      PARSE_BITS_ENTRY(KD.compute_pgm_rsrc3, COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT, Val,
+                       ValRange);
     } else if (ID == ".amdhsa_workgroup_processor_mode") {
       if (IVersion.Major < 10)
         return Error(IDRange.Start, "directive requires gfx10+", IDRange);
@@ -4397,6 +4689,18 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() {
   AMDHSA_BITS_SET(KD.compute_pgm_rsrc2, COMPUTE_PGM_RSRC2_USER_SGPR_COUNT,
                   UserSGPRCount);
 
+  if (isGFX90A()) {
+    if (Seen.find(".amdhsa_accum_offset") == Seen.end())
+      return TokError(".amdhsa_accum_offset directive is required");
+    if (AccumOffset < 4 || AccumOffset > 256 || (AccumOffset & 3))
+      return TokError("accum_offset should be in range [4..256] in "
+                      "increments of 4");
+    if (AccumOffset > alignTo(std::max((uint64_t)1, NextFreeVGPR), 4))
+      return TokError("accum_offset exceeds total VGPR allocation");
+    AMDHSA_BITS_SET(KD.compute_pgm_rsrc3, COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET,
+                    (AccumOffset / 4 - 1));
+  }
+
   getTargetStreamer().EmitAmdhsaKernelDescriptor(
       getSTI(), KernelName, KD, NextFreeVGPR, NextFreeSGPR, ReserveVCC,
       ReserveFlatScr, ReserveXNACK);
@@ -5075,6 +5379,8 @@ AMDGPUAsmParser::parseNamedBit(StringRef Name, OperandVector &Operands,
     Error(S, "dlc modifier is not supported on this GPU");
     return MatchOperand_ParseFail;
   }
+  if (!isGFX90A() && ImmTy == AMDGPUOperand::ImmTySCCB)
+    return MatchOperand_ParseFail;
 
   if (isGFX9() && ImmTy == AMDGPUOperand::ImmTyA16)
     ImmTy = AMDGPUOperand::ImmTyR128A16;
@@ -6504,6 +6810,10 @@ AMDGPUOperand::Ptr AMDGPUAsmParser::defaultDLC() const {
   return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyDLC);
 }
 
+AMDGPUOperand::Ptr AMDGPUAsmParser::defaultSCCB() const {
+  return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTySCCB);
+}
+
 AMDGPUOperand::Ptr AMDGPUAsmParser::defaultGLC() const {
   return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyGLC);
 }
@@ -6586,8 +6896,9 @@ void AMDGPUAsmParser::cvtMubufImpl(MCInst &Inst,
     addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyTFE);
   }
 
-  if (isGFX10Plus())
-    addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDLC);
+  addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDLC);
+  addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySWZ);
+  addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySCCB);
 }
 
 void AMDGPUAsmParser::cvtMtbuf(MCInst &Inst, const OperandVector &Operands) {
@@ -6625,9 +6936,9 @@ void AMDGPUAsmParser::cvtMtbuf(MCInst &Inst, const OperandVector &Operands) {
   addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyGLC);
   addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySLC);
   addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyTFE);
-
-  if (isGFX10Plus())
-    addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDLC);
+  addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDLC);
+  addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySWZ);
+  addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySCCB);
 }
 
 //===----------------------------------------------------------------------===//
@@ -6669,14 +6980,21 @@ void AMDGPUAsmParser::cvtMIMG(MCInst &Inst, const OperandVector &Operands,
   if (IsGFX10Plus)
     addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDim, -1);
   addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyUNorm);
+
+  if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::sccb) != -1)
+    addOptionalImmOperand(Inst, Operands, OptionalIdx,
+                          AMDGPUOperand::ImmTySCCB);
+
   if (IsGFX10Plus)
     addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDLC);
+
   addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyGLC);
   addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySLC);
   addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyR128A16);
+  if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::tfe) != -1)
+    addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyTFE);
   if (IsGFX10Plus)
     addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyA16);
-  addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyTFE);
   addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyLWE);
   if (!IsGFX10Plus)
     addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDA);
@@ -6784,6 +7102,7 @@ static const OptionalOperand AMDGPUOptionalOperandTable[] = {
   {"offset",  AMDGPUOperand::ImmTyOffset, false, nullptr},
   {"inst_offset", AMDGPUOperand::ImmTyInstOffset, false, nullptr},
   {"dlc",     AMDGPUOperand::ImmTyDLC, true, nullptr},
+  {"scc",     AMDGPUOperand::ImmTySCCB, true, nullptr},
   {"glc",     AMDGPUOperand::ImmTyGLC, true, nullptr},
   {"slc",     AMDGPUOperand::ImmTySLC, true, nullptr},
   {"swz",     AMDGPUOperand::ImmTySWZ, true, nullptr},
@@ -7021,6 +7340,7 @@ void AMDGPUAsmParser::cvtVOP3(MCInst &Inst, const OperandVector &Operands,
       Opc == AMDGPU::V_MAC_LEGACY_F32_e64_gfx6_gfx7 ||
       Opc == AMDGPU::V_MAC_LEGACY_F32_e64_gfx10 ||
       Opc == AMDGPU::V_MAC_F16_e64_vi ||
+      Opc == AMDGPU::V_FMAC_F64_e64_gfx90a ||
       Opc == AMDGPU::V_FMAC_F32_e64_gfx10 ||
       Opc == AMDGPU::V_FMAC_F32_e64_vi ||
       Opc == AMDGPU::V_FMAC_LEGACY_F32_e64_gfx10 ||
@@ -7276,6 +7596,15 @@ OperandMatchResultTy AMDGPUAsmParser::parseDPP8(OperandVector &Operands) {
 bool
 AMDGPUAsmParser::isSupportedDPPCtrl(StringRef Ctrl,
                                     const OperandVector &Operands) {
+  if (Ctrl == "row_newbcast")
+      return isGFX90A();
+
+  // DPP64 is supported for row_newbcast only.
+  const MCRegisterInfo *MRI = getMRI();
+  if (Operands.size() > 2 && Operands[1]->isReg() &&
+      MRI->getSubReg(Operands[1]->getReg(), AMDGPU::sub1))
+    return false;
+
   if (Ctrl == "row_share" ||
       Ctrl == "row_xmask")
     return isGFX10Plus();
@@ -7353,6 +7682,7 @@ AMDGPUAsmParser::parseDPPCtrlSel(StringRef Ctrl) {
     .Case("row_ror",   {DppCtrl::ROW_ROR0,        1, 15})
     .Case("row_share", {DppCtrl::ROW_SHARE_FIRST, 0, 15})
     .Case("row_xmask", {DppCtrl::ROW_XMASK_FIRST, 0, 15})
+    .Case("row_newbcast", {DppCtrl::ROW_NEWBCAST_FIRST, 0, 15})
     .Default({-1, 0, 0});
 
   bool Valid;

diff  --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td
index 4a67630c0809..d56d9b6d2e59 100644
--- a/llvm/lib/Target/AMDGPU/BUFInstructions.td
+++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td
@@ -7,13 +7,13 @@
 //===----------------------------------------------------------------------===//
 
 def MUBUFAddr32 : ComplexPattern<i64, 9, "SelectMUBUFAddr32">;
-def MUBUFAddr64 : ComplexPattern<i64, 9, "SelectMUBUFAddr64">;
+def MUBUFAddr64 : ComplexPattern<i64, 10, "SelectMUBUFAddr64">;
 def MUBUFAddr64Atomic : ComplexPattern<i64, 5, "SelectMUBUFAddr64">;
 
 def MUBUFScratchOffen : ComplexPattern<i64, 4, "SelectMUBUFScratchOffen", [], [SDNPWantParent]>;
 def MUBUFScratchOffset : ComplexPattern<i64, 3, "SelectMUBUFScratchOffset", [], [SDNPWantParent], 20>;
 
-def MUBUFOffset : ComplexPattern<i64, 8, "SelectMUBUFOffset">;
+def MUBUFOffset : ComplexPattern<i64, 9, "SelectMUBUFOffset">;
 def MUBUFOffsetNoGLC : ComplexPattern<i64, 3, "SelectMUBUFOffset">;
 def MUBUFOffsetAtomic : ComplexPattern<i64, 4, "SelectMUBUFOffset">;
 
@@ -105,6 +105,8 @@ class MTBUF_Pseudo <string opName, dag outs, dag ins,
   bits<1> has_slc     = 1;
   bits<1> has_tfe     = 1;
   bits<4> elements    = 0;
+  bits<1> has_sccb    = 1;
+  bits<1> sccb_value  = 0;
 }
 
 class MTBUF_Real <MTBUF_Pseudo ps> :
@@ -126,7 +128,7 @@ class MTBUF_Real <MTBUF_Pseudo ps> :
   bits<1>  dlc;
   bits<7>  format;
   bits<8>  vaddr;
-  bits<8>  vdata;
+  bits<10> vdata;
   bits<7>  srsrc;
   bits<1>  slc;
   bits<1>  tfe;
@@ -134,25 +136,31 @@ class MTBUF_Real <MTBUF_Pseudo ps> :
 
   bits<4> dfmt = format{3-0};
   bits<3> nfmt = format{6-4};
+
+  bits<1> sccb;
+  // GFX90A+ only: instruction uses AccVGPR for data
+  // Bit superceedes tfe.
+  bits<1> acc = !if(ps.has_vdata, vdata{9}, 0);
 }
 
 class getMTBUFInsDA<list<RegisterClass> vdataList,
                     list<RegisterClass> vaddrList=[]> {
   RegisterClass vdataClass = !if(!empty(vdataList), ?, !head(vdataList));
   RegisterClass vaddrClass = !if(!empty(vaddrList), ?, !head(vaddrList));
+  RegisterOperand vdata_op = getLdStRegisterOperand<vdataClass>.ret;
   dag InsNoData = !if(!empty(vaddrList),
     (ins                    SReg_128:$srsrc, SCSrc_b32:$soffset,
-         offset:$offset, FORMAT:$format, GLC:$glc, SLC:$slc, TFE:$tfe, DLC:$dlc, SWZ:$swz),
+         offset:$offset, FORMAT:$format, GLC:$glc, SLC:$slc, TFE:$tfe, DLC:$dlc, SWZ:$swz, SCCB_0:$sccb),
     (ins vaddrClass:$vaddr, SReg_128:$srsrc, SCSrc_b32:$soffset,
-         offset:$offset, FORMAT:$format, GLC:$glc, SLC:$slc, TFE:$tfe, DLC:$dlc, SWZ:$swz)
+         offset:$offset, FORMAT:$format, GLC:$glc, SLC:$slc, TFE:$tfe, DLC:$dlc, SWZ:$swz, SCCB_0:$sccb)
   );
   dag InsData = !if(!empty(vaddrList),
-    (ins vdataClass:$vdata,                    SReg_128:$srsrc,
+    (ins vdata_op:$vdata,                    SReg_128:$srsrc,
          SCSrc_b32:$soffset, offset:$offset, FORMAT:$format, GLC:$glc,
-         SLC:$slc, TFE:$tfe, DLC:$dlc, SWZ:$swz),
-    (ins vdataClass:$vdata, vaddrClass:$vaddr, SReg_128:$srsrc,
+         SLC:$slc, TFE:$tfe, DLC:$dlc, SWZ:$swz, SCCB_0:$sccb),
+    (ins vdata_op:$vdata, vaddrClass:$vaddr, SReg_128:$srsrc,
          SCSrc_b32:$soffset, offset:$offset, FORMAT:$format, GLC:$glc,
-         SLC:$slc, TFE:$tfe, DLC:$dlc, SWZ:$swz)
+         SLC:$slc, TFE:$tfe, DLC:$dlc, SWZ:$swz, SCCB_0:$sccb)
   );
   dag ret = !if(!empty(vdataList), InsNoData, InsData);
 }
@@ -202,9 +210,9 @@ class MTBUF_Load_Pseudo <string opName,
                          // Workaround bug bz30254
                          int addrKindCopy = addrKind>
   : MTBUF_Pseudo<opName,
-                 (outs vdataClass:$vdata),
+                 (outs getLdStRegisterOperand<vdataClass>.ret:$vdata),
                  getMTBUFIns<addrKindCopy>.ret,
-                 " $vdata, " # getMTBUFAsmOps<addrKindCopy>.ret # "$glc$slc$tfe$dlc$swz",
+                 " $vdata, " # getMTBUFAsmOps<addrKindCopy>.ret # "$glc$slc$tfe$dlc$swz$sccb",
                  pattern>,
     MTBUF_SetupAddr<addrKindCopy> {
   let PseudoInstr = opName # "_" # getAddrName<addrKindCopy>.ret;
@@ -252,7 +260,7 @@ class MTBUF_Store_Pseudo <string opName,
   : MTBUF_Pseudo<opName,
                  (outs),
                  getMTBUFIns<addrKindCopy, [vdataClassCopy]>.ret,
-                 " $vdata, " # getMTBUFAsmOps<addrKindCopy>.ret # "$glc$slc$tfe$dlc$swz",
+                 " $vdata, " # getMTBUFAsmOps<addrKindCopy>.ret # "$glc$slc$tfe$dlc$swz$sccb",
                  pattern>,
     MTBUF_SetupAddr<addrKindCopy> {
   let PseudoInstr = opName # "_" # getAddrName<addrKindCopy>.ret;
@@ -341,6 +349,8 @@ class MUBUF_Pseudo <string opName, dag outs, dag ins,
   bits<1> has_slc     = 1;
   bits<1> has_tfe     = 1;
   bits<4> elements    = 0;
+  bits<1> has_sccb    = 1;
+  bits<1> sccb_value  = 0;
 }
 
 class MUBUF_Real <MUBUF_Pseudo ps> :
@@ -362,11 +372,16 @@ class MUBUF_Real <MUBUF_Pseudo ps> :
   bits<1>  glc;
   bits<1>  dlc;
   bits<8>  vaddr;
-  bits<8>  vdata;
+  bits<10> vdata;
   bits<7>  srsrc;
   bits<1>  slc;
   bits<1>  tfe;
   bits<8>  soffset;
+
+  bits<1> sccb;
+  // GFX90A+ only: instruction uses AccVGPR for data
+  // Bit superceedes tfe.
+  bits<1> acc = !if(ps.has_vdata, vdata{9}, 0);
 }
 
 
@@ -395,6 +410,8 @@ class MUBUF_Invalidate <string opName, SDPatternOperator node = null_frag> :
   let has_offset  = 0;
   let has_slc     = 0;
   let has_tfe     = 0;
+  let has_sccb    = 0;
+  let sccb_value  = 0;
 }
 
 class getMUBUFInsDA<list<RegisterClass> vdataList,
@@ -402,6 +419,7 @@ class getMUBUFInsDA<list<RegisterClass> vdataList,
                     bit isLds = 0> {
   RegisterClass vdataClass = !if(!empty(vdataList), ?, !head(vdataList));
   RegisterClass vaddrClass = !if(!empty(vaddrList), ?, !head(vaddrList));
+  RegisterOperand vdata_op = getLdStRegisterOperand<vdataClass>.ret;
   dag InsNoData = !if(!empty(vaddrList),
     (ins                    SReg_128:$srsrc, SCSrc_b32:$soffset,
          offset:$offset, GLC:$glc, SLC:$slc),
@@ -409,14 +427,14 @@ class getMUBUFInsDA<list<RegisterClass> vdataList,
          offset:$offset, GLC:$glc, SLC:$slc)
   );
   dag InsData = !if(!empty(vaddrList),
-    (ins vdataClass:$vdata,                    SReg_128:$srsrc,
+    (ins vdata_op:$vdata,                    SReg_128:$srsrc,
          SCSrc_b32:$soffset, offset:$offset, GLC:$glc, SLC:$slc),
-    (ins vdataClass:$vdata, vaddrClass:$vaddr, SReg_128:$srsrc,
+    (ins vdata_op:$vdata, vaddrClass:$vaddr, SReg_128:$srsrc,
          SCSrc_b32:$soffset, offset:$offset, GLC:$glc, SLC:$slc)
   );
   dag ret = !con(
               !if(!empty(vdataList), InsNoData, InsData),
-              !if(isLds, (ins DLC:$dlc, SWZ:$swz), (ins TFE:$tfe, DLC:$dlc,SWZ:$swz))
+              !if(isLds, (ins DLC:$dlc, SWZ:$swz, SCCB_0:$sccb), (ins TFE:$tfe, DLC:$dlc, SWZ:$swz, SCCB_0:$sccb))
              );
 }
 
@@ -482,13 +500,15 @@ class MUBUF_Load_Pseudo <string opName,
                          bit isLds = 0,
                          list<dag> pattern=[],
                          // Workaround bug bz30254
-                         int addrKindCopy = addrKind>
+                         int addrKindCopy = addrKind,
+                         RegisterClass vdata_rc = getVregSrcForVT<vdata_vt>.ret,
+                         RegisterOperand vdata_op = getLdStRegisterOperand<vdata_rc>.ret>
   : MUBUF_Pseudo<opName,
-                 (outs getVregSrcForVT<vdata_vt>.ret:$vdata),
+                 (outs vdata_op:$vdata),
                  !con(getMUBUFIns<addrKindCopy, [], isLds>.ret,
-                      !if(HasTiedDest, (ins getVregSrcForVT<vdata_vt>.ret:$vdata_in), (ins))),
+                      !if(HasTiedDest, (ins vdata_op:$vdata_in), (ins))),
                  " $vdata, " # getMUBUFAsmOps<addrKindCopy>.ret # "$glc$slc" #
-                   !if(isLds, " lds", "$tfe") # "$dlc$swz",
+                   !if(isLds, " lds", "$tfe") # "$dlc$swz$sccb",
                  pattern>,
     MUBUF_SetupAddr<addrKindCopy> {
   let PseudoInstr = opName # !if(isLds, "_lds", "") #
@@ -567,7 +587,7 @@ class MUBUF_Store_Pseudo <string opName,
   : MUBUF_Pseudo<opName,
                  (outs),
                  getMUBUFIns<addrKindCopy, [getVregSrcForVT<store_vt>.ret]>.ret,
-                 " $vdata, " # getMUBUFAsmOps<addrKindCopy>.ret # "$glc$slc$tfe$dlc$swz",
+                 " $vdata, " # getMUBUFAsmOps<addrKindCopy>.ret # "$glc$slc$tfe$dlc$swz$sccb",
                  pattern>,
     MUBUF_SetupAddr<addrKindCopy> {
   let PseudoInstr = opName # "_" # getAddrName<addrKindCopy>.ret;
@@ -608,8 +628,8 @@ multiclass MUBUF_Pseudo_Stores<string opName,
 class MUBUF_Pseudo_Store_Lds<string opName>
   : MUBUF_Pseudo<opName,
                  (outs),
-                 (ins SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, GLC:$glc, SLC:$slc, SWZ:$swz),
-                 " $srsrc, $soffset$offset lds$glc$slc$swz"> {
+                 (ins SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, GLC:$glc, SLC:$slc, SWZ:$swz, SCCB_0:$sccb),
+                 " $srsrc, $soffset$offset lds$glc$slc$swz$sccb"> {
   let mayLoad = 0;
   let mayStore = 1;
   let maybeAtomic = 1;
@@ -626,17 +646,18 @@ class MUBUF_Pseudo_Store_Lds<string opName>
 class getMUBUFAtomicInsDA<RegisterClass vdataClass, bit vdata_in,
                           list<RegisterClass> vaddrList=[]> {
   RegisterClass vaddrClass = !if(!empty(vaddrList), ?, !head(vaddrList));
+  RegisterOperand vdata_op = getLdStRegisterOperand<vdataClass>.ret;
   dag ret = !if(vdata_in,
     !if(!empty(vaddrList),
-      (ins vdataClass:$vdata_in,
+      (ins vdata_op:$vdata_in,
            SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, GLC_1:$glc1, SLC:$slc),
-      (ins vdataClass:$vdata_in, vaddrClass:$vaddr,
+      (ins vdata_op:$vdata_in, vaddrClass:$vaddr,
            SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, GLC_1:$glc1, SLC:$slc)
     ),
     !if(!empty(vaddrList),
-      (ins vdataClass:$vdata,
+      (ins vdata_op:$vdata,
            SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, SLC:$slc),
-      (ins vdataClass:$vdata, vaddrClass:$vaddr,
+      (ins vdata_op:$vdata, vaddrClass:$vaddr,
            SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, SLC:$slc)
   ));
 }
@@ -678,6 +699,7 @@ class MUBUF_Atomic_Pseudo<string opName,
   let has_glc = 0;
   let has_dlc = 0;
   let has_tfe = 0;
+  let has_sccb = 0;
   let maybeAtomic = 1;
 }
 
@@ -696,6 +718,7 @@ class MUBUF_AtomicNoRet_Pseudo<string opName, int addrKind,
   let PseudoInstr = opName # "_" # getAddrName<addrKindCopy>.ret;
   let glc_value = 0;
   let dlc_value = 0;
+  let sccb_value = 0;
   let IsAtomicNoRet = 1;
   let AsmMatchConverter = "cvtMubufAtomic";
 }
@@ -705,9 +728,10 @@ class MUBUF_AtomicRet_Pseudo<string opName, int addrKind,
                              list<dag> pattern=[],
                              // Workaround bug bz30254
                              int addrKindCopy = addrKind,
-                             RegisterClass vdataClassCopy = vdataClass>
+                             RegisterClass vdataClassCopy = vdataClass,
+                             RegisterOperand vdata_op = getLdStRegisterOperand<vdataClass>.ret>
   : MUBUF_Atomic_Pseudo<opName, addrKindCopy,
-                        (outs vdataClassCopy:$vdata),
+                        (outs vdata_op:$vdata),
                         getMUBUFAtomicIns<addrKindCopy, vdataClassCopy, 1>.ret,
                         " $vdata, " # getMUBUFAsmOps<addrKindCopy>.ret # "$glc1$slc",
                         pattern>,
@@ -715,6 +739,7 @@ class MUBUF_AtomicRet_Pseudo<string opName, int addrKind,
   let PseudoInstr = opName # "_rtn_" # getAddrName<addrKindCopy>.ret;
   let glc_value = 1;
   let dlc_value = 0;
+  let sccb_value = 0;
   let IsAtomicRet = 1;
   let Constraints = "$vdata = $vdata_in";
   let DisableEncoding = "$vdata_in";
@@ -1108,6 +1133,15 @@ defm BUFFER_ATOMIC_ADD_F32 : MUBUF_Pseudo_Atomics_NO_RTN <
 defm BUFFER_ATOMIC_PK_ADD_F16 : MUBUF_Pseudo_Atomics_NO_RTN <
   "buffer_atomic_pk_add_f16", VGPR_32, v2f16, atomic_load_fadd_v2f16_global_noret_32
 >;
+
+let OtherPredicates = [isGFX90APlus] in {
+defm BUFFER_ATOMIC_ADD_F32 : MUBUF_Pseudo_Atomics_RTN <
+  "buffer_atomic_add_f32", VGPR_32, f32, atomic_load_fadd_global_32
+>;
+defm BUFFER_ATOMIC_PK_ADD_F16 : MUBUF_Pseudo_Atomics_RTN <
+  "buffer_atomic_pk_add_f16", VGPR_32, v2f16, atomic_load_fadd_v2f16_global_32
+>;
+}
 } // End SubtargetPredicate = HasAtomicFaddInsts
 
 //===----------------------------------------------------------------------===//
@@ -1156,6 +1190,17 @@ def BUFFER_WBINVL1_VOL : MUBUF_Invalidate <"buffer_wbinvl1_vol",
 
 } // End let SubtargetPredicate = isGFX7Plus
 
+let SubtargetPredicate = isGFX90APlus in {
+  def BUFFER_WBL2  : MUBUF_Invalidate<"buffer_wbl2"> {
+  }
+  def BUFFER_INVL2 : MUBUF_Invalidate<"buffer_invl2"> {
+  }
+
+  defm BUFFER_ATOMIC_ADD_F64 : MUBUF_Pseudo_Atomics<"buffer_atomic_add_f64", VReg_64, f64, int_amdgcn_global_atomic_fadd>;
+  defm BUFFER_ATOMIC_MIN_F64 : MUBUF_Pseudo_Atomics<"buffer_atomic_min_f64", VReg_64, f64, int_amdgcn_global_atomic_fmin>;
+  defm BUFFER_ATOMIC_MAX_F64 : MUBUF_Pseudo_Atomics<"buffer_atomic_max_f64", VReg_64, f64, int_amdgcn_global_atomic_fmax>;
+} // End SubtargetPredicate = isGFX90APlus
+
 let SubtargetPredicate = isGFX10Plus in {
   def BUFFER_GL0_INV : MUBUF_Invalidate<"buffer_gl0_inv">;
   def BUFFER_GL1_INV : MUBUF_Invalidate<"buffer_gl1_inv">;
@@ -1178,7 +1223,7 @@ multiclass MUBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt,
               timm:$auxiliary, 0)),
     (!cast<MUBUF_Pseudo>(opcode # _OFFSET) SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset),
       (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary),
-      (extract_swz $auxiliary))
+      (extract_swz $auxiliary), (extract_sccb $auxiliary))
   >;
 
   def : GCNPat<
@@ -1186,7 +1231,7 @@ multiclass MUBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt,
               timm:$auxiliary, 0)),
     (!cast<MUBUF_Pseudo>(opcode # _OFFEN) VGPR_32:$voffset, SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset),
       (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary),
-      (extract_swz $auxiliary))
+      (extract_swz $auxiliary), (extract_sccb $auxiliary))
   >;
 
   def : GCNPat<
@@ -1194,7 +1239,7 @@ multiclass MUBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt,
               timm:$auxiliary, timm)),
     (!cast<MUBUF_Pseudo>(opcode # _IDXEN) VGPR_32:$vindex, SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset),
       (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary),
-      (extract_swz $auxiliary))
+      (extract_swz $auxiliary), (extract_sccb $auxiliary))
   >;
 
   def : GCNPat<
@@ -1204,7 +1249,7 @@ multiclass MUBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt,
       (REG_SEQUENCE VReg_64, VGPR_32:$vindex, sub0, VGPR_32:$voffset, sub1),
       SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset),
       (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary),
-      (extract_swz $auxiliary))
+      (extract_swz $auxiliary), (extract_sccb $auxiliary))
   >;
 }
 
@@ -1264,7 +1309,7 @@ multiclass MUBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt,
               timm:$auxiliary, 0),
     (!cast<MUBUF_Pseudo>(opcode # _OFFSET_exact) getVregSrcForVT<vt>.ret:$vdata, SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset),
       (extract_glc $auxiliary), (extract_slc $auxiliary), 0,  (extract_dlc $auxiliary),
-      (extract_swz $auxiliary))
+      (extract_swz $auxiliary), (extract_sccb $auxiliary))
   >;
 
   def : GCNPat<
@@ -1273,7 +1318,7 @@ multiclass MUBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt,
     (!cast<MUBUF_Pseudo>(opcode # _OFFEN_exact) getVregSrcForVT<vt>.ret:$vdata, VGPR_32:$voffset, SReg_128:$rsrc, SCSrc_b32:$soffset,
       (as_i16timm $offset), (extract_glc $auxiliary),
       (extract_slc $auxiliary), 0,  (extract_dlc $auxiliary),
-      (extract_swz $auxiliary))
+      (extract_swz $auxiliary), (extract_sccb $auxiliary))
   >;
 
   def : GCNPat<
@@ -1282,7 +1327,7 @@ multiclass MUBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt,
     (!cast<MUBUF_Pseudo>(opcode # _IDXEN_exact) getVregSrcForVT<vt>.ret:$vdata, VGPR_32:$vindex, SReg_128:$rsrc, SCSrc_b32:$soffset,
       (as_i16timm $offset), (extract_glc $auxiliary),
       (extract_slc $auxiliary), 0,  (extract_dlc $auxiliary),
-      (extract_swz $auxiliary))
+      (extract_swz $auxiliary), (extract_sccb $auxiliary))
   >;
 
   def : GCNPat<
@@ -1293,7 +1338,7 @@ multiclass MUBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt,
       (REG_SEQUENCE VReg_64, VGPR_32:$vindex, sub0, VGPR_32:$voffset, sub1),
       SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), (extract_glc $auxiliary),
       (extract_slc $auxiliary), 0,  (extract_dlc $auxiliary),
-      (extract_swz $auxiliary))
+      (extract_swz $auxiliary), (extract_sccb $auxiliary))
   >;
 }
 
@@ -1462,15 +1507,24 @@ defm : BufferAtomicPatterns_NO_RTN<SIbuffer_atomic_fadd, f32, "BUFFER_ATOMIC_ADD
 defm : BufferAtomicPatterns_NO_RTN<SIbuffer_atomic_fadd, v2f16, "BUFFER_ATOMIC_PK_ADD_F16">;
 }
 
+let SubtargetPredicate = isGFX90APlus in {
+  defm : BufferAtomicPatterns<SIbuffer_atomic_fadd, f32,   "BUFFER_ATOMIC_ADD_F32">;
+  defm : BufferAtomicPatterns<SIbuffer_atomic_fadd, v2f16, "BUFFER_ATOMIC_PK_ADD_F16">;
+
+  defm : BufferAtomicPatterns<SIbuffer_atomic_fadd, f64, "BUFFER_ATOMIC_ADD_F64">;
+  defm : BufferAtomicPatterns<SIbuffer_atomic_fmin, f64, "BUFFER_ATOMIC_MIN_F64">;
+  defm : BufferAtomicPatterns<SIbuffer_atomic_fmax, f64, "BUFFER_ATOMIC_MAX_F64">;
+} // End SubtargetPredicate = isGFX90APlus
+
 def : GCNPat<
   (SIbuffer_atomic_cmpswap
       i32:$data, i32:$cmp, v4i32:$rsrc, 0, 0, i32:$soffset,
       timm:$offset, timm:$cachepolicy, 0),
-  (EXTRACT_SUBREG
+  (EXTRACT_SUBREG (i64 (COPY_TO_REGCLASS
     (BUFFER_ATOMIC_CMPSWAP_OFFSET_RTN
       (REG_SEQUENCE VReg_64, VGPR_32:$data, sub0, VGPR_32:$cmp, sub1),
         SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset),
-        (extract_slc $cachepolicy)), sub0)
+        (extract_slc $cachepolicy)), VReg_64)), sub0)
 >;
 
 def : GCNPat<
@@ -1478,10 +1532,10 @@ def : GCNPat<
       i32:$data, i32:$cmp, v4i32:$rsrc, i32:$vindex,
       0, i32:$soffset, timm:$offset,
       timm:$cachepolicy, timm),
-  (EXTRACT_SUBREG
+  (EXTRACT_SUBREG (i64 (COPY_TO_REGCLASS
     (BUFFER_ATOMIC_CMPSWAP_IDXEN_RTN
       (REG_SEQUENCE VReg_64, VGPR_32:$data, sub0, VGPR_32:$cmp, sub1),
-      VGPR_32:$vindex, SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), (extract_slc $cachepolicy)),
+      VGPR_32:$vindex, SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), (extract_slc $cachepolicy)), VReg_64)),
     sub0)
 >;
 
@@ -1490,10 +1544,10 @@ def : GCNPat<
       i32:$data, i32:$cmp, v4i32:$rsrc, 0,
       i32:$voffset, i32:$soffset, timm:$offset,
       timm:$cachepolicy, 0),
-  (EXTRACT_SUBREG
+  (EXTRACT_SUBREG (i64 (COPY_TO_REGCLASS
     (BUFFER_ATOMIC_CMPSWAP_OFFEN_RTN
       (REG_SEQUENCE VReg_64, VGPR_32:$data, sub0, VGPR_32:$cmp, sub1),
-      VGPR_32:$voffset, SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), (extract_slc $cachepolicy)),
+      VGPR_32:$voffset, SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), (extract_slc $cachepolicy)), VReg_64)),
     sub0)
 >;
 
@@ -1502,11 +1556,11 @@ def : GCNPat<
       i32:$data, i32:$cmp, v4i32:$rsrc, i32:$vindex,
       i32:$voffset, i32:$soffset, timm:$offset,
       timm:$cachepolicy, timm),
-  (EXTRACT_SUBREG
+  (EXTRACT_SUBREG (i64 (COPY_TO_REGCLASS
     (BUFFER_ATOMIC_CMPSWAP_BOTHEN_RTN
       (REG_SEQUENCE VReg_64, VGPR_32:$data, sub0, VGPR_32:$cmp, sub1),
       (REG_SEQUENCE VReg_64, VGPR_32:$vindex, sub0, VGPR_32:$voffset, sub1),
-      SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), (extract_slc $cachepolicy)),
+      SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), (extract_slc $cachepolicy)), VReg_64)),
     sub0)
 >;
 
@@ -1522,12 +1576,12 @@ multiclass MUBUFLoad_Atomic_Pattern <MUBUF_Pseudo Instr_ADDR64, MUBUF_Pseudo Ins
   def : GCNPat <
      (vt (atomic_ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset,
                                    i16:$offset, i1:$slc))),
-     (Instr_ADDR64 $vaddr, $srsrc, $soffset, $offset, 0, $slc, 0, 0, 0)
+     (Instr_ADDR64 $vaddr, $srsrc, $soffset, $offset, 0, $slc, 0, 0, 0, 0)
   >;
 
   def : GCNPat <
     (vt (atomic_ld (MUBUFOffsetNoGLC v4i32:$rsrc, i32:$soffset, i16:$offset))),
-    (Instr_OFFSET $rsrc, $soffset, (as_i16imm $offset), 0, 0, 0, 0, 0)
+    (Instr_OFFSET $rsrc, $soffset, (as_i16imm $offset), 0, 0, 0, 0, 0, 0)
   >;
 }
 
@@ -1572,12 +1626,12 @@ multiclass MUBUFScratchLoadPat <MUBUF_Pseudo InstrOffen,
   def : GCNPat <
     (vt (ld (MUBUFScratchOffen v4i32:$srsrc, i32:$vaddr,
                                i32:$soffset, u16imm:$offset))),
-    (InstrOffen $vaddr, $srsrc, $soffset, $offset, 0, 0, 0, 0, 0)
+    (InstrOffen $vaddr, $srsrc, $soffset, $offset, 0, 0, 0, 0, 0, 0)
   >;
 
   def : GCNPat <
     (vt (ld (MUBUFScratchOffset v4i32:$srsrc, i32:$soffset, u16imm:$offset))),
-    (InstrOffset $srsrc, $soffset, $offset, 0, 0, 0, 0, 0)
+    (InstrOffset $srsrc, $soffset, $offset, 0, 0, 0, 0, 0, 0)
   >;
 }
 
@@ -1639,12 +1693,12 @@ multiclass MUBUFStore_Atomic_Pattern <MUBUF_Pseudo Instr_ADDR64, MUBUF_Pseudo In
   def : GCNPat <
      (atomic_st (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset,
                                    i16:$offset, i1:$slc), vt:$val),
-     (Instr_ADDR64 $val, $vaddr, $srsrc, $soffset, $offset, 0, $slc, 0, 0, 0)
+     (Instr_ADDR64 $val, $vaddr, $srsrc, $soffset, $offset, 0, $slc, 0, 0, 0, 0)
   >;
 
   def : GCNPat <
     (atomic_st (MUBUFOffsetNoGLC v4i32:$rsrc, i32:$soffset, i16:$offset), vt:$val),
-    (Instr_OFFSET $val, $rsrc, $soffset, (as_i16imm $offset), 0, 0, 0, 0, 0)
+    (Instr_OFFSET $val, $rsrc, $soffset, (as_i16imm $offset), 0, 0, 0, 0, 0, 0)
   >;
 }
 let SubtargetPredicate = isGFX6GFX7 in {
@@ -1673,13 +1727,13 @@ multiclass MUBUFScratchStorePat <MUBUF_Pseudo InstrOffen,
   def : GCNPat <
     (st vt:$value, (MUBUFScratchOffen v4i32:$srsrc, i32:$vaddr,
                                       i32:$soffset, u16imm:$offset)),
-    (InstrOffen rc:$value, $vaddr, $srsrc, $soffset, $offset, 0, 0, 0, 0, 0)
+    (InstrOffen rc:$value, $vaddr, $srsrc, $soffset, $offset, 0, 0, 0, 0, 0, 0)
   >;
 
   def : GCNPat <
     (st vt:$value, (MUBUFScratchOffset v4i32:$srsrc, i32:$soffset,
                                        u16imm:$offset)),
-    (InstrOffset rc:$value, $srsrc, $soffset, $offset, 0, 0, 0, 0, 0)
+    (InstrOffset rc:$value, $srsrc, $soffset, $offset, 0, 0, 0, 0, 0, 0)
   >;
 }
 
@@ -1726,7 +1780,7 @@ multiclass MTBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt,
     (!cast<MTBUF_Pseudo>(opcode # _OFFSET) SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset),
       (as_i8timm $format),
       (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary),
-      (extract_swz $auxiliary))
+      (extract_swz $auxiliary), (extract_sccb $auxiliary))
   >;
 
   def : GCNPat<
@@ -1735,7 +1789,7 @@ multiclass MTBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt,
     (!cast<MTBUF_Pseudo>(opcode # _IDXEN) VGPR_32:$vindex, SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset),
       (as_i8timm $format),
       (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary),
-      (extract_swz $auxiliary))
+      (extract_swz $auxiliary), (extract_sccb $auxiliary))
   >;
 
   def : GCNPat<
@@ -1744,7 +1798,7 @@ multiclass MTBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt,
     (!cast<MTBUF_Pseudo>(opcode # _OFFEN) VGPR_32:$voffset, SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset),
       (as_i8timm $format),
       (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary),
-      (extract_swz $auxiliary))
+      (extract_swz $auxiliary), (extract_sccb $auxiliary))
   >;
 
   def : GCNPat<
@@ -1755,7 +1809,7 @@ multiclass MTBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt,
       SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset),
       (as_i8timm $format),
       (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary),
-      (extract_swz $auxiliary))
+      (extract_swz $auxiliary), (extract_sccb $auxiliary))
   >;
 }
 
@@ -1794,7 +1848,7 @@ multiclass MTBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt,
     (!cast<MTBUF_Pseudo>(opcode # _OFFSET_exact) getVregSrcForVT<vt>.ret:$vdata, SReg_128:$rsrc, SCSrc_b32:$soffset,
       (as_i16timm $offset), (as_i8timm $format),
       (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary),
-      (extract_swz $auxiliary))
+      (extract_swz $auxiliary), (extract_sccb $auxiliary))
   >;
 
   def : GCNPat<
@@ -1803,7 +1857,7 @@ multiclass MTBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt,
     (!cast<MTBUF_Pseudo>(opcode # _IDXEN_exact) getVregSrcForVT<vt>.ret:$vdata, VGPR_32:$vindex, SReg_128:$rsrc, SCSrc_b32:$soffset,
       (as_i16timm $offset), (as_i8timm $format),
       (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary),
-      (extract_swz $auxiliary))
+      (extract_swz $auxiliary), (extract_sccb $auxiliary))
   >;
 
   def : GCNPat<
@@ -1812,7 +1866,7 @@ multiclass MTBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt,
     (!cast<MTBUF_Pseudo>(opcode # _OFFEN_exact) getVregSrcForVT<vt>.ret:$vdata, VGPR_32:$voffset, SReg_128:$rsrc, SCSrc_b32:$soffset,
       (as_i16timm $offset), (as_i8timm $format),
       (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary),
-      (extract_swz $auxiliary))
+      (extract_swz $auxiliary), (extract_sccb $auxiliary))
   >;
 
   def : GCNPat<
@@ -1823,7 +1877,7 @@ multiclass MTBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt,
       (REG_SEQUENCE VReg_64, VGPR_32:$vindex, sub0, VGPR_32:$voffset, sub1),
       SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), (as_i8timm $format),
       (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary),
-      (extract_swz $auxiliary))
+      (extract_swz $auxiliary), (extract_sccb $auxiliary))
   >;
 }
 
@@ -1870,7 +1924,7 @@ class Base_MUBUF_Real_gfx6_gfx7_gfx10<bits<7> op, MUBUF_Pseudo ps, int ef> :
   let Inst{24-18} = op;
   let Inst{31-26} = 0x38;
   let Inst{39-32} = !if(ps.has_vaddr, vaddr, ?);
-  let Inst{47-40} = !if(ps.has_vdata, vdata, ?);
+  let Inst{47-40} = !if(ps.has_vdata, vdata{7-0}, ?);
   let Inst{52-48} = !if(ps.has_srsrc, srsrc{6-2}, ?);
   let Inst{54}    = !if(ps.has_slc, slc, ?);
   let Inst{55}    = !if(ps.has_tfe, tfe, ?);
@@ -2124,7 +2178,7 @@ class Base_MTBUF_Real_gfx6_gfx7_gfx10<bits<3> op, MTBUF_Pseudo ps, int ef> :
   let Inst{18-16} = op;
   let Inst{31-26} = 0x3a; //encoding
   let Inst{39-32} = !if(ps.has_vaddr, vaddr, ?);
-  let Inst{47-40} = !if(ps.has_vdata, vdata, ?);
+  let Inst{47-40} = !if(ps.has_vdata, vdata{7-0}, ?);
   let Inst{52-48} = !if(ps.has_srsrc, srsrc{6-2}, ?);
   let Inst{54}    = !if(ps.has_slc, slc, ?);
   let Inst{55}    = !if(ps.has_tfe, tfe, ?);
@@ -2206,33 +2260,53 @@ defm TBUFFER_STORE_FORMAT_XYZW : MTBUF_Real_AllAddr_gfx6_gfx7_gfx10<0x007>;
 // GFX8, GFX9 (VI).
 //===----------------------------------------------------------------------===//
 
-class MUBUF_Real_vi <bits<7> op, MUBUF_Pseudo ps> :
+class MUBUF_Real_Base_vi <bits<7> op, MUBUF_Pseudo ps, int Enc> :
   MUBUF_Real<ps>,
   Enc64,
-  SIMCInstr<ps.PseudoInstr, SIEncodingFamily.VI> {
-  let AssemblerPredicate = isGFX8GFX9;
-  let DecoderNamespace = "GFX8";
+  SIMCInstr<ps.PseudoInstr, Enc> {
 
   let Inst{11-0}  = !if(ps.has_offset, offset, ?);
   let Inst{12}    = ps.offen;
   let Inst{13}    = ps.idxen;
   let Inst{14}    = !if(ps.has_glc, glc, ps.glc_value);
+  let Inst{15}    = !if(ps.has_sccb, sccb, ps.sccb_value);
   let Inst{16}    = ps.lds;
   let Inst{17}    = !if(ps.has_slc, slc, ?);
   let Inst{24-18} = op;
   let Inst{31-26} = 0x38; //encoding
   let Inst{39-32} = !if(ps.has_vaddr, vaddr, ?);
-  let Inst{47-40} = !if(ps.has_vdata, vdata, ?);
+  let Inst{47-40} = !if(ps.has_vdata, vdata{7-0}, ?);
   let Inst{52-48} = !if(ps.has_srsrc, srsrc{6-2}, ?);
-  let Inst{55}    = !if(ps.has_tfe, tfe, ?);
   let Inst{63-56} = !if(ps.has_soffset, soffset, ?);
 }
 
+class MUBUF_Real_vi <bits<7> op, MUBUF_Pseudo ps> :
+  MUBUF_Real_Base_vi<op, ps, SIEncodingFamily.VI> {
+  let AssemblerPredicate = isGFX8GFX9NotGFX90A;
+  let DecoderNamespace = "GFX8";
+
+  let Inst{55}    = !if(ps.has_tfe, tfe, ?);
+}
+
+class MUBUF_Real_gfx90a <bits<7> op, MUBUF_Pseudo ps> :
+  MUBUF_Real_Base_vi<op, ps, SIEncodingFamily.GFX90A> {
+  let AssemblerPredicate = isGFX90APlus;
+  let DecoderNamespace = "GFX90A";
+  let AsmString = ps.Mnemonic # !subst("$tfe", "", ps.AsmOperands);
+
+  let Inst{55}    = acc;
+}
+
+multiclass MUBUF_Real_vi_gfx90a<bits<7> op, MUBUF_Pseudo ps> {
+  def _vi :     MUBUF_Real_vi<op, ps>;
+  def _gfx90a : MUBUF_Real_gfx90a<op, ps>;
+}
+
 multiclass MUBUF_Real_AllAddr_vi<bits<7> op> {
-  def _OFFSET_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_OFFSET")>;
-  def _OFFEN_vi  : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_OFFEN")>;
-  def _IDXEN_vi  : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_IDXEN")>;
-  def _BOTHEN_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_BOTHEN")>;
+  defm _OFFSET : MUBUF_Real_vi_gfx90a <op, !cast<MUBUF_Pseudo>(NAME#"_OFFSET")>;
+  defm _OFFEN  : MUBUF_Real_vi_gfx90a <op, !cast<MUBUF_Pseudo>(NAME#"_OFFEN")>;
+  defm _IDXEN  : MUBUF_Real_vi_gfx90a <op, !cast<MUBUF_Pseudo>(NAME#"_IDXEN")>;
+  defm _BOTHEN : MUBUF_Real_vi_gfx90a <op, !cast<MUBUF_Pseudo>(NAME#"_BOTHEN")>;
 }
 
 multiclass MUBUF_Real_AllAddr_Lds_vi<bits<7> op> {
@@ -2254,6 +2328,24 @@ multiclass MUBUF_Real_AllAddr_Lds_vi<bits<7> op> {
                        MUBUFLdsTable<1, NAME # "_IDXEN_vi">;
   def _LDS_BOTHEN_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_LDS_BOTHEN")>,
                        MUBUFLdsTable<1, NAME # "_BOTHEN_vi">;
+
+  def _OFFSET_gfx90a : MUBUF_Real_gfx90a <op, !cast<MUBUF_Pseudo>(NAME#"_OFFSET")>,
+                   MUBUFLdsTable<0, NAME # "_OFFSET_gfx90a">;
+  def _OFFEN_gfx90a  : MUBUF_Real_gfx90a <op, !cast<MUBUF_Pseudo>(NAME#"_OFFEN")>,
+                   MUBUFLdsTable<0, NAME # "_OFFEN_gfx90a">;
+  def _IDXEN_gfx90a  : MUBUF_Real_gfx90a <op, !cast<MUBUF_Pseudo>(NAME#"_IDXEN")>,
+                   MUBUFLdsTable<0, NAME # "_IDXEN_gfx90a">;
+  def _BOTHEN_gfx90a : MUBUF_Real_gfx90a <op, !cast<MUBUF_Pseudo>(NAME#"_BOTHEN")>,
+                   MUBUFLdsTable<0, NAME # "_BOTHEN_gfx90a">;
+
+  def _LDS_OFFSET_gfx90a : MUBUF_Real_gfx90a <op, !cast<MUBUF_Pseudo>(NAME#"_LDS_OFFSET")>,
+                       MUBUFLdsTable<1, NAME # "_OFFSET_gfx90a">;
+  def _LDS_OFFEN_gfx90a  : MUBUF_Real_gfx90a <op, !cast<MUBUF_Pseudo>(NAME#"_LDS_OFFEN")>,
+                       MUBUFLdsTable<1, NAME # "_OFFEN_gfx90a">;
+  def _LDS_IDXEN_gfx90a  : MUBUF_Real_gfx90a <op, !cast<MUBUF_Pseudo>(NAME#"_LDS_IDXEN")>,
+                       MUBUFLdsTable<1, NAME # "_IDXEN_gfx90a">;
+  def _LDS_BOTHEN_gfx90a : MUBUF_Real_gfx90a <op, !cast<MUBUF_Pseudo>(NAME#"_LDS_BOTHEN")>,
+                       MUBUFLdsTable<1, NAME # "_BOTHEN_gfx90a">;
 }
 
 class MUBUF_Real_gfx80 <bits<7> op, MUBUF_Pseudo ps> :
@@ -2272,7 +2364,7 @@ class MUBUF_Real_gfx80 <bits<7> op, MUBUF_Pseudo ps> :
   let Inst{24-18} = op;
   let Inst{31-26} = 0x38; //encoding
   let Inst{39-32} = !if(ps.has_vaddr, vaddr, ?);
-  let Inst{47-40} = !if(ps.has_vdata, vdata, ?);
+  let Inst{47-40} = !if(ps.has_vdata, vdata{7-0}, ?);
   let Inst{52-48} = !if(ps.has_srsrc, srsrc{6-2}, ?);
   let Inst{55}    = !if(ps.has_tfe, tfe, ?);
   let Inst{63-56} = !if(ps.has_soffset, soffset, ?);
@@ -2287,10 +2379,10 @@ multiclass MUBUF_Real_AllAddr_gfx80<bits<7> op> {
 
 multiclass MUBUF_Real_Atomic_vi<bits<7> op> :
   MUBUF_Real_AllAddr_vi<op> {
-  def _OFFSET_RTN_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_OFFSET_RTN")>;
-  def _OFFEN_RTN_vi  : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_OFFEN_RTN")>;
-  def _IDXEN_RTN_vi  : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_IDXEN_RTN")>;
-  def _BOTHEN_RTN_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_BOTHEN_RTN")>;
+  defm _OFFSET_RTN : MUBUF_Real_vi_gfx90a <op, !cast<MUBUF_Pseudo>(NAME#"_OFFSET_RTN")>;
+  defm _OFFEN_RTN  : MUBUF_Real_vi_gfx90a <op, !cast<MUBUF_Pseudo>(NAME#"_OFFEN_RTN")>;
+  defm _IDXEN_RTN  : MUBUF_Real_vi_gfx90a <op, !cast<MUBUF_Pseudo>(NAME#"_IDXEN_RTN")>;
+  defm _BOTHEN_RTN : MUBUF_Real_vi_gfx90a <op, !cast<MUBUF_Pseudo>(NAME#"_BOTHEN_RTN")>;
 }
 
 defm BUFFER_LOAD_FORMAT_X       : MUBUF_Real_AllAddr_Lds_vi <0x00>;
@@ -2376,24 +2468,34 @@ defm BUFFER_ATOMIC_XOR_X2       : MUBUF_Real_Atomic_vi <0x6a>;
 defm BUFFER_ATOMIC_INC_X2       : MUBUF_Real_Atomic_vi <0x6b>;
 defm BUFFER_ATOMIC_DEC_X2       : MUBUF_Real_Atomic_vi <0x6c>;
 
-def BUFFER_STORE_LDS_DWORD_vi   : MUBUF_Real_vi <0x3d, BUFFER_STORE_LDS_DWORD>;
+defm BUFFER_STORE_LDS_DWORD     : MUBUF_Real_vi_gfx90a <0x3d, BUFFER_STORE_LDS_DWORD>;
 
+let AssemblerPredicate = isGFX8GFX9 in {
 def BUFFER_WBINVL1_vi           : MUBUF_Real_vi <0x3e, BUFFER_WBINVL1>;
 def BUFFER_WBINVL1_VOL_vi       : MUBUF_Real_vi <0x3f, BUFFER_WBINVL1_VOL>;
+} // End AssemblerPredicate = isGFX8GFX9
 
 let SubtargetPredicate = HasAtomicFaddInsts in {
 
-defm BUFFER_ATOMIC_ADD_F32    : MUBUF_Real_AllAddr_vi <0x4d>;
-defm BUFFER_ATOMIC_PK_ADD_F16 : MUBUF_Real_AllAddr_vi <0x4e>;
+defm BUFFER_ATOMIC_ADD_F32    : MUBUF_Real_Atomic_vi <0x4d>;
+defm BUFFER_ATOMIC_PK_ADD_F16 : MUBUF_Real_Atomic_vi <0x4e>;
 
 } // End SubtargetPredicate = HasAtomicFaddInsts
 
-class MTBUF_Real_vi <bits<4> op, MTBUF_Pseudo ps> :
+let SubtargetPredicate = isGFX90APlus, AssemblerPredicate = isGFX90APlus in {
+  defm BUFFER_ATOMIC_ADD_F64 : MUBUF_Real_Atomic_vi<0x4f>;
+  defm BUFFER_ATOMIC_MIN_F64 : MUBUF_Real_Atomic_vi<0x50>;
+  defm BUFFER_ATOMIC_MAX_F64 : MUBUF_Real_Atomic_vi<0x51>;
+} // End SubtargetPredicate = isGFX90APlus, AssemblerPredicate = isGFX90APlus
+
+def BUFFER_WBL2_gfx90a  : MUBUF_Real_gfx90a<0x28, BUFFER_WBL2> {
+}
+def BUFFER_INVL2_gfx90a : MUBUF_Real_gfx90a<0x29, BUFFER_INVL2>;
+
+class MTBUF_Real_Base_vi <bits<4> op, MTBUF_Pseudo ps, int Enc> :
   MTBUF_Real<ps>,
   Enc64,
-  SIMCInstr<ps.PseudoInstr, SIEncodingFamily.VI> {
-  let AssemblerPredicate = isGFX8GFX9;
-  let DecoderNamespace = "GFX8";
+  SIMCInstr<ps.PseudoInstr, Enc> {
 
   let Inst{11-0}  = !if(ps.has_offset, offset, ?);
   let Inst{12}    = ps.offen;
@@ -2404,18 +2506,40 @@ class MTBUF_Real_vi <bits<4> op, MTBUF_Pseudo ps> :
   let Inst{25-23} = nfmt;
   let Inst{31-26} = 0x3a; //encoding
   let Inst{39-32} = !if(ps.has_vaddr, vaddr, ?);
-  let Inst{47-40} = !if(ps.has_vdata, vdata, ?);
+  let Inst{47-40} = !if(ps.has_vdata, vdata{7-0}, ?);
   let Inst{52-48} = !if(ps.has_srsrc, srsrc{6-2}, ?);
+  let Inst{53}    = !if(ps.has_sccb, sccb, ps.sccb_value);
   let Inst{54}    = !if(ps.has_slc, slc, ?);
-  let Inst{55}    = !if(ps.has_tfe, tfe, ?);
   let Inst{63-56} = !if(ps.has_soffset, soffset, ?);
 }
 
+class MTBUF_Real_vi <bits<4> op, MTBUF_Pseudo ps> :
+  MTBUF_Real_Base_vi <op, ps, SIEncodingFamily.VI> {
+  let AssemblerPredicate = isGFX8GFX9NotGFX90A;
+  let DecoderNamespace = "GFX8";
+
+  let Inst{55}    = !if(ps.has_tfe, tfe, ?);
+}
+
+class MTBUF_Real_gfx90a <bits<4> op, MTBUF_Pseudo ps> :
+  MTBUF_Real_Base_vi <op, ps, SIEncodingFamily.GFX90A> {
+  let AssemblerPredicate = isGFX90APlus;
+  let DecoderNamespace = "GFX90A";
+  let AsmString = ps.Mnemonic # !subst("$tfe", "", ps.AsmOperands);
+
+  let Inst{55}    = acc;
+}
+
+multiclass MTBUF_Real_vi_gfx90a<bits<4> op, MTBUF_Pseudo ps> {
+  def _vi :     MTBUF_Real_vi<op, ps>;
+  def _gfx90a : MTBUF_Real_gfx90a<op, ps>;
+}
+
 multiclass MTBUF_Real_AllAddr_vi<bits<4> op> {
-  def _OFFSET_vi : MTBUF_Real_vi <op, !cast<MTBUF_Pseudo>(NAME#"_OFFSET")>;
-  def _OFFEN_vi  : MTBUF_Real_vi <op, !cast<MTBUF_Pseudo>(NAME#"_OFFEN")>;
-  def _IDXEN_vi  : MTBUF_Real_vi <op, !cast<MTBUF_Pseudo>(NAME#"_IDXEN")>;
-  def _BOTHEN_vi : MTBUF_Real_vi <op, !cast<MTBUF_Pseudo>(NAME#"_BOTHEN")>;
+  defm _OFFSET : MTBUF_Real_vi_gfx90a <op, !cast<MTBUF_Pseudo>(NAME#"_OFFSET")>;
+  defm _OFFEN  : MTBUF_Real_vi_gfx90a <op, !cast<MTBUF_Pseudo>(NAME#"_OFFEN")>;
+  defm _IDXEN  : MTBUF_Real_vi_gfx90a <op, !cast<MTBUF_Pseudo>(NAME#"_IDXEN")>;
+  defm _BOTHEN : MTBUF_Real_vi_gfx90a <op, !cast<MTBUF_Pseudo>(NAME#"_BOTHEN")>;
 }
 
 class MTBUF_Real_gfx80 <bits<4> op, MTBUF_Pseudo ps> :
@@ -2434,7 +2558,7 @@ class MTBUF_Real_gfx80 <bits<4> op, MTBUF_Pseudo ps> :
   let Inst{25-23} = nfmt;
   let Inst{31-26} = 0x3a; //encoding
   let Inst{39-32} = !if(ps.has_vaddr, vaddr, ?);
-  let Inst{47-40} = !if(ps.has_vdata, vdata, ?);
+  let Inst{47-40} = !if(ps.has_vdata, vdata{7-0}, ?);
   let Inst{52-48} = !if(ps.has_srsrc, srsrc{6-2}, ?);
   let Inst{54}    = !if(ps.has_slc, slc, ?);
   let Inst{55}    = !if(ps.has_tfe, tfe, ?);

diff  --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td
index 9f71017d81df..c856b2a0bbd2 100644
--- a/llvm/lib/Target/AMDGPU/DSInstructions.td
+++ b/llvm/lib/Target/AMDGPU/DSInstructions.td
@@ -67,17 +67,20 @@ class DS_Real <DS_Pseudo ds> :
   let AsmMatchConverter  = ds.AsmMatchConverter;
 
   // encoding fields
-  bits<8> vdst;
+  bits<10> vdst;
   bits<1> gds;
   bits<8> addr;
-  bits<8> data0;
-  bits<8> data1;
+  bits<10> data0;
+  bits<10> data1;
   bits<8> offset0;
   bits<8> offset1;
 
   bits<16> offset;
   let offset0 = !if(ds.has_offset, offset{7-0}, ?);
   let offset1 = !if(ds.has_offset, offset{15-8}, ?);
+
+  bits<1> acc = !if(ds.has_vdst, vdst{9},
+                    !if(!or(ds.has_data0, ds.has_gws_data0), data0{9}, 0));
 }
 
 
@@ -86,7 +89,7 @@ class DS_Real <DS_Pseudo ds> :
 class DS_0A1D_NORET<string opName, RegisterClass rc = VGPR_32>
 : DS_Pseudo<opName,
   (outs),
-  (ins rc:$data0, offset:$offset, gds:$gds),
+  (ins getLdStRegisterOperand<rc>.ret:$data0, offset:$offset, gds:$gds),
   " $data0$offset$gds"> {
 
   let has_addr = 0;
@@ -97,7 +100,7 @@ class DS_0A1D_NORET<string opName, RegisterClass rc = VGPR_32>
 class DS_1A1D_NORET<string opName, RegisterClass rc = VGPR_32>
 : DS_Pseudo<opName,
   (outs),
-  (ins VGPR_32:$addr, rc:$data0, offset:$offset, gds:$gds),
+  (ins VGPR_32:$addr, getLdStRegisterOperand<rc>.ret:$data0, offset:$offset, gds:$gds),
   " $addr, $data0$offset$gds"> {
 
   let has_data1 = 0;
@@ -115,10 +118,18 @@ multiclass DS_1A1D_NORET_mc<string opName, RegisterClass rc = VGPR_32> {
   }
 }
 
-class DS_1A2D_NORET<string opName, RegisterClass rc = VGPR_32>
+multiclass DS_1A1D_NORET_mc_gfx9<string opName, RegisterClass rc = VGPR_32> {
+  let has_m0_read = 0 in {
+    def "" : DS_1A1D_NORET<opName, rc>,
+                AtomicNoRet<opName, 0>;
+  }
+}
+
+class DS_1A2D_NORET<string opName, RegisterClass rc = VGPR_32,
+                    RegisterOperand data_op = getLdStRegisterOperand<rc>.ret>
 : DS_Pseudo<opName,
   (outs),
-  (ins VGPR_32:$addr, rc:$data0, rc:$data1, offset:$offset, gds:$gds),
+  (ins VGPR_32:$addr, data_op:$data0, data_op:$data1, offset:$offset, gds:$gds),
   " $addr, $data0, $data1$offset$gds"> {
 
   let has_vdst = 0;
@@ -135,10 +146,11 @@ multiclass DS_1A2D_NORET_mc<string opName, RegisterClass rc = VGPR_32> {
   }
 }
 
-class DS_1A2D_Off8_NORET <string opName, RegisterClass rc = VGPR_32>
+class DS_1A2D_Off8_NORET <string opName, RegisterClass rc = VGPR_32,
+                          RegisterOperand data_op = getLdStRegisterOperand<rc>.ret>
 : DS_Pseudo<opName,
   (outs),
-  (ins VGPR_32:$addr, rc:$data0, rc:$data1,
+  (ins VGPR_32:$addr, data_op:$data0, data_op:$data1,
        offset0:$offset0, offset1:$offset1, gds:$gds),
   " $addr, $data0, $data1$offset0$offset1$gds"> {
 
@@ -155,10 +167,11 @@ multiclass DS_1A2D_Off8_NORET_mc <string opName, RegisterClass rc = VGPR_32> {
   }
 }
 
-class DS_1A1D_RET <string opName, RegisterClass rc = VGPR_32>
+class DS_1A1D_RET <string opName, RegisterClass rc = VGPR_32,
+                  RegisterOperand data_op = getLdStRegisterOperand<rc>.ret>
 : DS_Pseudo<opName,
-  (outs rc:$vdst),
-  (ins VGPR_32:$addr, rc:$data0, offset:$offset, gds:$gds),
+  (outs data_op:$vdst),
+  (ins VGPR_32:$addr, data_op:$data0, offset:$offset, gds:$gds),
   " $vdst, $addr, $data0$offset$gds"> {
 
   let hasPostISelHook = 1;
@@ -178,12 +191,23 @@ multiclass DS_1A1D_RET_mc <string opName, RegisterClass rc = VGPR_32,
   }
 }
 
+multiclass DS_1A1D_RET_mc_gfx9 <string opName, RegisterClass rc = VGPR_32,
+                                string NoRetOp = ""> {
+  let has_m0_read = 0 in {
+    def "" : DS_1A1D_RET<opName, rc>,
+      AtomicNoRet<!if(!eq(NoRetOp, ""), "", NoRetOp),
+                  !if(!eq(NoRetOp, ""), 0, 1)>;
+  }
+}
+
 class DS_1A2D_RET<string opName,
                   RegisterClass rc = VGPR_32,
-                  RegisterClass src = rc>
+                  RegisterClass src = rc,
+                  RegisterOperand dst_op = getLdStRegisterOperand<rc>.ret,
+                  RegisterOperand src_op = getLdStRegisterOperand<src>.ret>
 : DS_Pseudo<opName,
-  (outs rc:$vdst),
-  (ins VGPR_32:$addr, src:$data0, src:$data1, offset:$offset, gds:$gds),
+  (outs dst_op:$vdst),
+  (ins VGPR_32:$addr, src_op:$data0, src_op:$data1, offset:$offset, gds:$gds),
   " $vdst, $addr, $data0, $data1$offset$gds"> {
 
   let hasPostISelHook = 1;
@@ -205,10 +229,12 @@ multiclass DS_1A2D_RET_mc<string opName,
 
 class DS_1A2D_Off8_RET<string opName,
                        RegisterClass rc = VGPR_32,
-                       RegisterClass src = rc>
+                       RegisterClass src = rc,
+                       RegisterOperand dst_op = getLdStRegisterOperand<rc>.ret,
+                       RegisterOperand src_op = getLdStRegisterOperand<src>.ret>
 : DS_Pseudo<opName,
-  (outs rc:$vdst),
-  (ins VGPR_32:$addr, src:$data0, src:$data1, offset0:$offset0, offset1:$offset1, gds:$gds),
+  (outs dst_op:$vdst),
+  (ins VGPR_32:$addr, src_op:$data0, src_op:$data1, offset0:$offset0, offset1:$offset1, gds:$gds),
   " $vdst, $addr, $data0, $data1$offset0$offset1$gds"> {
 
   let has_offset = 0;
@@ -228,11 +254,12 @@ multiclass DS_1A2D_Off8_RET_mc<string opName,
 }
 
 
-class DS_1A_RET<string opName, RegisterClass rc = VGPR_32, bit HasTiedOutput = 0, Operand ofs = offset>
+class DS_1A_RET<string opName, RegisterClass rc = VGPR_32, bit HasTiedOutput = 0, Operand ofs = offset,
+                RegisterOperand data_op = getLdStRegisterOperand<rc>.ret>
 : DS_Pseudo<opName,
-  (outs rc:$vdst),
+  (outs data_op:$vdst),
   !if(HasTiedOutput,
-    (ins VGPR_32:$addr, ofs:$offset, gds:$gds, rc:$vdst_in),
+    (ins VGPR_32:$addr, ofs:$offset, gds:$gds, data_op:$vdst_in),
     (ins VGPR_32:$addr, ofs:$offset, gds:$gds)),
   " $vdst, $addr$offset$gds"> {
   let Constraints = !if(HasTiedOutput, "$vdst = $vdst_in", "");
@@ -254,7 +281,7 @@ class DS_1A_RET_Tied<string opName, RegisterClass rc = VGPR_32> :
 
 class DS_1A_Off8_RET <string opName, RegisterClass rc = VGPR_32>
 : DS_Pseudo<opName,
-  (outs rc:$vdst),
+  (outs getLdStRegisterOperand<rc>.ret:$vdst),
   (ins VGPR_32:$addr, offset0:$offset0, offset1:$offset1, gds:$gds),
   " $vdst, $addr$offset0$offset1$gds"> {
 
@@ -273,7 +300,7 @@ multiclass DS_1A_Off8_RET_mc <string opName, RegisterClass rc = VGPR_32> {
 }
 
 class DS_1A_RET_GDS <string opName> : DS_Pseudo<opName,
-  (outs VGPR_32:$vdst),
+  (outs getLdStRegisterOperand<VGPR_32>.ret:$vdst),
   (ins VGPR_32:$addr, offset:$offset),
   " $vdst, $addr$offset gds"> {
 
@@ -285,7 +312,7 @@ class DS_1A_RET_GDS <string opName> : DS_Pseudo<opName,
 }
 
 class DS_0A_RET <string opName> : DS_Pseudo<opName,
-  (outs VGPR_32:$vdst),
+  (outs getLdStRegisterOperand<VGPR_32>.ret:$vdst),
   (ins offset:$offset, gds:$gds),
   " $vdst$offset$gds"> {
 
@@ -340,7 +367,8 @@ class DS_GWS_0D <string opName>
 
 class DS_GWS_1D <string opName>
 : DS_GWS<opName,
-  (ins VGPR_32:$data0, offset:$offset), " $data0$offset gds"> {
+  (ins getLdStRegisterOperand<VGPR_32>.ret:$data0, offset:$offset),
+  " $data0$offset gds"> {
 
   let has_gws_data0 = 1;
   let hasSideEffects = 1;
@@ -364,10 +392,11 @@ class DS_VOID <string opName> : DS_Pseudo<opName,
   let has_gds = 0;
 }
 
-class DS_1A1D_PERMUTE <string opName, SDPatternOperator node = null_frag>
+class DS_1A1D_PERMUTE <string opName, SDPatternOperator node = null_frag,
+                       RegisterOperand data_op = getLdStRegisterOperand<VGPR_32>.ret>
 : DS_Pseudo<opName,
-  (outs VGPR_32:$vdst),
-  (ins VGPR_32:$addr, VGPR_32:$data0, offset:$offset),
+  (outs data_op:$vdst),
+  (ins VGPR_32:$addr, data_op:$data0, offset:$offset),
   " $vdst, $addr, $data0$offset",
   [(set i32:$vdst,
    (node (DS1Addr1Offset i32:$addr, i16:$offset), i32:$data0))] > {
@@ -424,6 +453,11 @@ def DS_WRITE_ADDTID_B32 : DS_0A1D_NORET<"ds_write_addtid_b32">;
 
 } // End mayLoad = 0
 
+let SubtargetPredicate = isGFX90APlus in {
+  defm DS_ADD_F64     : DS_1A1D_NORET_mc_gfx9<"ds_add_f64", VReg_64>;
+  defm DS_ADD_RTN_F64 : DS_1A1D_RET_mc_gfx9<"ds_add_rtn_f64 ", VReg_64, "ds_add_f64">;
+} // End SubtargetPredicate = isGFX90APlus
+
 defm DS_MSKOR_B32     : DS_1A2D_NORET_mc<"ds_mskor_b32">;
 defm DS_CMPST_B32     : DS_1A2D_NORET_mc<"ds_cmpst_b32">;
 defm DS_CMPST_F32     : DS_1A2D_NORET_mc<"ds_cmpst_f32">;
@@ -942,6 +976,10 @@ defm : DSAtomicRetPat_mc<DS_MAX_RTN_U64, i64, "atomic_load_umax">;
 
 defm : DSAtomicCmpXChg_mc<DS_CMPST_RTN_B64, i64, "atomic_cmp_swap">;
 
+let SubtargetPredicate = isGFX90APlus in {
+def : DSAtomicRetPat<DS_ADD_RTN_F64, f64, atomic_load_fadd_local_64>;
+}
+
 def : Pat <
   (SIds_ordered_count i32:$value, i16:$offset),
   (DS_ORDERED_COUNT $value, (as_i16imm $offset))
@@ -963,10 +1001,10 @@ class Base_DS_Real_gfx6_gfx7_gfx10<bits<8> op, DS_Pseudo ps, int ef> :
   let Inst{17}    = !if(ps.has_gds, gds, ps.gdsValue);
   let Inst{25-18} = op;
   let Inst{31-26} = 0x36;
-  let Inst{39-32} = !if(ps.has_addr, addr, !if(ps.has_gws_data0, data0, 0));
-  let Inst{47-40} = !if(ps.has_data0, data0, 0);
-  let Inst{55-48} = !if(ps.has_data1, data1, 0);
-  let Inst{63-56} = !if(ps.has_vdst, vdst, 0);
+  let Inst{39-32} = !if(ps.has_addr, addr, !if(ps.has_gws_data0, data0{7-0}, 0));
+  let Inst{47-40} = !if(ps.has_data0, data0{7-0}, 0);
+  let Inst{55-48} = !if(ps.has_data1, data1{7-0}, 0);
+  let Inst{63-56} = !if(ps.has_vdst, vdst{7-0}, 0);
 }
 
 //===----------------------------------------------------------------------===//
@@ -1181,11 +1219,12 @@ class DS_Real_vi <bits<8> op, DS_Pseudo ds> :
   let Inst{15-8}  = !if(ds.has_offset1, offset1, 0);
   let Inst{16}    = !if(ds.has_gds, gds, ds.gdsValue);
   let Inst{24-17} = op;
+  let Inst{25}    = acc;
   let Inst{31-26} = 0x36; // ds prefix
-  let Inst{39-32} = !if(ds.has_addr, addr, !if(ds.has_gws_data0, data0, 0));
-  let Inst{47-40} = !if(ds.has_data0, data0, 0);
-  let Inst{55-48} = !if(ds.has_data1, data1, 0);
-  let Inst{63-56} = !if(ds.has_vdst, vdst, 0);
+  let Inst{39-32} = !if(ds.has_addr, addr, !if(ds.has_gws_data0, data0{7-0}, 0));
+  let Inst{47-40} = !if(ds.has_data0, data0{7-0}, 0);
+  let Inst{55-48} = !if(ds.has_data1, data1{7-0}, 0);
+  let Inst{63-56} = !if(ds.has_vdst, vdst{7-0}, 0);
 }
 
 def DS_ADD_U32_vi         : DS_Real_vi<0x0,  DS_ADD_U32>;
@@ -1348,3 +1387,8 @@ def DS_WRITE_B96_vi       : DS_Real_vi<0xde, DS_WRITE_B96>;
 def DS_WRITE_B128_vi      : DS_Real_vi<0xdf, DS_WRITE_B128>;
 def DS_READ_B96_vi        : DS_Real_vi<0xfe, DS_READ_B96>;
 def DS_READ_B128_vi       : DS_Real_vi<0xff, DS_READ_B128>;
+
+let SubtargetPredicate = isGFX90APlus in {
+  def DS_ADD_F64_vi     : DS_Real_vi<0x5c, DS_ADD_F64>;
+  def DS_ADD_RTN_F64_vi : DS_Real_vi<0x7c, DS_ADD_RTN_F64>;
+} // End SubtargetPredicate = isGFX90APlus

diff  --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
index 8061c6c509e0..4b535f81ecbf 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
@@ -123,6 +123,7 @@ DECODE_OPERAND_REG(VReg_96)
 DECODE_OPERAND_REG(VReg_128)
 DECODE_OPERAND_REG(VReg_256)
 DECODE_OPERAND_REG(VReg_512)
+DECODE_OPERAND_REG(VReg_1024)
 
 DECODE_OPERAND_REG(SReg_32)
 DECODE_OPERAND_REG(SReg_32_XM0_XEXEC)
@@ -135,7 +136,9 @@ DECODE_OPERAND_REG(SReg_256)
 DECODE_OPERAND_REG(SReg_512)
 
 DECODE_OPERAND_REG(AGPR_32)
+DECODE_OPERAND_REG(AReg_64)
 DECODE_OPERAND_REG(AReg_128)
+DECODE_OPERAND_REG(AReg_256)
 DECODE_OPERAND_REG(AReg_512)
 DECODE_OPERAND_REG(AReg_1024)
 DECODE_OPERAND_REG(AV_32)
@@ -157,6 +160,14 @@ static DecodeStatus decodeOperand_VSrcV216(MCInst &Inst,
   return addOperand(Inst, DAsm->decodeOperand_VSrcV216(Imm));
 }
 
+static DecodeStatus decodeOperand_VSrcV232(MCInst &Inst,
+                                           unsigned Imm,
+                                           uint64_t Addr,
+                                           const void *Decoder) {
+  auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder);
+  return addOperand(Inst, DAsm->decodeOperand_VSrcV232(Imm));
+}
+
 static DecodeStatus decodeOperand_VS_16(MCInst &Inst,
                                         unsigned Imm,
                                         uint64_t Addr,
@@ -173,6 +184,14 @@ static DecodeStatus decodeOperand_VS_32(MCInst &Inst,
   return addOperand(Inst, DAsm->decodeOperand_VS_32(Imm));
 }
 
+static DecodeStatus decodeOperand_AReg_64(MCInst &Inst,
+                                          unsigned Imm,
+                                          uint64_t Addr,
+                                          const void *Decoder) {
+  auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder);
+  return addOperand(Inst, DAsm->decodeSrcOp(AMDGPUDisassembler::OPW64, Imm | 512));
+}
+
 static DecodeStatus decodeOperand_AReg_128(MCInst &Inst,
                                            unsigned Imm,
                                            uint64_t Addr,
@@ -181,6 +200,14 @@ static DecodeStatus decodeOperand_AReg_128(MCInst &Inst,
   return addOperand(Inst, DAsm->decodeSrcOp(AMDGPUDisassembler::OPW128, Imm | 512));
 }
 
+static DecodeStatus decodeOperand_AReg_256(MCInst &Inst,
+                                           unsigned Imm,
+                                           uint64_t Addr,
+                                           const void *Decoder) {
+  auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder);
+  return addOperand(Inst, DAsm->decodeSrcOp(AMDGPUDisassembler::OPW256, Imm | 512));
+}
+
 static DecodeStatus decodeOperand_AReg_512(MCInst &Inst,
                                            unsigned Imm,
                                            uint64_t Addr,
@@ -197,6 +224,127 @@ static DecodeStatus decodeOperand_AReg_1024(MCInst &Inst,
   return addOperand(Inst, DAsm->decodeSrcOp(AMDGPUDisassembler::OPW1024, Imm | 512));
 }
 
+static DecodeStatus decodeOperand_VReg_64(MCInst &Inst,
+                                          unsigned Imm,
+                                          uint64_t Addr,
+                                          const void *Decoder) {
+  auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder);
+  return addOperand(Inst, DAsm->decodeSrcOp(AMDGPUDisassembler::OPW64, Imm));
+}
+
+static DecodeStatus decodeOperand_VReg_128(MCInst &Inst,
+                                           unsigned Imm,
+                                           uint64_t Addr,
+                                           const void *Decoder) {
+  auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder);
+  return addOperand(Inst, DAsm->decodeSrcOp(AMDGPUDisassembler::OPW128, Imm));
+}
+
+static DecodeStatus decodeOperand_VReg_256(MCInst &Inst,
+                                           unsigned Imm,
+                                           uint64_t Addr,
+                                           const void *Decoder) {
+  auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder);
+  return addOperand(Inst, DAsm->decodeSrcOp(AMDGPUDisassembler::OPW256, Imm));
+}
+
+static DecodeStatus decodeOperand_VReg_512(MCInst &Inst,
+                                           unsigned Imm,
+                                           uint64_t Addr,
+                                           const void *Decoder) {
+  auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder);
+  return addOperand(Inst, DAsm->decodeSrcOp(AMDGPUDisassembler::OPW512, Imm));
+}
+
+static DecodeStatus decodeOperand_VReg_1024(MCInst &Inst,
+                                            unsigned Imm,
+                                            uint64_t Addr,
+                                            const void *Decoder) {
+  auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder);
+  return addOperand(Inst, DAsm->decodeSrcOp(AMDGPUDisassembler::OPW1024, Imm));
+}
+
+static bool IsAGPROperand(const MCInst &Inst, int OpIdx,
+                          const MCRegisterInfo *MRI) {
+  if (OpIdx < 0)
+    return false;
+
+  const MCOperand &Op = Inst.getOperand(OpIdx);
+  if (!Op.isReg())
+    return false;
+
+  unsigned Sub = MRI->getSubReg(Op.getReg(), AMDGPU::sub0);
+  auto Reg = Sub ? Sub : Op.getReg();
+  return Reg >= AMDGPU::AGPR0 && Reg <= AMDGPU::AGPR255;
+}
+
+static DecodeStatus decodeOperand_AVLdSt_Any(MCInst &Inst,
+                                             unsigned Imm,
+                                             AMDGPUDisassembler::OpWidthTy Opw,
+                                             const void *Decoder) {
+  auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder);
+  if (!DAsm->isGFX90A()) {
+    Imm &= 511;
+  } else {
+    // If atomic has both vdata and vdst their register classes are tied.
+    // The bit is decoded along with the vdst, first operand. We need to
+    // change register class to AGPR if vdst was AGPR.
+    // If a DS instruction has both data0 and data1 their register classes
+    // are also tied.
+    unsigned Opc = Inst.getOpcode();
+    uint64_t TSFlags = DAsm->getMCII()->get(Opc).TSFlags;
+    uint16_t DataNameIdx = (TSFlags & SIInstrFlags::DS) ? AMDGPU::OpName::data0
+                                                        : AMDGPU::OpName::vdata;
+    const MCRegisterInfo *MRI = DAsm->getContext().getRegisterInfo();
+    int DataIdx = AMDGPU::getNamedOperandIdx(Opc, DataNameIdx);
+    if ((int)Inst.getNumOperands() == DataIdx) {
+      int DstIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
+      if (IsAGPROperand(Inst, DstIdx, MRI))
+        Imm |= 512;
+    }
+
+    if (TSFlags & SIInstrFlags::DS) {
+      int Data2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data1);
+      if ((int)Inst.getNumOperands() == Data2Idx &&
+          IsAGPROperand(Inst, DataIdx, MRI))
+        Imm |= 512;
+    }
+  }
+  return addOperand(Inst, DAsm->decodeSrcOp(Opw, Imm | 256));
+}
+
+static DecodeStatus DecodeAVLdSt_32RegisterClass(MCInst &Inst,
+                                                 unsigned Imm,
+                                                 uint64_t Addr,
+                                                 const void *Decoder) {
+  return decodeOperand_AVLdSt_Any(Inst, Imm,
+                                  AMDGPUDisassembler::OPW32, Decoder);
+}
+
+static DecodeStatus DecodeAVLdSt_64RegisterClass(MCInst &Inst,
+                                                 unsigned Imm,
+                                                 uint64_t Addr,
+                                                 const void *Decoder) {
+  return decodeOperand_AVLdSt_Any(Inst, Imm,
+                                  AMDGPUDisassembler::OPW64, Decoder);
+}
+
+static DecodeStatus DecodeAVLdSt_96RegisterClass(MCInst &Inst,
+                                                 unsigned Imm,
+                                                 uint64_t Addr,
+                                                 const void *Decoder) {
+  return decodeOperand_AVLdSt_Any(Inst, Imm,
+                                  AMDGPUDisassembler::OPW96, Decoder);
+}
+
+static DecodeStatus DecodeAVLdSt_128RegisterClass(MCInst &Inst,
+                                                  unsigned Imm,
+                                                  uint64_t Addr,
+                                                  const void *Decoder) {
+  return decodeOperand_AVLdSt_Any(Inst, Imm,
+                                  AMDGPUDisassembler::OPW128, Decoder);
+}
+
 static DecodeStatus decodeOperand_SReg_32(MCInst &Inst,
                                           unsigned Imm,
                                           uint64_t Addr,
@@ -341,6 +489,12 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
     Res = tryDecodeInst(DecoderTableGFX932, MI, DW, Address);
     if (Res) break;
 
+    if (STI.getFeatureBits()[AMDGPU::FeatureGFX90AInsts]) {
+      Res = tryDecodeInst(DecoderTableGFX90A32, MI, DW, Address);
+      if (Res)
+        break;
+    }
+
     if (STI.getFeatureBits()[AMDGPU::FeatureGFX10_BEncoding]) {
       Res = tryDecodeInst(DecoderTableGFX10_B32, MI, DW, Address);
       if (Res) break;
@@ -351,6 +505,13 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
 
     if (Bytes.size() < 4) break;
     const uint64_t QW = ((uint64_t)eatBytes<uint32_t>(Bytes) << 32) | DW;
+
+    if (STI.getFeatureBits()[AMDGPU::FeatureGFX90AInsts]) {
+      Res = tryDecodeInst(DecoderTableGFX90A64, MI, QW, Address);
+      if (Res)
+        break;
+    }
+
     Res = tryDecodeInst(DecoderTableGFX864, MI, QW, Address);
     if (Res) break;
 
@@ -369,6 +530,7 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
               MI.getOpcode() == AMDGPU::V_MAC_LEGACY_F32_e64_gfx6_gfx7 ||
               MI.getOpcode() == AMDGPU::V_MAC_LEGACY_F32_e64_gfx10 ||
               MI.getOpcode() == AMDGPU::V_MAC_F16_e64_vi ||
+              MI.getOpcode() == AMDGPU::V_FMAC_F64_e64_gfx90a ||
               MI.getOpcode() == AMDGPU::V_FMAC_F32_e64_vi ||
               MI.getOpcode() == AMDGPU::V_FMAC_F32_e64_gfx10 ||
               MI.getOpcode() == AMDGPU::V_FMAC_LEGACY_F32_e64_gfx10 ||
@@ -384,6 +546,44 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
     insertNamedMCOperand(MI, MCOperand::createImm(1), AMDGPU::OpName::glc1);
   }
 
+  if (Res && (MCII->get(MI.getOpcode()).TSFlags &
+              (SIInstrFlags::MTBUF | SIInstrFlags::MUBUF)) &&
+             (STI.getFeatureBits()[AMDGPU::FeatureGFX90AInsts])) {
+    // GFX90A lost TFE, its place is occupied by ACC.
+    int TFEOpIdx =
+        AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::tfe);
+    if (TFEOpIdx != -1) {
+      auto TFEIter = MI.begin();
+      std::advance(TFEIter, TFEOpIdx);
+      MI.insert(TFEIter, MCOperand::createImm(0));
+    }
+  }
+
+  if (Res && (MCII->get(MI.getOpcode()).TSFlags &
+              (SIInstrFlags::FLAT |
+               SIInstrFlags::MTBUF | SIInstrFlags::MUBUF))) {
+    if (!isGFX10()) {
+      int DLCOpIdx =
+          AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::dlc);
+      if (DLCOpIdx != -1) {
+        auto DLCIter = MI.begin();
+        std::advance(DLCIter, DLCOpIdx);
+        MI.insert(DLCIter, MCOperand::createImm(0));
+      }
+    }
+  }
+
+  if (Res && (MCII->get(MI.getOpcode()).TSFlags &
+              (SIInstrFlags::MTBUF | SIInstrFlags::MUBUF))) {
+    int SWZOpIdx =
+        AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::swz);
+    if (SWZOpIdx != -1) {
+      auto SWZIter = MI.begin();
+      std::advance(SWZIter, SWZOpIdx);
+      MI.insert(SWZIter, MCOperand::createImm(0));
+    }
+  }
+
   if (Res && (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::MIMG)) {
     int VAddr0Idx =
         AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vaddr0);
@@ -545,7 +745,7 @@ DecodeStatus AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const {
     DstSize = (DstSize + 1) / 2;
   }
 
-  if (MI.getOperand(TFEIdx).getImm())
+  if (TFEIdx != -1 && MI.getOperand(TFEIdx).getImm())
     DstSize += 1;
 
   if (DstSize == Info->VDataDwords && AddrSize == Info->VAddrDwords)
@@ -701,6 +901,10 @@ MCOperand AMDGPUDisassembler::decodeOperand_VSrcV216(unsigned Val) const {
   return decodeSrcOp(OPWV216, Val);
 }
 
+MCOperand AMDGPUDisassembler::decodeOperand_VSrcV232(unsigned Val) const {
+  return decodeSrcOp(OPWV232, Val);
+}
+
 MCOperand AMDGPUDisassembler::decodeOperand_VGPR_32(unsigned Val) const {
   // Some instructions have operand restrictions beyond what the encoding
   // allows. Some ordinarily VSrc_32 operands are VGPR_32, so clear the extra
@@ -718,10 +922,18 @@ MCOperand AMDGPUDisassembler::decodeOperand_AGPR_32(unsigned Val) const {
   return createRegOperand(AMDGPU::AGPR_32RegClassID, Val & 255);
 }
 
+MCOperand AMDGPUDisassembler::decodeOperand_AReg_64(unsigned Val) const {
+  return createRegOperand(AMDGPU::AReg_64RegClassID, Val & 255);
+}
+
 MCOperand AMDGPUDisassembler::decodeOperand_AReg_128(unsigned Val) const {
   return createRegOperand(AMDGPU::AReg_128RegClassID, Val & 255);
 }
 
+MCOperand AMDGPUDisassembler::decodeOperand_AReg_256(unsigned Val) const {
+  return createRegOperand(AMDGPU::AReg_256RegClassID, Val & 255);
+}
+
 MCOperand AMDGPUDisassembler::decodeOperand_AReg_512(unsigned Val) const {
   return createRegOperand(AMDGPU::AReg_512RegClassID, Val & 255);
 }
@@ -758,6 +970,10 @@ MCOperand AMDGPUDisassembler::decodeOperand_VReg_512(unsigned Val) const {
   return createRegOperand(AMDGPU::VReg_512RegClassID, Val);
 }
 
+MCOperand AMDGPUDisassembler::decodeOperand_VReg_1024(unsigned Val) const {
+  return createRegOperand(AMDGPU::VReg_1024RegClassID, Val);
+}
+
 MCOperand AMDGPUDisassembler::decodeOperand_SReg_32(unsigned Val) const {
   // table-gen generated disassembler doesn't care about operand types
   // leaving only registry class so SSrc_32 operand turns into SReg_32
@@ -914,8 +1130,10 @@ MCOperand AMDGPUDisassembler::decodeFPImmed(OpWidthTy Width, unsigned Imm) {
   case OPW128: // splat constants
   case OPW512:
   case OPW1024:
+  case OPWV232:
     return MCOperand::createImm(getInlineImmVal32(Imm));
   case OPW64:
+  case OPW256:
     return MCOperand::createImm(getInlineImmVal64(Imm));
   case OPW16:
   case OPWV216:
@@ -935,8 +1153,14 @@ unsigned AMDGPUDisassembler::getVgprClassId(const OpWidthTy Width) const {
   case OPW16:
   case OPWV216:
     return VGPR_32RegClassID;
-  case OPW64: return VReg_64RegClassID;
+  case OPW64:
+  case OPWV232: return VReg_64RegClassID;
+  case OPW96: return VReg_96RegClassID;
   case OPW128: return VReg_128RegClassID;
+  case OPW160: return VReg_160RegClassID;
+  case OPW256: return VReg_256RegClassID;
+  case OPW512: return VReg_512RegClassID;
+  case OPW1024: return VReg_1024RegClassID;
   }
 }
 
@@ -950,8 +1174,11 @@ unsigned AMDGPUDisassembler::getAgprClassId(const OpWidthTy Width) const {
   case OPW16:
   case OPWV216:
     return AGPR_32RegClassID;
-  case OPW64: return AReg_64RegClassID;
+  case OPW64:
+  case OPWV232: return AReg_64RegClassID;
+  case OPW96: return AReg_96RegClassID;
   case OPW128: return AReg_128RegClassID;
+  case OPW160: return AReg_160RegClassID;
   case OPW256: return AReg_256RegClassID;
   case OPW512: return AReg_512RegClassID;
   case OPW1024: return AReg_1024RegClassID;
@@ -969,8 +1196,11 @@ unsigned AMDGPUDisassembler::getSgprClassId(const OpWidthTy Width) const {
   case OPW16:
   case OPWV216:
     return SGPR_32RegClassID;
-  case OPW64: return SGPR_64RegClassID;
+  case OPW64:
+  case OPWV232: return SGPR_64RegClassID;
+  case OPW96: return SGPR_96RegClassID;
   case OPW128: return SGPR_128RegClassID;
+  case OPW160: return SGPR_160RegClassID;
   case OPW256: return SGPR_256RegClassID;
   case OPW512: return SGPR_512RegClassID;
   }
@@ -986,7 +1216,8 @@ unsigned AMDGPUDisassembler::getTtmpClassId(const OpWidthTy Width) const {
   case OPW16:
   case OPWV216:
     return TTMP_32RegClassID;
-  case OPW64: return TTMP_64RegClassID;
+  case OPW64:
+  case OPWV232: return TTMP_64RegClassID;
   case OPW128: return TTMP_128RegClassID;
   case OPW256: return TTMP_256RegClassID;
   case OPW512: return TTMP_512RegClassID;
@@ -1040,6 +1271,7 @@ MCOperand AMDGPUDisassembler::decodeSrcOp(const OpWidthTy Width, unsigned Val) c
   case OPWV216:
     return decodeSpecialReg32(Val);
   case OPW64:
+  case OPWV232:
     return decodeSpecialReg64(Val);
   default:
     llvm_unreachable("unexpected immediate type");
@@ -1209,6 +1441,10 @@ bool AMDGPUDisassembler::isVI() const {
 
 bool AMDGPUDisassembler::isGFX9() const { return AMDGPU::isGFX9(STI); }
 
+bool AMDGPUDisassembler::isGFX90A() const {
+  return STI.getFeatureBits()[AMDGPU::FeatureGFX90AInsts];
+}
+
 bool AMDGPUDisassembler::isGFX9Plus() const { return AMDGPU::isGFX9Plus(STI); }
 
 bool AMDGPUDisassembler::isGFX10() const { return AMDGPU::isGFX10(STI); }

diff  --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
index 714dabbc5184..7c6d3afbe788 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
@@ -99,12 +99,14 @@ class AMDGPUDisassembler : public MCDisassembler {
   MCOperand decodeOperand_VS_128(unsigned Val) const;
   MCOperand decodeOperand_VSrc16(unsigned Val) const;
   MCOperand decodeOperand_VSrcV216(unsigned Val) const;
+  MCOperand decodeOperand_VSrcV232(unsigned Val) const;
 
   MCOperand decodeOperand_VReg_64(unsigned Val) const;
   MCOperand decodeOperand_VReg_96(unsigned Val) const;
   MCOperand decodeOperand_VReg_128(unsigned Val) const;
   MCOperand decodeOperand_VReg_256(unsigned Val) const;
   MCOperand decodeOperand_VReg_512(unsigned Val) const;
+  MCOperand decodeOperand_VReg_1024(unsigned Val) const;
 
   MCOperand decodeOperand_SReg_32(unsigned Val) const;
   MCOperand decodeOperand_SReg_32_XM0_XEXEC(unsigned Val) const;
@@ -117,7 +119,9 @@ class AMDGPUDisassembler : public MCDisassembler {
   MCOperand decodeOperand_SReg_512(unsigned Val) const;
 
   MCOperand decodeOperand_AGPR_32(unsigned Val) const;
+  MCOperand decodeOperand_AReg_64(unsigned Val) const;
   MCOperand decodeOperand_AReg_128(unsigned Val) const;
+  MCOperand decodeOperand_AReg_256(unsigned Val) const;
   MCOperand decodeOperand_AReg_512(unsigned Val) const;
   MCOperand decodeOperand_AReg_1024(unsigned Val) const;
   MCOperand decodeOperand_AV_32(unsigned Val) const;
@@ -126,12 +130,15 @@ class AMDGPUDisassembler : public MCDisassembler {
   enum OpWidthTy {
     OPW32,
     OPW64,
+    OPW96,
     OPW128,
+    OPW160,
     OPW256,
     OPW512,
     OPW1024,
     OPW16,
     OPWV216,
+    OPWV232,
     OPW_LAST_,
     OPW_FIRST_ = OPW32
   };
@@ -159,8 +166,11 @@ class AMDGPUDisassembler : public MCDisassembler {
 
   int getTTmpIdx(unsigned Val) const;
 
+  const MCInstrInfo *getMCII() const { return MCII.get(); }
+
   bool isVI() const;
   bool isGFX9() const;
+  bool isGFX90A() const;
   bool isGFX9Plus() const;
   bool isGFX10() const;
   bool isGFX10Plus() const;

diff  --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td
index 7a9c36e72d74..ee43d12865ef 100644
--- a/llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -54,6 +54,8 @@ class FLAT_Pseudo<string opName, dag outs, dag ins,
   bits<1> glcValue = 0;
   bits<1> has_dlc  = 1;
   bits<1> dlcValue = 0;
+  bits<1> has_sccb  = 1;
+  bits<1> sccbValue = 0;
 
   let SubtargetPredicate = !if(is_flat_global, HasFlatGlobalInsts,
     !if(is_flat_scratch, HasFlatScratchInsts, HasFlatAddressSpace));
@@ -88,14 +90,17 @@ class FLAT_Real <bits<7> op, FLAT_Pseudo ps> :
 
   // encoding fields
   bits<8> vaddr;
-  bits<8> vdata;
+  bits<10> vdata;
   bits<7> saddr;
-  bits<8> vdst;
+  bits<10> vdst;
 
   bits<1> slc;
   bits<1> glc;
   bits<1> dlc;
 
+  // Only valid on gfx90a+
+  bits<1> sccb;
+
   // Only valid on gfx9
   bits<1> lds = 0; // XXX - What does this actually do?
 
@@ -106,7 +111,8 @@ class FLAT_Real <bits<7> op, FLAT_Pseudo ps> :
   // Signed offset. Highest bit ignored for flat and treated as 12-bit
   // unsigned for flat accesses.
   bits<13> offset;
-  bits<1> nv = 0; // XXX - What does this actually do?
+  // GFX90A+ only: instruction uses AccVGPR for data
+  bits<1> acc = !if(ps.has_vdst, vdst{9}, !if(ps.has_data, vdata{9}, 0));
 
   // We don't use tfe right now, and it was removed in gfx9.
   bits<1> tfe = 0;
@@ -121,12 +127,12 @@ class FLAT_Real <bits<7> op, FLAT_Pseudo ps> :
   let Inst{24-18} = op;
   let Inst{31-26} = 0x37; // Encoding.
   let Inst{39-32} = !if(ps.has_vaddr, vaddr, ?);
-  let Inst{47-40} = !if(ps.has_data, vdata, ?);
+  let Inst{47-40} = !if(ps.has_data, vdata{7-0}, ?);
   let Inst{54-48} = !if(ps.has_saddr, !if(ps.enabled_saddr, saddr, 0x7f), 0);
 
   // 54-48 is reserved.
-  let Inst{55}    = nv; // nv on GFX9+, TFE before.
-  let Inst{63-56} = !if(ps.has_vdst, vdst, ?);
+  let Inst{55}    = acc; // nv on GFX9+, TFE before. AccVGPR for data on GFX90A.
+  let Inst{63-56} = !if(ps.has_vdst, vdst{7-0}, ?);
 }
 
 class GlobalSaddrTable <bit is_saddr, string Name = ""> {
@@ -139,9 +145,10 @@ class GlobalSaddrTable <bit is_saddr, string Name = ""> {
 // saddr is 32-bit (which isn't handled here yet).
 class FLAT_Load_Pseudo <string opName, RegisterClass regClass,
   bit HasTiedOutput = 0,
-  bit HasSaddr = 0, bit EnableSaddr = 0> : FLAT_Pseudo<
+  bit HasSaddr = 0, bit EnableSaddr = 0,
+  RegisterOperand vdata_op = getLdStRegisterOperand<regClass>.ret> : FLAT_Pseudo<
   opName,
-  (outs regClass:$vdst),
+  (outs vdata_op:$vdst),
   !con(
     !con(
       !if(EnableSaddr,
@@ -149,9 +156,9 @@ class FLAT_Load_Pseudo <string opName, RegisterClass regClass,
         (ins VReg_64:$vaddr)),
         (ins flat_offset:$offset)),
         // FIXME: Operands with default values do not work with following non-optional operands.
-        !if(HasTiedOutput, (ins GLC:$glc, SLC:$slc, DLC:$dlc, regClass:$vdst_in),
-                           (ins GLC_0:$glc, SLC_0:$slc, DLC_0:$dlc))),
-  " $vdst, $vaddr"#!if(HasSaddr, !if(EnableSaddr, ", $saddr", ", off"), "")#"$offset$glc$slc$dlc"> {
+        !if(HasTiedOutput, (ins GLC:$glc, SLC:$slc, DLC:$dlc, SCCB:$sccb, vdata_op:$vdst_in),
+                           (ins GLC_0:$glc, SLC_0:$slc, DLC_0:$dlc, SCCB_0:$sccb))),
+  " $vdst, $vaddr"#!if(HasSaddr, !if(EnableSaddr, ", $saddr", ", off"), "")#"$offset$glc$slc$dlc$sccb"> {
   let has_data = 0;
   let mayLoad = 1;
   let has_saddr = HasSaddr;
@@ -169,10 +176,10 @@ class FLAT_Store_Pseudo <string opName, RegisterClass vdataClass,
   (outs),
   !con(
     !if(EnableSaddr,
-      (ins VGPR_32:$vaddr, vdataClass:$vdata, SReg_64:$saddr),
-      (ins VReg_64:$vaddr, vdataClass:$vdata)),
-      (ins flat_offset:$offset, GLC_0:$glc, SLC_0:$slc, DLC_0:$dlc)),
-  " $vaddr, $vdata"#!if(HasSaddr, !if(EnableSaddr, ", $saddr", ", off"), "")#"$offset$glc$slc$dlc"> {
+      (ins VGPR_32:$vaddr, getLdStRegisterOperand<vdataClass>.ret:$vdata, SReg_64:$saddr),
+      (ins VReg_64:$vaddr, getLdStRegisterOperand<vdataClass>.ret:$vdata)),
+      (ins flat_offset:$offset, GLC_0:$glc, SLC_0:$slc, DLC_0:$dlc, SCCB_0:$sccb)),
+  " $vaddr, $vdata"#!if(HasSaddr, !if(EnableSaddr, ", $saddr", ", off"), "")#"$offset$glc$slc$dlc$sccb"> {
   let mayLoad  = 0;
   let mayStore = 1;
   let has_vdst = 0;
@@ -196,9 +203,9 @@ class FLAT_Global_Load_AddTid_Pseudo <string opName, RegisterClass regClass,
   opName,
   (outs regClass:$vdst),
   !con(!if(EnableSaddr, (ins SReg_64:$saddr), (ins)),
-    (ins flat_offset:$offset, GLC_0:$glc, SLC_0:$slc, DLC_0:$dlc),
+    (ins flat_offset:$offset, GLC_0:$glc, SLC_0:$slc, DLC_0:$dlc, SCCB_0:$sccb),
     !if(HasTiedOutput, (ins regClass:$vdst_in), (ins))),
-  " $vdst, "#!if(EnableSaddr, "$saddr", "off")#"$offset$glc$slc$dlc"> {
+  " $vdst, "#!if(EnableSaddr, "$saddr", "off")#"$offset$glc$slc$dlc$sccb"> {
   let is_flat_global = 1;
   let has_data = 0;
   let mayLoad = 1;
@@ -234,8 +241,8 @@ class FLAT_Global_Store_AddTid_Pseudo <string opName, RegisterClass vdataClass,
   opName,
   (outs),
   !con(!if(EnableSaddr, (ins vdataClass:$vdata, SReg_64:$saddr), (ins vdataClass:$vdata)),
-    (ins flat_offset:$offset, GLC:$glc, SLC:$slc, DLC:$dlc)),
-  " $vdata, "#!if(EnableSaddr, "$saddr", "off")#"$offset$glc$slc$dlc"> {
+    (ins flat_offset:$offset, GLC:$glc, SLC:$slc, DLC:$dlc, SCCB_0:$sccb)),
+  " $vdata, "#!if(EnableSaddr, "$saddr", "off")#"$offset$glc$slc$dlc$sccb"> {
   let is_flat_global = 1;
   let mayLoad  = 0;
   let mayStore = 1;
@@ -266,16 +273,16 @@ class FLAT_Scratch_Load_Pseudo <string opName, RegisterClass regClass,
   bit EnableVaddr = !not(EnableSaddr)>
   : FLAT_Pseudo<
   opName,
-  (outs regClass:$vdst),
+  (outs getLdStRegisterOperand<regClass>.ret:$vdst),
   !con(
      !if(EnableSaddr,
        (ins SReg_32_XEXEC_HI:$saddr, flat_offset:$offset),
        !if(EnableVaddr,
          (ins VGPR_32:$vaddr, flat_offset:$offset),
          (ins flat_offset:$offset))),
-     !if(HasTiedOutput, (ins GLC:$glc, SLC:$slc, DLC:$dlc, regClass:$vdst_in),
-                        (ins GLC_0:$glc, SLC_0:$slc, DLC_0:$dlc))),
-  " $vdst, "#!if(EnableVaddr, "$vaddr, ", "off, ")#!if(EnableSaddr, "$saddr", "off")#"$offset$glc$slc$dlc"> {
+     !if(HasTiedOutput, (ins GLC:$glc, SLC:$slc, DLC:$dlc, SCCB:$sccb, getLdStRegisterOperand<regClass>.ret:$vdst_in),
+                        (ins GLC_0:$glc, SLC_0:$slc, DLC_0:$dlc, SCCB_0:$sccb))),
+  " $vdst, "#!if(EnableVaddr, "$vaddr, ", "off, ")#!if(EnableSaddr, "$saddr", "off")#"$offset$glc$slc$dlc$sccb"> {
   let has_data = 0;
   let mayLoad = 1;
   let has_saddr = 1;
@@ -289,15 +296,16 @@ class FLAT_Scratch_Load_Pseudo <string opName, RegisterClass regClass,
 }
 
 class FLAT_Scratch_Store_Pseudo <string opName, RegisterClass vdataClass, bit EnableSaddr = 0,
-  bit EnableVaddr = !not(EnableSaddr)> : FLAT_Pseudo<
+  bit EnableVaddr = !not(EnableSaddr),
+  RegisterOperand vdata_op = getLdStRegisterOperand<vdataClass>.ret> : FLAT_Pseudo<
   opName,
   (outs),
   !if(EnableSaddr,
-    (ins vdataClass:$vdata, SReg_32_XEXEC_HI:$saddr, flat_offset:$offset, GLC_0:$glc, SLC_0:$slc, DLC_0:$dlc),
+    (ins vdata_op:$vdata, SReg_32_XEXEC_HI:$saddr, flat_offset:$offset, GLC_0:$glc, SLC_0:$slc, DLC_0:$dlc, SCCB_0:$sccb),
     !if(EnableVaddr,
-      (ins vdataClass:$vdata, VGPR_32:$vaddr, flat_offset:$offset, GLC_0:$glc, SLC_0:$slc, DLC_0:$dlc),
-      (ins vdataClass:$vdata, flat_offset:$offset, GLC_0:$glc, SLC_0:$slc, DLC_0:$dlc))),
-  " "#!if(EnableVaddr, "$vaddr", "off")#", $vdata, "#!if(EnableSaddr, "$saddr", "off")#"$offset$glc$slc$dlc"> {
+      (ins vdata_op:$vdata, VGPR_32:$vaddr, flat_offset:$offset, GLC_0:$glc, SLC_0:$slc, DLC_0:$dlc, SCCB_0:$sccb),
+      (ins vdata_op:$vdata, flat_offset:$offset, GLC_0:$glc, SLC_0:$slc, DLC_0:$dlc, SCCB_0:$sccb))),
+  " "#!if(EnableVaddr, "$vaddr", "off")#", $vdata, "#!if(EnableSaddr, "$saddr", "off")#"$offset$glc$slc$dlc$sccb"> {
   let mayLoad  = 0;
   let mayStore = 1;
   let has_vdst = 0;
@@ -344,6 +352,8 @@ class FLAT_AtomicNoRet_Pseudo<string opName, dag outs, dag ins,
     let has_dlc  = 0;
     let dlcValue = 0;
     let has_vdst = 0;
+    let has_sccb  = 1;
+    let sccbValue = 0;
     let maybeAtomic = 1;
     let IsAtomicNoRet = 1;
 }
@@ -355,6 +365,7 @@ class FLAT_AtomicRet_Pseudo<string opName, dag outs, dag ins,
   let has_vdst = 1;
   let glcValue = 1;
   let dlcValue = 0;
+  let sccbValue = 0;
   let IsAtomicNoRet = 0;
   let IsAtomicRet = 1;
   let PseudoInstr = NAME # "_RTN";
@@ -367,11 +378,12 @@ multiclass FLAT_Atomic_Pseudo<
   SDPatternOperator atomic = null_frag,
   ValueType data_vt = vt,
   RegisterClass data_rc = vdst_rc,
-  bit isFP = isFloatType<data_vt>.ret> {
+  bit isFP = isFloatType<data_vt>.ret,
+  RegisterOperand data_op = getLdStRegisterOperand<data_rc>.ret> {
   def "" : FLAT_AtomicNoRet_Pseudo <opName,
     (outs),
-    (ins VReg_64:$vaddr, data_rc:$vdata, flat_offset:$offset, SLC_0:$slc),
-    " $vaddr, $vdata$offset$slc">,
+    (ins VReg_64:$vaddr, data_op:$vdata, flat_offset:$offset, SLC_0:$slc, SCCB_0:$sccb),
+    " $vaddr, $vdata$offset$slc$sccb">,
     GlobalSaddrTable<0, opName>,
     AtomicNoRet <opName, 0> {
     let PseudoInstr = NAME;
@@ -380,9 +392,9 @@ multiclass FLAT_Atomic_Pseudo<
   }
 
   def _RTN : FLAT_AtomicRet_Pseudo <opName,
-    (outs vdst_rc:$vdst),
-    (ins VReg_64:$vaddr, data_rc:$vdata, flat_offset:$offset, GLC_1:$glc1, SLC_0:$slc),
-    " $vdst, $vaddr, $vdata$offset$glc1$slc",
+    (outs getLdStRegisterOperand<vdst_rc>.ret:$vdst),
+    (ins VReg_64:$vaddr, data_op:$vdata, flat_offset:$offset, GLC_1:$glc1, SLC_0:$slc, SCCB_0:$sccb),
+    " $vdst, $vaddr, $vdata$offset$glc1$slc$sccb",
     [(set vt:$vdst,
       (atomic (FLATOffset i64:$vaddr, i16:$offset), data_vt:$vdata))]>,
        GlobalSaddrTable<0, opName#"_rtn">,
@@ -399,12 +411,13 @@ multiclass FLAT_Global_Atomic_Pseudo_NO_RTN<
   SDPatternOperator atomic = null_frag,
   ValueType data_vt = vt,
   RegisterClass data_rc = vdst_rc,
-  bit isFP = isFloatType<data_vt>.ret> {
+  bit isFP = isFloatType<data_vt>.ret,
+  RegisterOperand data_op = getLdStRegisterOperand<data_rc>.ret> {
 
   def "" : FLAT_AtomicNoRet_Pseudo <opName,
     (outs),
-    (ins VReg_64:$vaddr, data_rc:$vdata, flat_offset:$offset, SLC_0:$slc),
-    " $vaddr, $vdata, off$offset$slc">,
+    (ins VReg_64:$vaddr, data_op:$vdata, flat_offset:$offset, SLC_0:$slc, SCCB_0:$sccb),
+    " $vaddr, $vdata, off$offset$slc$sccb">,
     GlobalSaddrTable<0, opName>,
     AtomicNoRet <opName, 0> {
     let has_saddr = 1;
@@ -414,8 +427,8 @@ multiclass FLAT_Global_Atomic_Pseudo_NO_RTN<
 
   def _SADDR : FLAT_AtomicNoRet_Pseudo <opName,
     (outs),
-    (ins VGPR_32:$vaddr, data_rc:$vdata, SReg_64:$saddr, flat_offset:$offset, SLC_0:$slc),
-    " $vaddr, $vdata, $saddr$offset$slc">,
+    (ins VGPR_32:$vaddr, data_op:$vdata, SReg_64:$saddr, flat_offset:$offset, SLC_0:$slc, SCCB_0:$sccb),
+    " $vaddr, $vdata, $saddr$offset$slc$sccb">,
     GlobalSaddrTable<1, opName>,
     AtomicNoRet <opName#"_saddr", 0> {
     let has_saddr = 1;
@@ -432,12 +445,14 @@ multiclass FLAT_Global_Atomic_Pseudo_RTN<
   SDPatternOperator atomic = null_frag,
   ValueType data_vt = vt,
   RegisterClass data_rc = vdst_rc,
-  bit isFP = isFloatType<data_vt>.ret> {
+  bit isFP = isFloatType<data_vt>.ret,
+  RegisterOperand data_op = getLdStRegisterOperand<data_rc>.ret,
+  RegisterOperand vdst_op = getLdStRegisterOperand<vdst_rc>.ret> {
 
   def _RTN : FLAT_AtomicRet_Pseudo <opName,
-    (outs vdst_rc:$vdst),
-      (ins VReg_64:$vaddr, data_rc:$vdata, flat_offset:$offset, GLC_1:$glc1, SLC_0:$slc),
-    " $vdst, $vaddr, $vdata, off$offset$glc1$slc",
+    (outs vdst_op:$vdst),
+      (ins VReg_64:$vaddr, data_op:$vdata, flat_offset:$offset, GLC_1:$glc1, SLC_0:$slc, SCCB_0:$sccb),
+    " $vdst, $vaddr, $vdata, off$offset$glc1$slc$sccb",
     [(set vt:$vdst,
       (atomic (FLATOffsetSigned i64:$vaddr, i16:$offset), data_vt:$vdata))]>,
       GlobalSaddrTable<0, opName#"_rtn">,
@@ -447,9 +462,9 @@ multiclass FLAT_Global_Atomic_Pseudo_RTN<
   }
 
   def _SADDR_RTN : FLAT_AtomicRet_Pseudo <opName,
-    (outs vdst_rc:$vdst),
-      (ins VGPR_32:$vaddr, data_rc:$vdata, SReg_64:$saddr, flat_offset:$offset, GLC_1:$glc1, SLC_0:$slc),
-    " $vdst, $vaddr, $vdata, $saddr$offset$glc1$slc">,
+    (outs vdst_op:$vdst),
+      (ins VGPR_32:$vaddr, data_op:$vdata, SReg_64:$saddr, flat_offset:$offset, GLC_1:$glc1, SLC_0:$slc, SCCB_0:$sccb),
+    " $vdst, $vaddr, $vdata, $saddr$offset$glc1$slc$sccb">,
     GlobalSaddrTable<1, opName#"_rtn">,
     AtomicNoRet <opName#"_saddr", 1> {
      let has_saddr = 1;
@@ -608,6 +623,15 @@ defm FLAT_ATOMIC_FMAX_X2     : FLAT_Atomic_Pseudo <"flat_atomic_fmax_x2",
 
 } // End SubtargetPredicate = isGFX7GFX10
 
+let SubtargetPredicate = isGFX90APlus in {
+  defm FLAT_ATOMIC_ADD_F64   : FLAT_Atomic_Pseudo<"flat_atomic_add_f64", VReg_64, f64, int_amdgcn_flat_atomic_fadd>;
+  defm FLAT_ATOMIC_MIN_F64   : FLAT_Atomic_Pseudo<"flat_atomic_min_f64", VReg_64, f64, int_amdgcn_flat_atomic_fmin>;
+  defm FLAT_ATOMIC_MAX_F64   : FLAT_Atomic_Pseudo<"flat_atomic_max_f64", VReg_64, f64, int_amdgcn_flat_atomic_fmax>;
+  defm GLOBAL_ATOMIC_ADD_F64 : FLAT_Global_Atomic_Pseudo<"global_atomic_add_f64", VReg_64, f64, int_amdgcn_global_atomic_fadd>;
+  defm GLOBAL_ATOMIC_MIN_F64 : FLAT_Global_Atomic_Pseudo<"global_atomic_min_f64", VReg_64, f64, int_amdgcn_global_atomic_fmin>;
+  defm GLOBAL_ATOMIC_MAX_F64 : FLAT_Global_Atomic_Pseudo<"global_atomic_max_f64", VReg_64, f64, int_amdgcn_global_atomic_fmax>;
+} // End SubtargetPredicate = isGFX90APlus
+
 defm GLOBAL_LOAD_UBYTE    : FLAT_Global_Load_Pseudo <"global_load_ubyte", VGPR_32>;
 defm GLOBAL_LOAD_SBYTE    : FLAT_Global_Load_Pseudo <"global_load_sbyte", VGPR_32>;
 defm GLOBAL_LOAD_USHORT   : FLAT_Global_Load_Pseudo <"global_load_ushort", VGPR_32>;
@@ -780,6 +804,15 @@ let OtherPredicates = [HasAtomicFaddInsts] in {
     "global_atomic_pk_add_f16", VGPR_32, v2f16
   >;
 } // End OtherPredicates = [HasAtomicFaddInsts]
+
+let OtherPredicates = [isGFX90APlus] in {
+  defm GLOBAL_ATOMIC_ADD_F32 : FLAT_Global_Atomic_Pseudo_RTN <
+    "global_atomic_add_f32", VGPR_32, f32, int_amdgcn_global_atomic_fadd
+  >;
+  defm GLOBAL_ATOMIC_PK_ADD_F16 : FLAT_Global_Atomic_Pseudo_RTN <
+    "global_atomic_pk_add_f16", VGPR_32, v2f16, int_amdgcn_global_atomic_fadd
+  >;
+} // End OtherPredicates = [isGFX90APlus]
 } // End is_flat_global = 1
 
 //===----------------------------------------------------------------------===//
@@ -794,17 +827,17 @@ class FlatLoadPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCN
 
 class FlatLoadPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
   (node (FLATOffset (i64 VReg_64:$vaddr), i16:$offset), vt:$in),
-  (inst $vaddr, $offset, 0, 0, 0, $in)
+  (inst $vaddr, $offset, 0, 0, 0, 0, $in)
 >;
 
 class FlatSignedLoadPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
   (node (FLATOffsetSigned (i64 VReg_64:$vaddr), i16:$offset), vt:$in),
-  (inst $vaddr, $offset, 0, 0, 0, $in)
+  (inst $vaddr, $offset, 0, 0, 0, 0, $in)
 >;
 
 class GlobalLoadSaddrPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
   (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i16:$offset), vt:$in)),
-  (inst $saddr, $voffset, $offset, 0, 0, 0, $in)
+  (inst $saddr, $voffset, $offset, 0, 0, 0, 0, $in)
 >;
 
 class FlatLoadSignedPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
@@ -895,7 +928,7 @@ class ScratchLoadSignedPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType
 
 class ScratchLoadSignedPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
   (node (ScratchOffset (i32 VGPR_32:$vaddr), i16:$offset), vt:$in),
-  (inst $vaddr, $offset, 0, 0, 0, $in)
+  (inst $vaddr, $offset, 0, 0, 0, 0, $in)
 >;
 
 class ScratchStoreSignedPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
@@ -910,7 +943,7 @@ class ScratchLoadSaddrPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType v
 
 class ScratchLoadSaddrPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
   (vt (node (ScratchSAddr (i32 SGPR_32:$saddr), i16:$offset), vt:$in)),
-  (inst $saddr, $offset, 0, 0, 0, $in)
+  (inst $saddr, $offset, 0, 0, 0, 0, $in)
 >;
 
 class ScratchStoreSaddrPat <FLAT_Pseudo inst, SDPatternOperator node,
@@ -1205,6 +1238,17 @@ defm : GlobalFLATNoRtnAtomicPats <GLOBAL_ATOMIC_ADD_F32,    atomic_load_fadd_glo
 defm : GlobalFLATNoRtnAtomicPats <GLOBAL_ATOMIC_PK_ADD_F16, atomic_load_fadd_v2f16_global_noret_32, v2f16>;
 }
 
+let OtherPredicates = [isGFX90APlus] in {
+defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_ADD_F32",    atomic_load_fadd_global_32,       f32>;
+defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_PK_ADD_F16", atomic_load_fadd_v2f16_global_32, v2f16>;
+defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_ADD_F64",    atomic_load_fadd_global_64,       f64>;
+defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_MIN_F64",    atomic_load_fmin_global_64,       f64>;
+defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_MAX_F64",    atomic_load_fmax_global_64,       f64>;
+def  : FlatSignedAtomicPat  <FLAT_ATOMIC_ADD_F64_RTN,    atomic_load_fadd_flat_64,         f64>;
+def  : FlatSignedAtomicPat  <FLAT_ATOMIC_MIN_F64_RTN,    atomic_load_fmin_flat_64,         f64>;
+def  : FlatSignedAtomicPat  <FLAT_ATOMIC_MAX_F64_RTN,    atomic_load_fmax_flat_64,         f64>;
+}
+
 } // End OtherPredicates = [HasFlatGlobalInsts], AddedComplexity = 10
 
 let OtherPredicates = [HasFlatScratchInsts, EnableFlatScratch] in {
@@ -1345,6 +1389,8 @@ class FLAT_Real_vi <bits<7> op, FLAT_Pseudo ps> :
   SIMCInstr <ps.PseudoInstr, SIEncodingFamily.VI> {
   let AssemblerPredicate = isGFX8GFX9;
   let DecoderNamespace = "GFX8";
+
+  let Inst{25} = !if(ps.has_sccb, sccb, ps.sccbValue);
 }
 
 multiclass FLAT_Real_AllAddr_vi<bits<7> op> {
@@ -1492,6 +1538,14 @@ defm SCRATCH_STORE_DWORDX2      : FLAT_Real_AllAddr_vi <0x1d>;
 defm SCRATCH_STORE_DWORDX3      : FLAT_Real_AllAddr_vi <0x1e>;
 defm SCRATCH_STORE_DWORDX4      : FLAT_Real_AllAddr_vi <0x1f>;
 
+let SubtargetPredicate = isGFX90APlus in {
+  defm FLAT_ATOMIC_ADD_F64   : FLAT_Real_Atomics_vi<0x4f, FLAT_ATOMIC_ADD_F64>;
+  defm FLAT_ATOMIC_MIN_F64   : FLAT_Real_Atomics_vi<0x50, FLAT_ATOMIC_MIN_F64>;
+  defm FLAT_ATOMIC_MAX_F64   : FLAT_Real_Atomics_vi<0x51, FLAT_ATOMIC_MAX_F64>;
+  defm GLOBAL_ATOMIC_ADD_F64 : FLAT_Global_Real_Atomics_vi<0x4f>;
+  defm GLOBAL_ATOMIC_MIN_F64 : FLAT_Global_Real_Atomics_vi<0x50>;
+  defm GLOBAL_ATOMIC_MAX_F64 : FLAT_Global_Real_Atomics_vi<0x51>;
+} // End SubtargetPredicate = isGFX90APlus
 
 //===----------------------------------------------------------------------===//
 // GFX10.
@@ -1701,7 +1755,7 @@ defm SCRATCH_LOAD_SHORT_D16_HI  : FLAT_Real_ScratchAllAddr_gfx10<0x025>;
 
 let SubtargetPredicate = HasAtomicFaddInsts in {
 
-defm GLOBAL_ATOMIC_ADD_F32    : FLAT_Real_AllAddr_vi <0x04d>;
-defm GLOBAL_ATOMIC_PK_ADD_F16 : FLAT_Real_AllAddr_vi <0x04e>;
+defm GLOBAL_ATOMIC_ADD_F32    : FLAT_Global_Real_Atomics_vi <0x04d>;
+defm GLOBAL_ATOMIC_PK_ADD_F16 : FLAT_Global_Real_Atomics_vi <0x04e>;
 
 } // End SubtargetPredicate = HasAtomicFaddInsts

diff  --git a/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp b/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp
index e4eacd101ce8..eb2610492480 100644
--- a/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp
@@ -137,7 +137,8 @@ MachineOperand *GCNDPPCombine::getOldOpndValue(MachineOperand &OldOpnd) const {
   case AMDGPU::IMPLICIT_DEF:
     return nullptr;
   case AMDGPU::COPY:
-  case AMDGPU::V_MOV_B32_e32: {
+  case AMDGPU::V_MOV_B32_e32:
+  case AMDGPU::V_MOV_B64_PSEUDO: {
     auto &Op1 = Def->getOperand(1);
     if (Op1.isImm())
       return &Op1;
@@ -151,7 +152,8 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
                                            MachineInstr &MovMI,
                                            RegSubRegPair CombOldVGPR,
                                            bool CombBCZ) const {
-  assert(MovMI.getOpcode() == AMDGPU::V_MOV_B32_dpp);
+  assert(MovMI.getOpcode() == AMDGPU::V_MOV_B32_dpp ||
+         MovMI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO);
 
   auto OrigOp = OrigMI.getOpcode();
   auto DPPOp = getDPPOp(OrigOp);
@@ -174,7 +176,11 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
     const int OldIdx = AMDGPU::getNamedOperandIdx(DPPOp, AMDGPU::OpName::old);
     if (OldIdx != -1) {
       assert(OldIdx == NumOperands);
-      assert(isOfRegClass(CombOldVGPR, AMDGPU::VGPR_32RegClass, *MRI));
+      assert(isOfRegClass(
+          CombOldVGPR,
+          *MRI->getRegClass(
+              TII->getNamedOperand(MovMI, AMDGPU::OpName::vdst)->getReg()),
+          *MRI));
       auto *Def = getVRegSubRegDef(CombOldVGPR, *MRI);
       DPPInst.addReg(CombOldVGPR.Reg, Def ? 0 : RegState::Undef,
                      CombOldVGPR.SubReg);
@@ -325,8 +331,10 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
       return nullptr;
     }
     CombOldVGPR = getRegSubRegPair(*Src1);
-    if (!isOfRegClass(CombOldVGPR, AMDGPU::VGPR_32RegClass, *MRI)) {
-      LLVM_DEBUG(dbgs() << "  failed: src1 isn't a VGPR32 register\n");
+    auto MovDst = TII->getNamedOperand(MovMI, AMDGPU::OpName::vdst);
+    const TargetRegisterClass *RC = MRI->getRegClass(MovDst->getReg());
+    if (!isOfRegClass(CombOldVGPR, *RC, *MRI)) {
+      LLVM_DEBUG(dbgs() << "  failed: src1 has wrong register class\n");
       return nullptr;
     }
   }
@@ -346,7 +354,8 @@ bool GCNDPPCombine::hasNoImmOrEqual(MachineInstr &MI, unsigned OpndName,
 }
 
 bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const {
-  assert(MovMI.getOpcode() == AMDGPU::V_MOV_B32_dpp);
+  assert(MovMI.getOpcode() == AMDGPU::V_MOV_B32_dpp ||
+         MovMI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO);
   LLVM_DEBUG(dbgs() << "\nDPP combine: " << MovMI);
 
   auto *DstOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::vdst);
@@ -362,6 +371,17 @@ bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const {
     return false;
   }
 
+  if (MovMI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO) {
+    auto *DppCtrl = TII->getNamedOperand(MovMI, AMDGPU::OpName::dpp_ctrl);
+    assert(DppCtrl && DppCtrl->isImm());
+    if (!AMDGPU::isLegal64BitDPPControl(DppCtrl->getImm())) {
+      LLVM_DEBUG(dbgs() << "  failed: 64 bit dpp move uses unsupported"
+                           " control value\n");
+      // Let it split, then control may become legal.
+      return false;
+    }
+  }
+
   auto *RowMaskOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::row_mask);
   assert(RowMaskOpnd && RowMaskOpnd->isImm());
   auto *BankMaskOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::bank_mask);
@@ -430,8 +450,9 @@ bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const {
   auto CombOldVGPR = getRegSubRegPair(*OldOpnd);
   // try to reuse previous old reg if its undefined (IMPLICIT_DEF)
   if (CombBCZ && OldOpndValue) { // CombOldVGPR should be undef
+    const TargetRegisterClass *RC = MRI->getRegClass(DPPMovReg);
     CombOldVGPR = RegSubRegPair(
-      MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass));
+      MRI->createVirtualRegister(RC));
     auto UndefInst = BuildMI(*MovMI.getParent(), MovMI, MovMI.getDebugLoc(),
                              TII->get(AMDGPU::IMPLICIT_DEF), CombOldVGPR.Reg);
     DPPMIs.push_back(UndefInst.getInstr());
@@ -581,12 +602,17 @@ bool GCNDPPCombine::runOnMachineFunction(MachineFunction &MF) {
         Changed = true;
         ++NumDPPMovsCombined;
       } else if (MI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO) {
-        auto Split = TII->expandMovDPP64(MI);
-        for (auto M : { Split.first, Split.second }) {
-          if (combineDPPMov(*M))
-            ++NumDPPMovsCombined;
+        if (ST.has64BitDPP() && combineDPPMov(MI)) {
+          Changed = true;
+          ++NumDPPMovsCombined;
+        } else {
+          auto Split = TII->expandMovDPP64(MI);
+          for (auto M : { Split.first, Split.second }) {
+            if (M && combineDPPMov(*M))
+              ++NumDPPMovsCombined;
+          }
+          Changed = true;
         }
-        Changed = true;
       }
     }
   }

diff  --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index ed1dc77bd545..0482865bd518 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -32,7 +32,7 @@ GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF) :
   TRI(TII.getRegisterInfo()),
   ClauseUses(TRI.getNumRegUnits()),
   ClauseDefs(TRI.getNumRegUnits()) {
-  MaxLookAhead = MF.getRegInfo().isPhysRegUsed(AMDGPU::AGPR0) ? 18 : 5;
+  MaxLookAhead = MF.getRegInfo().isPhysRegUsed(AMDGPU::AGPR0) ? 19 : 5;
   TSchedModel.init(&ST);
 }
 
@@ -87,6 +87,25 @@ static bool isSMovRel(unsigned Opcode) {
   }
 }
 
+static bool isDGEMM(unsigned Opcode) {
+  return Opcode == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
+         Opcode == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64 ||
+         Opcode == AMDGPU::V_MFMA_F64_16X16X4F64_e64 ||
+         Opcode == AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64;
+}
+
+static bool isXDL(const GCNSubtarget &ST, const MachineInstr &MI) {
+  unsigned Opcode = MI.getOpcode();
+
+  if (!SIInstrInfo::isMAI(MI) ||
+      isDGEMM(Opcode) ||
+      Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
+      Opcode == AMDGPU::V_ACCVGPR_READ_B32_e64)
+    return false;
+
+  return true;
+}
+
 static bool isSendMsgTraceDataOrGDS(const SIInstrInfo &TII,
                                     const MachineInstr &MI) {
   if (TII.isAlwaysGDS(MI.getOpcode()))
@@ -165,6 +184,11 @@ GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
   if (isRWLane(MI->getOpcode()) && checkRWLaneHazards(MI) > 0)
     return HazardType;
 
+  if ((SIInstrInfo::isVALU(*MI) || SIInstrInfo::isVMEM(*MI) ||
+       SIInstrInfo::isFLAT(*MI) || SIInstrInfo::isDS(*MI) ||
+       SIInstrInfo::isEXP(*MI)) && checkMAIVALUHazards(MI) > 0)
+    return HazardType;
+
   if (isSGetReg(MI->getOpcode()) && checkGetRegHazards(MI) > 0)
     return HazardType;
 
@@ -274,6 +298,11 @@ unsigned GCNHazardRecognizer::PreEmitNoopsCommon(MachineInstr *MI) {
   if (isRWLane(MI->getOpcode()))
     WaitStates = std::max(WaitStates, checkRWLaneHazards(MI));
 
+  if ((SIInstrInfo::isVALU(*MI) || SIInstrInfo::isVMEM(*MI) ||
+       SIInstrInfo::isFLAT(*MI) || SIInstrInfo::isDS(*MI) ||
+       SIInstrInfo::isEXP(*MI)) && checkMAIVALUHazards(MI) > 0)
+    WaitStates = std::max(WaitStates, checkMAIVALUHazards(MI));
+
   if (MI->isInlineAsm())
     return std::max(WaitStates, checkInlineAsmHazards(MI));
 
@@ -603,7 +632,7 @@ int GCNHazardRecognizer::checkVMEMHazards(MachineInstr* VMEM) {
   const int VmemSgprWaitStates = 5;
   auto IsHazardDefFn = [this] (MachineInstr *MI) { return TII.isVALU(*MI); };
   for (const MachineOperand &Use : VMEM->uses()) {
-    if (!Use.isReg() || TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
+    if (!Use.isReg() || TRI.isVectorRegister(MF.getRegInfo(), Use.getReg()))
       continue;
 
     int WaitStatesNeededForUse =
@@ -739,7 +768,7 @@ GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def,
   const int VALUWaitStates = 1;
   int WaitStatesNeeded = 0;
 
-  if (!TRI->isVGPR(MRI, Def.getReg()))
+  if (!TRI->isVectorRegister(MRI, Def.getReg()))
     return WaitStatesNeeded;
   Register Reg = Def.getReg();
   auto IsHazardFn = [this, Reg, TRI] (MachineInstr *MI) {
@@ -1187,6 +1216,10 @@ int GCNHazardRecognizer::checkFPAtomicToDenormModeHazard(MachineInstr *MI) {
 int GCNHazardRecognizer::checkMAIHazards(MachineInstr *MI) {
   assert(SIInstrInfo::isMAI(*MI));
 
+  return ST.hasGFX90AInsts() ? checkMAIHazards90A(MI) : checkMAIHazards908(MI);
+}
+
+int GCNHazardRecognizer::checkMAIHazards908(MachineInstr *MI) {
   int WaitStatesNeeded = 0;
   unsigned Opc = MI->getOpcode();
 
@@ -1353,8 +1386,166 @@ int GCNHazardRecognizer::checkMAIHazards(MachineInstr *MI) {
   return WaitStatesNeeded;
 }
 
+int GCNHazardRecognizer::checkMAIHazards90A(MachineInstr *MI) {
+  int WaitStatesNeeded = 0;
+  unsigned Opc = MI->getOpcode();
+
+  auto IsMFMAFn = [] (MachineInstr *MI) {
+    return SIInstrInfo::isMAI(*MI) &&
+           MI->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 &&
+           MI->getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64;
+  };
+
+  auto IsLegacyVALUFn = [&IsMFMAFn] (MachineInstr *MI) {
+    return SIInstrInfo::isVALU(*MI) && !IsMFMAFn(MI);
+  };
+
+  auto IsLegacyVALUNotDotFn = [&IsMFMAFn] (MachineInstr *MI) {
+    return SIInstrInfo::isVALU(*MI) &&
+           !IsMFMAFn(MI) && !SIInstrInfo::isDOT(*MI);
+  };
+
+  if (!IsMFMAFn(MI))
+    return WaitStatesNeeded;
+
+  const int VALUWritesExecWaitStates = 4;
+  int WaitStatesNeededForUse = VALUWritesExecWaitStates -
+    getWaitStatesSinceDef(AMDGPU::EXEC, IsLegacyVALUFn,
+                          VALUWritesExecWaitStates);
+  WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
+
+  int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
+
+  // Loop for both DGEMM and S/HGEMM 2nd instruction.
+  for (const MachineOperand &Use : MI->explicit_uses()) {
+    const int LegacyVALUNotDotWritesVGPRWaitStates = 2;
+    const int SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates = 2;
+    const int SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates = 8;
+    const int SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates = 16;
+    const int SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates = 3;
+    const int SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates = 9;
+    const int SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates = 17;
+    const int DMFMA16x16WritesVGPROverlappedSrcCWaitStates = 9;
+    const int DMFMA4x4WritesVGPROverlappedSrcCWaitStates = 4;
+    const int SMFMA4x4WritesVGPROverlappedSrcABWaitStates = 5;
+    const int SMFMA16x16WritesVGPROverlappedSrcABWaitStates = 11;
+    const int SMFMA32x32WritesVGPROverlappedSrcABWaitStates = 19;
+    const int DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates = 6;
+    const int DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = 11;
+    const int DMFMA4x4WritesVGPRFullSrcCWaitStates = 4;
+    const int MaxWaitStates = 19;
+
+    if (!Use.isReg())
+      continue;
+    unsigned Reg = Use.getReg();
+    bool FullReg;
+    MachineInstr *MI1;
+
+    auto IsOverlappedDGEMMorXDLFn = [Reg, &IsMFMAFn, &FullReg, &MI1, this]
+                                    (MachineInstr *MI) {
+      if (!IsMFMAFn(MI))
+        return false;
+      if (!isDGEMM(MI->getOpcode()) && !isXDL(ST, *MI))
+        return false;
+      Register DstReg = MI->getOperand(0).getReg();
+      FullReg = (DstReg == Reg);
+      MI1 = MI;
+      return TRI.regsOverlap(DstReg, Reg);
+    };
+
+    WaitStatesNeededForUse = LegacyVALUNotDotWritesVGPRWaitStates -
+      getWaitStatesSinceDef(Reg, IsLegacyVALUNotDotFn, MaxWaitStates);
+    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
+
+    int NumWaitStates = getWaitStatesSinceDef(Reg, IsOverlappedDGEMMorXDLFn,
+                                              MaxWaitStates);
+    if (NumWaitStates == std::numeric_limits<int>::max())
+      continue;
+
+    int OpNo = MI->getOperandNo(&Use);
+    unsigned Opc1 = MI1->getOpcode();
+    int NeedWaitStates = 0;
+    if (OpNo == SrcCIdx) {
+      if (!isDGEMM(Opc) && isDGEMM(Opc1)) {
+        NeedWaitStates = 0;
+      } else if (FullReg) {
+        if ((Opc == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
+             Opc == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64) &&
+            (Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
+             Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64))
+          NeedWaitStates = DMFMA4x4WritesVGPRFullSrcCWaitStates;
+      } else {
+        switch (Opc1) {
+        case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
+        case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
+          if (!isXDL(ST, *MI))
+            NeedWaitStates = DMFMA16x16WritesVGPROverlappedSrcCWaitStates;
+          break;
+        case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
+        case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
+          if (!isXDL(ST, *MI))
+            NeedWaitStates = DMFMA4x4WritesVGPROverlappedSrcCWaitStates;
+          break;
+        default:
+          switch (TSchedModel.computeInstrLatency(MI1)) {
+          case 2:
+            NeedWaitStates = isDGEMM(Opc)
+              ? SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates
+              : SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates;
+            break;
+          case 8:
+            NeedWaitStates = isDGEMM(Opc)
+              ? SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates
+              : SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates;
+            break;
+          case 16: LLVM_FALLTHROUGH;
+          default:
+            NeedWaitStates = isDGEMM(Opc)
+              ? SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates
+              : SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates;
+          }
+        }
+      }
+    } else {
+      switch (Opc1) {
+      case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
+      case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
+        NeedWaitStates = DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates;
+        break;
+      case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
+      case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
+        NeedWaitStates = DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates;
+        break;
+      default:
+        switch (TSchedModel.computeInstrLatency(MI1)) {
+        case 2:
+          NeedWaitStates = SMFMA4x4WritesVGPROverlappedSrcABWaitStates;
+          break;
+        case 8:
+          NeedWaitStates = SMFMA16x16WritesVGPROverlappedSrcABWaitStates;
+          break;
+        case 16: LLVM_FALLTHROUGH;
+        default:
+          NeedWaitStates = SMFMA32x32WritesVGPROverlappedSrcABWaitStates;
+        }
+      }
+    }
+    if (WaitStatesNeeded >= NeedWaitStates)
+      continue;
+
+    WaitStatesNeededForUse = NeedWaitStates - NumWaitStates;
+    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
+
+    if (WaitStatesNeeded == MaxWaitStates)
+      break;
+  }
+
+  return WaitStatesNeeded;
+}
+
 int GCNHazardRecognizer::checkMAILdStHazards(MachineInstr *MI) {
-  if (!ST.hasMAIInsts())
+  // On gfx90a+ releveant hazards are checked in checkMAIVALUHazards()
+  if (!ST.hasMAIInsts() || ST.hasGFX90AInsts())
     return 0;
 
   int WaitStatesNeeded = 0;
@@ -1399,6 +1590,234 @@ int GCNHazardRecognizer::checkMAILdStHazards(MachineInstr *MI) {
   return WaitStatesNeeded;
 }
 
+int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) {
+  if (!ST.hasGFX90AInsts())
+    return 0;
+
+  auto IsMFMAFn = [] (MachineInstr *MI) -> bool {
+    return SIInstrInfo::isMAI(*MI) &&
+           MI->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 &&
+           MI->getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64;
+  };
+
+  auto IsDGEMMFn = [] (MachineInstr *MI) -> bool {
+    return isDGEMM(MI->getOpcode());
+  };
+
+  // This is checked in checkMAIHazards90A()
+  if (IsMFMAFn(MI))
+    return 0;
+
+  int WaitStatesNeeded = 0;
+
+  bool IsMemOrExport = SIInstrInfo::isVMEM(*MI) ||
+                       SIInstrInfo::isFLAT(*MI) ||
+                       SIInstrInfo::isDS(*MI) ||
+                       SIInstrInfo::isEXP(*MI);
+  bool IsVALU = SIInstrInfo::isVALU(*MI);
+
+  MachineInstr *MFMA = nullptr;
+  unsigned Reg;
+  auto IsDGEMMorXDLWriteFn = [&Reg, &IsMFMAFn, &MFMA, this] (MachineInstr *MI) {
+    if (!IsMFMAFn(MI) || !TRI.regsOverlap(MI->getOperand(0).getReg(), Reg))
+      return false;
+    if (!isDGEMM(MI->getOpcode()) && !isXDL(ST, *MI))
+      return false;
+    MFMA = MI;
+    return true;
+  };
+
+  MachineInstr *DOT = nullptr;
+  auto IsDotWriteFn = [&Reg, &DOT, this] (MachineInstr *MI) {
+    if (!SIInstrInfo::isDOT(*MI) ||
+        !TRI.regsOverlap(MI->getOperand(0).getReg(), Reg))
+      return false;
+    DOT = MI;
+    return true;
+  };
+
+  int SrcCIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
+                                           AMDGPU::OpName::src2);
+
+  if (IsMemOrExport || IsVALU) {
+    const int SMFMA4x4WriteVgprVALUMemExpReadWaitStates = 5;
+    const int SMFMA16x16WriteVgprVALUMemExpReadWaitStates = 11;
+    const int SMFMA32x32WriteVgprVALUMemExpReadWaitStates = 19;
+    const int DMFMA4x4WriteVgprMemExpReadWaitStates = 9;
+    const int DMFMA16x16WriteVgprMemExpReadWaitStates = 18;
+    const int DMFMA4x4WriteVgprVALUReadWaitStates = 6;
+    const int DMFMA16x16WriteVgprVALUReadWaitStates = 11;
+    const int DotWriteSameDotReadSrcAB = 3;
+    const int DotWriteDifferentVALURead = 3;
+    const int MaxWaitStates = 19;
+
+    for (const MachineOperand &Use : MI->explicit_uses()) {
+      if (!Use.isReg())
+        continue;
+      Reg = Use.getReg();
+
+      DOT = nullptr;
+      int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn,
+                                                     MaxWaitStates);
+      if (DOT) {
+        int NeedWaitStates = 0;
+        if (DOT->getOpcode() == MI->getOpcode()) {
+          if (&Use - &MI->getOperand(0) != SrcCIdx)
+            NeedWaitStates = DotWriteSameDotReadSrcAB;
+        } else {
+          NeedWaitStates = DotWriteDifferentVALURead;
+        }
+
+        int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
+        WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
+      }
+
+      MFMA = nullptr;
+      WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDGEMMorXDLWriteFn,
+                                                 MaxWaitStates);
+      if (!MFMA)
+        continue;
+
+      unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA);
+      int NeedWaitStates = MaxWaitStates;
+      switch (HazardDefLatency) {
+      case 2:
+        NeedWaitStates = SMFMA4x4WriteVgprVALUMemExpReadWaitStates;
+        break;
+      case 4:
+        assert(isDGEMM(MFMA->getOpcode()));
+        NeedWaitStates =
+            IsMemOrExport ? DMFMA4x4WriteVgprMemExpReadWaitStates
+                          : DMFMA4x4WriteVgprVALUReadWaitStates;
+        break;
+      case 8:
+        NeedWaitStates = SMFMA16x16WriteVgprVALUMemExpReadWaitStates;
+        break;
+      case 16: LLVM_FALLTHROUGH;
+      default:
+        NeedWaitStates =
+          isDGEMM(MFMA->getOpcode())
+            ? IsMemOrExport ? DMFMA16x16WriteVgprMemExpReadWaitStates
+                            : DMFMA16x16WriteVgprVALUReadWaitStates
+            : SMFMA32x32WriteVgprVALUMemExpReadWaitStates;
+        break;
+      }
+
+      int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
+      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
+
+      if (WaitStatesNeeded == MaxWaitStates)
+        break;
+    }
+  }
+
+  unsigned Opc = MI->getOpcode();
+  const int DMFMAToFMA64WaitStates = 2;
+  if ((Opc == AMDGPU::V_FMA_F64_e64 ||
+       Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64 ||
+       Opc == AMDGPU::V_FMAC_F64_dpp) &&
+      WaitStatesNeeded < DMFMAToFMA64WaitStates) {
+    int WaitStatesNeededForUse = DMFMAToFMA64WaitStates -
+      getWaitStatesSince(IsDGEMMFn, DMFMAToFMA64WaitStates);
+    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
+  }
+
+  if (!IsVALU && !IsMemOrExport)
+    return WaitStatesNeeded;
+
+  for (const MachineOperand &Def : MI->defs()) {
+    const int SMFMA4x4WriteVgprVALUWawWaitStates = 5;
+    const int SMFMA16x16WriteVgprVALUWawWaitStates = 11;
+    const int SMFMA32x32WriteVgprVALUWawWaitStates = 19;
+    const int SMFMA4x4ReadVgprVALUWarWaitStates = 1;
+    const int SMFMA16x16ReadVgprVALUWarWaitStates = 7;
+    const int SMFMA32x32ReadVgprVALUWarWaitStates = 15;
+    const int DMFMA4x4WriteVgprVALUWriteWaitStates = 6;
+    const int DMFMA16x16WriteVgprVALUWriteWaitStates = 11;
+    const int DotWriteDifferentVALUWrite = 3;
+    const int MaxWaitStates = 19;
+    const int MaxWarWaitStates = 15;
+
+    Reg = Def.getReg();
+
+    DOT = nullptr;
+    int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn,
+                                                   MaxWaitStates);
+    if (DOT && DOT->getOpcode() != MI->getOpcode())
+      WaitStatesNeeded = std::max(WaitStatesNeeded, DotWriteDifferentVALUWrite -
+                                                    WaitStatesSinceDef);
+
+    MFMA = nullptr;
+    WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDGEMMorXDLWriteFn,
+                                               MaxWaitStates);
+    if (MFMA) {
+      int NeedWaitStates = MaxWaitStates;
+      switch (TSchedModel.computeInstrLatency(MFMA)) {
+      case 2:
+        NeedWaitStates = SMFMA4x4WriteVgprVALUWawWaitStates;
+        break;
+      case 4:
+        assert(isDGEMM(MFMA->getOpcode()));
+        NeedWaitStates = DMFMA4x4WriteVgprVALUWriteWaitStates;
+        break;
+      case 8:
+        NeedWaitStates = SMFMA16x16WriteVgprVALUWawWaitStates;
+        break;
+      case 16: LLVM_FALLTHROUGH;
+      default:
+        NeedWaitStates = isDGEMM(MFMA->getOpcode())
+                   ? DMFMA16x16WriteVgprVALUWriteWaitStates
+                   : SMFMA32x32WriteVgprVALUWawWaitStates;
+        break;
+      }
+
+      int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
+      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
+
+      if (WaitStatesNeeded == MaxWaitStates)
+        break;
+    }
+
+    auto IsSMFMAReadAsCFn = [&Reg, &IsMFMAFn, &MFMA, this]
+                               (MachineInstr *MI) {
+      if (!IsMFMAFn(MI) || isDGEMM(MI->getOpcode()) ||
+          !MI->readsRegister(Reg, &TRI))
+        return false;
+
+      MachineOperand *SrcC = TII.getNamedOperand(*MI, AMDGPU::OpName::src2);
+      assert(SrcC);
+      if (!SrcC->isReg() || !TRI.regsOverlap(SrcC->getReg(), Reg))
+        return false;
+
+      MFMA = MI;
+      return true;
+    };
+
+    MFMA = nullptr;
+    int WaitStatesSinceUse = getWaitStatesSince(IsSMFMAReadAsCFn,
+                                                MaxWarWaitStates);
+    if (!MFMA)
+      continue;
+
+    unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA);
+    int NeedWaitStates = MaxWaitStates;
+    switch (HazardDefLatency) {
+    case 2:  NeedWaitStates = SMFMA4x4ReadVgprVALUWarWaitStates;
+             break;
+    case 8:  NeedWaitStates = SMFMA16x16ReadVgprVALUWarWaitStates;
+             break;
+    case 16: LLVM_FALLTHROUGH;
+    default: NeedWaitStates = SMFMA32x32ReadVgprVALUWarWaitStates;
+             break;
+    }
+
+    int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceUse;
+    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
+  }
+
+  return WaitStatesNeeded;
+}
+
 bool GCNHazardRecognizer::ShouldPreferAnother(SUnit *SU) {
   if (!SU->isInstr())
     return false;

diff  --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
index 447ca828ae64..6d61af3ac876 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
@@ -94,6 +94,9 @@ class GCNHazardRecognizer final : public ScheduleHazardRecognizer {
   bool fixLdsBranchVmemWARHazard(MachineInstr *MI);
 
   int checkMAIHazards(MachineInstr *MI);
+  int checkMAIHazards908(MachineInstr *MI);
+  int checkMAIHazards90A(MachineInstr *MI);
+  int checkMAIVALUHazards(MachineInstr *MI);
   int checkMAILdStHazards(MachineInstr *MI);
 
 public:

diff  --git a/llvm/lib/Target/AMDGPU/GCNProcessors.td b/llvm/lib/Target/AMDGPU/GCNProcessors.td
index 7447ec2db188..888bbbb9d3c1 100644
--- a/llvm/lib/Target/AMDGPU/GCNProcessors.td
+++ b/llvm/lib/Target/AMDGPU/GCNProcessors.td
@@ -184,6 +184,10 @@ def : ProcessorModel<"gfx909", SIQuarterSpeedModel,
   FeatureISAVersion9_0_9.Features
 >;
 
+def : ProcessorModel<"gfx90a", SIDPFullSpeedModel,
+  FeatureISAVersion9_0_A.Features
+>;
+
 def : ProcessorModel<"gfx90c", SIQuarterSpeedModel,
   FeatureISAVersion9_0_C.Features
 >;

diff  --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
index 55fd2b5eb284..3456f9a6156c 100644
--- a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
@@ -125,12 +125,14 @@ bool GCNRegPressure::less(const GCNSubtarget &ST,
                           unsigned MaxOccupancy) const {
   const auto SGPROcc = std::min(MaxOccupancy,
                                 ST.getOccupancyWithNumSGPRs(getSGPRNum()));
-  const auto VGPROcc = std::min(MaxOccupancy,
-                                ST.getOccupancyWithNumVGPRs(getVGPRNum()));
+  const auto VGPROcc =
+    std::min(MaxOccupancy,
+             ST.getOccupancyWithNumVGPRs(getVGPRNum(ST.hasGFX90AInsts())));
   const auto OtherSGPROcc = std::min(MaxOccupancy,
                                 ST.getOccupancyWithNumSGPRs(O.getSGPRNum()));
-  const auto OtherVGPROcc = std::min(MaxOccupancy,
-                                ST.getOccupancyWithNumVGPRs(O.getVGPRNum()));
+  const auto OtherVGPROcc =
+    std::min(MaxOccupancy,
+             ST.getOccupancyWithNumVGPRs(O.getVGPRNum(ST.hasGFX90AInsts())));
 
   const auto Occ = std::min(SGPROcc, VGPROcc);
   const auto OtherOcc = std::min(OtherSGPROcc, OtherVGPROcc);
@@ -161,7 +163,8 @@ bool GCNRegPressure::less(const GCNSubtarget &ST,
     }
   }
   return SGPRImportant ? (getSGPRNum() < O.getSGPRNum()):
-                         (getVGPRNum() < O.getVGPRNum());
+                         (getVGPRNum(ST.hasGFX90AInsts()) <
+                          O.getVGPRNum(ST.hasGFX90AInsts()));
 }
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -169,7 +172,9 @@ LLVM_DUMP_METHOD
 void GCNRegPressure::print(raw_ostream &OS, const GCNSubtarget *ST) const {
   OS << "VGPRs: " << Value[VGPR32] << ' ';
   OS << "AGPRs: " << Value[AGPR32];
-  if (ST) OS << "(O" << ST->getOccupancyWithNumVGPRs(getVGPRNum()) << ')';
+  if (ST) OS << "(O"
+             << ST->getOccupancyWithNumVGPRs(getVGPRNum(ST->hasGFX90AInsts()))
+             << ')';
   OS << ", SGPRs: " << getSGPRNum();
   if (ST) OS << "(O" << ST->getOccupancyWithNumSGPRs(getSGPRNum()) << ')';
   OS << ", LVGPR WT: " << getVGPRTuplesWeight()

diff  --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.h b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
index 8541c7e983f7..257561cb8430 100644
--- a/llvm/lib/Target/AMDGPU/GCNRegPressure.h
+++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
@@ -42,12 +42,19 @@ struct GCNRegPressure {
     clear();
   }
 
-  bool empty() const { return getSGPRNum() == 0 && getVGPRNum() == 0; }
+  bool empty() const { return getSGPRNum() == 0 && getVGPRNum(false) == 0; }
 
   void clear() { std::fill(&Value[0], &Value[TOTAL_KINDS], 0); }
 
   unsigned getSGPRNum() const { return Value[SGPR32]; }
-  unsigned getVGPRNum() const { return std::max(Value[VGPR32], Value[AGPR32]); }
+  unsigned getVGPRNum(bool UnifiedVGPRFile) const {
+    if (UnifiedVGPRFile) {
+      return Value[AGPR32] ? alignTo(Value[VGPR32], 4) + Value[AGPR32]
+                           : Value[VGPR32] + Value[AGPR32];
+    }
+    return std::max(Value[VGPR32], Value[AGPR32]);
+  }
+  unsigned getAGPRNum() const { return Value[AGPR32]; }
 
   unsigned getVGPRTuplesWeight() const { return std::max(Value[VGPR_TUPLE],
                                                          Value[AGPR_TUPLE]); }
@@ -55,7 +62,7 @@ struct GCNRegPressure {
 
   unsigned getOccupancy(const GCNSubtarget &ST) const {
     return std::min(ST.getOccupancyWithNumSGPRs(getSGPRNum()),
-                    ST.getOccupancyWithNumVGPRs(getVGPRNum()));
+             ST.getOccupancyWithNumVGPRs(getVGPRNum(ST.hasGFX90AInsts())));
   }
 
   void inc(unsigned Reg,

diff  --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index 6e2550298dc6..18a534d28b21 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -335,7 +335,7 @@ void GCNScheduleDAGMILive::schedule() {
              PressureAfter.print(dbgs()));
 
   if (PressureAfter.getSGPRNum() <= S.SGPRCriticalLimit &&
-      PressureAfter.getVGPRNum() <= S.VGPRCriticalLimit) {
+      PressureAfter.getVGPRNum(ST.hasGFX90AInsts()) <= S.VGPRCriticalLimit) {
     Pressure[RegionIdx] = PressureAfter;
     LLVM_DEBUG(dbgs() << "Pressure in desired limits, done.\n");
     return;
@@ -366,7 +366,8 @@ void GCNScheduleDAGMILive::schedule() {
 
   unsigned MaxVGPRs = ST.getMaxNumVGPRs(MF);
   unsigned MaxSGPRs = ST.getMaxNumSGPRs(MF);
-  if (PressureAfter.getVGPRNum() > MaxVGPRs ||
+  if (PressureAfter.getVGPRNum(false) > MaxVGPRs ||
+      PressureAfter.getAGPRNum() > MaxVGPRs ||
       PressureAfter.getSGPRNum() > MaxSGPRs)
     RescheduleRegions[RegionIdx] = true;
 

diff  --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 7a7178126444..c2f172aa5a5a 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -82,6 +82,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
   bool FastFMAF32;
   bool FastDenormalF32;
   bool HalfRate64Ops;
+  bool FullRate64Ops;
 
   // Dynamically set bits that enable features.
   bool FlatForGlobal;
@@ -95,6 +96,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
   // for XNACK.
   bool EnableXNACK;
 
+  bool EnableTgSplit;
   bool EnableCuMode;
   bool TrapHandler;
 
@@ -110,10 +112,11 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
   bool FP64;
   bool FMA;
   bool MIMG_R128;
-  bool GCN3Encoding;
+  bool IsGCN;
   bool CIInsts;
   bool GFX8Insts;
   bool GFX9Insts;
+  bool GFX90AInsts;
   bool GFX10Insts;
   bool GFX10_3Insts;
   bool GFX7GFX8GFX9Insts;
@@ -132,6 +135,9 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
   bool HasSDWAOutModsVOPC;
   bool HasDPP;
   bool HasDPP8;
+  bool Has64BitDPP;
+  bool HasPackedFP32Ops;
+  bool HasExtendedImageInsts;
   bool HasR128A16;
   bool HasGFX10A16;
   bool HasG16;
@@ -167,10 +173,16 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
   bool ScalarFlatScratchInsts;
   bool AddNoCarryInsts;
   bool HasUnpackedD16VMem;
+  bool R600ALUInst;
+  bool CaymanISA;
+  bool CFALUBug;
   bool LDSMisalignedBug;
   bool HasMFMAInlineLiteralBug;
+  bool HasVertexCache;
+  short TexVTXClauseSize;
   bool UnalignedBufferAccess;
   bool UnalignedDSAccess;
+  bool HasPackedTID;
   bool ScalarizeGlobal;
 
   bool HasVcmpxPermlaneHazard;
@@ -295,6 +307,10 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
     return HalfRate64Ops;
   }
 
+  bool hasFullRate64Ops() const {
+    return FullRate64Ops;
+  }
+
   bool hasAddr64() const {
     return (getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS);
   }
@@ -510,6 +526,10 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
     return TargetID.isXnackOnOrAny();
   }
 
+  bool isTgSplitEnabled() const {
+    return EnableTgSplit;
+  }
+
   bool isCuModeEnabled() const {
     return EnableCuMode;
   }
@@ -796,6 +816,18 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
     return HasDPP8;
   }
 
+  bool has64BitDPP() const {
+    return Has64BitDPP;
+  }
+
+  bool hasPackedFP32Ops() const {
+    return HasPackedFP32Ops;
+  }
+
+  bool hasExtendedImageInsts() const {
+    return HasExtendedImageInsts;
+  }
+
   bool hasR128A16() const {
     return HasR128A16;
   }
@@ -896,6 +928,10 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
 
   bool hasHardClauses() const { return getGeneration() >= GFX10; }
 
+  bool hasGFX90AInsts() const { return GFX90AInsts; }
+
+  bool hasPackedTID() const { return HasPackedTID; }
+
   /// Return the maximum number of waves per SIMD for kernels using \p SGPRs
   /// SGPRs
   unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const;

diff  --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
index 00bc9b2b65db..4b713ef93d48 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
@@ -10,6 +10,7 @@
 #include "AMDGPUInstPrinter.h"
 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
 #include "SIDefines.h"
+#include "SIRegisterInfo.h"
 #include "Utils/AMDGPUAsmUtils.h"
 #include "Utils/AMDGPUBaseInfo.h"
 #include "llvm/MC/MCExpr.h"
@@ -207,6 +208,12 @@ void AMDGPUInstPrinter::printDLC(const MCInst *MI, unsigned OpNo,
     printNamedBit(MI, OpNo, O, "dlc");
 }
 
+void AMDGPUInstPrinter::printSCCB(const MCInst *MI, unsigned OpNo,
+                                  const MCSubtargetInfo &STI, raw_ostream &O) {
+  if (AMDGPU::isGFX90A(STI))
+    printNamedBit(MI, OpNo, O, "scc");
+}
+
 void AMDGPUInstPrinter::printGLC(const MCInst *MI, unsigned OpNo,
                                  const MCSubtargetInfo &STI, raw_ostream &O) {
   printNamedBit(MI, OpNo, O, "glc");
@@ -601,6 +608,10 @@ void AMDGPUInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
     case AMDGPU::OPERAND_REG_INLINE_C_FP32:
     case AMDGPU::OPERAND_REG_INLINE_AC_INT32:
     case AMDGPU::OPERAND_REG_INLINE_AC_FP32:
+    case AMDGPU::OPERAND_REG_IMM_V2INT32:
+    case AMDGPU::OPERAND_REG_IMM_V2FP32:
+    case AMDGPU::OPERAND_REG_INLINE_C_V2INT32:
+    case AMDGPU::OPERAND_REG_INLINE_C_V2FP32:
     case MCOI::OPERAND_IMMEDIATE:
       printImmediate32(Op.getImm(), STI, O);
       break;
@@ -608,6 +619,7 @@ void AMDGPUInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
     case AMDGPU::OPERAND_REG_IMM_FP64:
     case AMDGPU::OPERAND_REG_INLINE_C_INT64:
     case AMDGPU::OPERAND_REG_INLINE_C_FP64:
+    case AMDGPU::OPERAND_REG_INLINE_AC_FP64:
       printImmediate64(Op.getImm(), STI, O);
       break;
     case AMDGPU::OPERAND_REG_INLINE_C_INT16:
@@ -794,7 +806,20 @@ void AMDGPUInstPrinter::printDPPCtrl(const MCInst *MI, unsigned OpNo,
   using namespace AMDGPU::DPP;
 
   unsigned Imm = MI->getOperand(OpNo).getImm();
-  if (Imm <= DppCtrl::QUAD_PERM_LAST) {
+  const MCInstrDesc &Desc = MII.get(MI->getOpcode());
+  int DstIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
+                                          AMDGPU::OpName::vdst);
+  int Src0Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
+                                           AMDGPU::OpName::src0);
+
+  if (((DstIdx >= 0 &&
+        Desc.OpInfo[DstIdx].RegClass == AMDGPU::VReg_64RegClassID) ||
+      ((Src0Idx >= 0 &&
+        Desc.OpInfo[Src0Idx].RegClass == AMDGPU::VReg_64RegClassID))) &&
+      !AMDGPU::isLegal64BitDPPControl(Imm)) {
+    O << " /* 64 bit dpp only supports row_newbcast */";
+    return;
+  } else if (Imm <= DppCtrl::QUAD_PERM_LAST) {
     O << "quad_perm:[";
     O << formatDec(Imm & 0x3)         << ',';
     O << formatDec((Imm & 0xc)  >> 2) << ',';
@@ -854,11 +879,15 @@ void AMDGPUInstPrinter::printDPPCtrl(const MCInst *MI, unsigned OpNo,
     O << "row_bcast:31";
   } else if ((Imm >= DppCtrl::ROW_SHARE_FIRST) &&
              (Imm <= DppCtrl::ROW_SHARE_LAST)) {
-    if (!AMDGPU::isGFX10Plus(STI)) {
-      O << "/* row_share is not supported on ASICs earlier than GFX10 */";
+    if (AMDGPU::isGFX90A(STI)) {
+      O << " row_newbcast:";
+    } else if (AMDGPU::isGFX10Plus(STI)) {
+      O << "row_share:";
+    } else {
+      O << " /* row_newbcast/row_share is not supported on ASICs earlier "
+           "than GFX90A/GFX10 */";
       return;
     }
-    O << "row_share:";
     printU4ImmDecOperand(MI, OpNo, O);
   } else if ((Imm >= DppCtrl::ROW_XMASK_FIRST) &&
              (Imm <= DppCtrl::ROW_XMASK_LAST)) {

diff  --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h
index 8d13aa682211..121195fe8e20 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h
@@ -70,6 +70,8 @@ class AMDGPUInstPrinter : public MCInstPrinter {
                 raw_ostream &O);
   void printDLC(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
                 raw_ostream &O);
+  void printSCCB(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+                 raw_ostream &O);
   void printGLC(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
                 raw_ostream &O);
   void printSLC(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,

diff  --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
index f0eb11b70c97..842356008a4c 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
@@ -86,6 +86,7 @@ StringRef AMDGPUTargetStreamer::getArchNameFromElfMach(unsigned ElfMach) {
   case ELF::EF_AMDGPU_MACH_AMDGCN_GFX906:  AK = GK_GFX906;  break;
   case ELF::EF_AMDGPU_MACH_AMDGCN_GFX908:  AK = GK_GFX908;  break;
   case ELF::EF_AMDGPU_MACH_AMDGCN_GFX909:  AK = GK_GFX909;  break;
+  case ELF::EF_AMDGPU_MACH_AMDGCN_GFX90A:  AK = GK_GFX90A;  break;
   case ELF::EF_AMDGPU_MACH_AMDGCN_GFX90C:  AK = GK_GFX90C;  break;
   case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1010: AK = GK_GFX1010; break;
   case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1011: AK = GK_GFX1011; break;
@@ -145,6 +146,7 @@ unsigned AMDGPUTargetStreamer::getElfMach(StringRef GPU) {
   case GK_GFX906:  return ELF::EF_AMDGPU_MACH_AMDGCN_GFX906;
   case GK_GFX908:  return ELF::EF_AMDGPU_MACH_AMDGCN_GFX908;
   case GK_GFX909:  return ELF::EF_AMDGPU_MACH_AMDGCN_GFX909;
+  case GK_GFX90A:  return ELF::EF_AMDGPU_MACH_AMDGCN_GFX90A;
   case GK_GFX90C:  return ELF::EF_AMDGPU_MACH_AMDGCN_GFX90C;
   case GK_GFX1010: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1010;
   case GK_GFX1011: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1011;
@@ -258,10 +260,19 @@ bool AMDGPUTargetAsmStreamer::EmitHSAMetadata(
   return true;
 }
 
-bool AMDGPUTargetAsmStreamer::EmitCodeEnd() {
+bool AMDGPUTargetAsmStreamer::EmitCodeEnd(const MCSubtargetInfo &STI) {
   const uint32_t Encoded_s_code_end = 0xbf9f0000;
-  OS << "\t.p2alignl 6, " << Encoded_s_code_end << '\n';
-  OS << "\t.fill 48, 4, " << Encoded_s_code_end << '\n';
+  const uint32_t Encoded_s_nop = 0xbf800000;
+  uint32_t Encoded_pad = Encoded_s_code_end;
+  unsigned FillSize = 48;
+
+  if (AMDGPU::isGFX90A(STI)) {
+    Encoded_pad = Encoded_s_nop;
+    FillSize = 256;
+  }
+
+  OS << "\t.p2alignl 6, " << Encoded_pad << '\n';
+  OS << "\t.fill " << FillSize << ", 4, " << Encoded_pad << '\n';
   return true;
 }
 
@@ -331,6 +342,12 @@ void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor(
   OS << "\t\t.amdhsa_next_free_vgpr " << NextVGPR << '\n';
   OS << "\t\t.amdhsa_next_free_sgpr " << NextSGPR << '\n';
 
+  if (AMDGPU::isGFX90A(STI))
+    OS << "\t\t.amdhsa_accum_offset " <<
+      (AMDHSA_BITS_GET(KD.compute_pgm_rsrc3,
+                       amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET) + 1) * 4
+      << '\n';
+
   if (!ReserveVCC)
     OS << "\t\t.amdhsa_reserve_vcc " << ReserveVCC << '\n';
   if (IVersion.Major >= 7 && !ReserveFlatScr)
@@ -360,6 +377,10 @@ void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor(
     PRINT_FIELD(OS, ".amdhsa_fp16_overflow", KD,
                 compute_pgm_rsrc1,
                 amdhsa::COMPUTE_PGM_RSRC1_FP16_OVFL);
+  if (AMDGPU::isGFX90A(STI))
+    PRINT_FIELD(OS, ".amdhsa_tg_split", KD,
+                compute_pgm_rsrc3,
+                amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT);
   if (IVersion.Major >= 10) {
     PRINT_FIELD(OS, ".amdhsa_workgroup_processor_mode", KD,
                 compute_pgm_rsrc1,
@@ -616,14 +637,22 @@ bool AMDGPUTargetELFStreamer::EmitHSAMetadata(
   return true;
 }
 
-bool AMDGPUTargetELFStreamer::EmitCodeEnd() {
+bool AMDGPUTargetELFStreamer::EmitCodeEnd(const MCSubtargetInfo &STI) {
   const uint32_t Encoded_s_code_end = 0xbf9f0000;
+  const uint32_t Encoded_s_nop = 0xbf800000;
+  uint32_t Encoded_pad = Encoded_s_code_end;
+  unsigned FillSize = 48;
+
+  if (AMDGPU::isGFX90A(STI)) {
+    Encoded_pad = Encoded_s_nop;
+    FillSize = 256;
+  }
 
   MCStreamer &OS = getStreamer();
   OS.PushSection();
-  OS.emitValueToAlignment(64, Encoded_s_code_end, 4);
-  for (unsigned I = 0; I < 48; ++I)
-    OS.emitInt32(Encoded_s_code_end);
+  OS.emitValueToAlignment(64, Encoded_pad, 4);
+  for (unsigned I = 0; I < FillSize; ++I)
+    OS.emitInt32(Encoded_pad);
   OS.PopSection();
   return true;
 }

diff  --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
index 4dbb37dca9e8..a29fc94ca523 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
@@ -85,7 +85,7 @@ class AMDGPUTargetStreamer : public MCTargetStreamer {
   virtual bool EmitHSAMetadata(const AMDGPU::HSAMD::Metadata &HSAMetadata) = 0;
 
   /// \returns True on success, false on failure.
-  virtual bool EmitCodeEnd() = 0;
+  virtual bool EmitCodeEnd(const MCSubtargetInfo &STI) = 0;
 
   virtual void EmitAmdhsaKernelDescriptor(
       const MCSubtargetInfo &STI, StringRef KernelName,
@@ -129,7 +129,7 @@ class AMDGPUTargetAsmStreamer final : public AMDGPUTargetStreamer {
   bool EmitHSAMetadata(const AMDGPU::HSAMD::Metadata &HSAMetadata) override;
 
   /// \returns True on success, false on failure.
-  bool EmitCodeEnd() override;
+  bool EmitCodeEnd(const MCSubtargetInfo &STI) override;
 
   void EmitAmdhsaKernelDescriptor(
       const MCSubtargetInfo &STI, StringRef KernelName,
@@ -177,7 +177,7 @@ class AMDGPUTargetELFStreamer final : public AMDGPUTargetStreamer {
   bool EmitHSAMetadata(const AMDGPU::HSAMD::Metadata &HSAMetadata) override;
 
   /// \returns True on success, false on failure.
-  bool EmitCodeEnd() override;
+  bool EmitCodeEnd(const MCSubtargetInfo &STI) override;
 
   void EmitAmdhsaKernelDescriptor(
       const MCSubtargetInfo &STI, StringRef KernelName,

diff  --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp
index e0ca60bc3023..3fb11487a98d 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp
@@ -234,12 +234,17 @@ uint32_t SIMCCodeEmitter::getLitEncoding(const MCOperand &MO,
   case AMDGPU::OPERAND_REG_INLINE_C_FP32:
   case AMDGPU::OPERAND_REG_INLINE_AC_INT32:
   case AMDGPU::OPERAND_REG_INLINE_AC_FP32:
+  case AMDGPU::OPERAND_REG_IMM_V2INT32:
+  case AMDGPU::OPERAND_REG_IMM_V2FP32:
+  case AMDGPU::OPERAND_REG_INLINE_C_V2INT32:
+  case AMDGPU::OPERAND_REG_INLINE_C_V2FP32:
     return getLit32Encoding(static_cast<uint32_t>(Imm), STI);
 
   case AMDGPU::OPERAND_REG_IMM_INT64:
   case AMDGPU::OPERAND_REG_IMM_FP64:
   case AMDGPU::OPERAND_REG_INLINE_C_INT64:
   case AMDGPU::OPERAND_REG_INLINE_C_FP64:
+  case AMDGPU::OPERAND_REG_INLINE_AC_FP64:
     return getLit64Encoding(static_cast<uint64_t>(Imm), STI);
 
   case AMDGPU::OPERAND_REG_IMM_INT16:

diff  --git a/llvm/lib/Target/AMDGPU/MIMGInstructions.td b/llvm/lib/Target/AMDGPU/MIMGInstructions.td
index 6769b10d9123..404d5d8fa8c6 100644
--- a/llvm/lib/Target/AMDGPU/MIMGInstructions.td
+++ b/llvm/lib/Target/AMDGPU/MIMGInstructions.td
@@ -11,12 +11,14 @@
 //
 // - MIMGEncGfx6: encoding introduced with gfx6 (obsoleted for atomics in gfx8)
 // - MIMGEncGfx8: encoding introduced with gfx8 for atomics
+// - MIMGEncGfx90a: encoding for gfx90a for atomics
 // - MIMGEncGfx10Default: gfx10 default (non-NSA) encoding
 // - MIMGEncGfx10NSA: gfx10 NSA encoding
 class MIMGEncoding;
 
 def MIMGEncGfx6 : MIMGEncoding;
 def MIMGEncGfx8 : MIMGEncoding;
+def MIMGEncGfx90a : MIMGEncoding;
 def MIMGEncGfx10Default : MIMGEncoding;
 def MIMGEncGfx10NSA : MIMGEncoding;
 
@@ -207,14 +209,24 @@ class MIMGNSAHelper<int num_addrs> {
 // Base class of all pre-gfx10 MIMG instructions.
 class MIMG_gfx6789<bits<8> op, dag outs, string dns = "">
   : MIMG<outs, dns>, MIMGe_gfx6789<op> {
-  let SubtargetPredicate = isGFX6GFX7GFX8GFX9;
-  let AssemblerPredicate = isGFX6GFX7GFX8GFX9;
+  let SubtargetPredicate = isGFX6GFX7GFX8GFX9NotGFX90A;
+  let AssemblerPredicate = isGFX6GFX7GFX8GFX9NotGFX90A;
 
   let MIMGEncoding = MIMGEncGfx6;
 
   let d16 = !if(BaseOpcode.HasD16, ?, 0);
 }
 
+class MIMG_gfx90a<bits<8> op, dag outs, string dns = "">
+  : MIMG<outs, dns>, MIMGe_gfx90a<op> {
+  let SubtargetPredicate = isGFX90APlus;
+  let AssemblerPredicate = isGFX90APlus;
+
+  let MIMGEncoding = MIMGEncGfx90a;
+
+  let d16 = !if(BaseOpcode.HasD16, ?, 0);
+}
+
 // Base class of all non-NSA gfx10 MIMG instructions.
 class MIMG_gfx10<int op, dag outs, string dns = "">
   : MIMG<outs, dns>, MIMGe_gfx10<op> {
@@ -250,10 +262,23 @@ class MIMG_NoSampler_Helper <mimgopc op, string asm,
                              string dns="">
   : MIMG_gfx6789 <op.BASE, (outs dst_rc:$vdata), dns> {
   let InOperandList = !con((ins addr_rc:$vaddr, SReg_256:$srsrc,
-                                DMask:$dmask, UNorm:$unorm, GLC:$glc, SLC:$slc,
+                                DMask:$dmask, UNorm:$unorm, SCCB_0:$sccb, GLC:$glc, SLC:$slc,
                                 R128A16:$r128, TFE:$tfe, LWE:$lwe, DA:$da),
                            !if(BaseOpcode.HasD16, (ins D16:$d16), (ins)));
-  let AsmString = asm#" $vdata, $vaddr, $srsrc$dmask$unorm$glc$slc$r128$tfe$lwe$da"
+  let AsmString = asm#" $vdata, $vaddr, $srsrc$dmask$unorm$sccb$glc$slc$r128$tfe$lwe$da"
+                      #!if(BaseOpcode.HasD16, "$d16", "");
+}
+
+class MIMG_NoSampler_Helper_gfx90a <mimgopc op, string asm,
+                                    RegisterClass dst_rc,
+                                    RegisterClass addr_rc,
+                                    string dns="">
+  : MIMG_gfx90a <op.BASE, (outs getLdStRegisterOperand<dst_rc>.ret:$vdata), dns> {
+  let InOperandList = !con((ins addr_rc:$vaddr, SReg_256:$srsrc,
+                                DMask:$dmask, UNorm:$unorm, SCCB_0:$sccb, GLC:$glc, SLC:$slc,
+                                R128A16:$r128, LWE:$lwe, DA:$da),
+                           !if(BaseOpcode.HasD16, (ins D16:$d16), (ins)));
+  let AsmString = asm#" $vdata, $vaddr, $srsrc$dmask$unorm$sccb$glc$slc$r128$lwe$da"
                       #!if(BaseOpcode.HasD16, "$d16", "");
 }
 
@@ -284,12 +309,16 @@ class MIMG_NoSampler_nsa_gfx10<mimgopc op, string opcode,
 
 multiclass MIMG_NoSampler_Src_Helper <mimgopc op, string asm,
                                              RegisterClass dst_rc,
-                                             bit enableDisasm> {
+                                             bit enableDisasm,
+                                             bit ExtendedImageInst = 1> {
   let ssamp = 0 in {
     let VAddrDwords = 1 in {
       if op.HAS_BASE then {
         def _V1 : MIMG_NoSampler_Helper <op, asm, dst_rc, VGPR_32,
                                          !if(enableDisasm, "AMDGPU", "")>;
+        foreach _ = BoolToList<!eq(ExtendedImageInst, 0)>.ret in
+        def _V1_gfx90a : MIMG_NoSampler_Helper_gfx90a <op, asm, dst_rc, VGPR_32,
+                                       !if(enableDisasm, "GFX90A", "")>;
         def _V1_gfx10 : MIMG_NoSampler_gfx10<op, asm, dst_rc, VGPR_32,
                                              !if(enableDisasm, "AMDGPU", "")>;
       }
@@ -298,6 +327,8 @@ multiclass MIMG_NoSampler_Src_Helper <mimgopc op, string asm,
     let VAddrDwords = 2 in {
       if op.HAS_BASE then {
         def _V2 : MIMG_NoSampler_Helper <op, asm, dst_rc, VReg_64>;
+        foreach _ = BoolToList<!eq(ExtendedImageInst, 0)>.ret in
+        def _V2_gfx90a : MIMG_NoSampler_Helper_gfx90a <op, asm, dst_rc, VReg_64>;
         def _V2_gfx10 : MIMG_NoSampler_gfx10<op, asm, dst_rc, VReg_64>;
         def _V2_nsa_gfx10 : MIMG_NoSampler_nsa_gfx10<op, asm, dst_rc, 2>;
       }
@@ -306,6 +337,8 @@ multiclass MIMG_NoSampler_Src_Helper <mimgopc op, string asm,
     let VAddrDwords = 3 in {
       if op.HAS_BASE then {
         def _V3 : MIMG_NoSampler_Helper <op, asm, dst_rc, VReg_96>;
+        foreach _ = BoolToList<!eq(ExtendedImageInst, 0)>.ret in
+        def _V3_gfx90a : MIMG_NoSampler_Helper_gfx90a <op, asm, dst_rc, VReg_96>;
         def _V3_gfx10 : MIMG_NoSampler_gfx10<op, asm, dst_rc, VReg_96>;
         def _V3_nsa_gfx10 : MIMG_NoSampler_nsa_gfx10<op, asm, dst_rc, 3>;
       }
@@ -314,6 +347,8 @@ multiclass MIMG_NoSampler_Src_Helper <mimgopc op, string asm,
     let VAddrDwords = 4 in {
       if op.HAS_BASE then {
         def _V4 : MIMG_NoSampler_Helper <op, asm, dst_rc, VReg_128>;
+        foreach _ = BoolToList<!eq(ExtendedImageInst, 0)>.ret in
+        def _V4_gfx90a : MIMG_NoSampler_Helper_gfx90a <op, asm, dst_rc, VReg_128>;
         def _V4_gfx10 : MIMG_NoSampler_gfx10<op, asm, dst_rc, VReg_128>;
         def _V4_nsa_gfx10 : MIMG_NoSampler_nsa_gfx10<op, asm, dst_rc, 4,
                                                      !if(enableDisasm, "AMDGPU", "")>;
@@ -323,7 +358,8 @@ multiclass MIMG_NoSampler_Src_Helper <mimgopc op, string asm,
 }
 
 multiclass MIMG_NoSampler <mimgopc op, string asm, bit has_d16, bit mip = 0,
-                           bit isResInfo = 0> {
+                           bit isResInfo = 0,
+                           bit msaa = 0> {
   def "" : MIMGBaseOpcode {
     let Coordinates = !not(isResInfo);
     let LodOrClampOrMip = mip;
@@ -333,15 +369,15 @@ multiclass MIMG_NoSampler <mimgopc op, string asm, bit has_d16, bit mip = 0,
   let BaseOpcode = !cast<MIMGBaseOpcode>(NAME),
       mayLoad = !not(isResInfo) in {
     let VDataDwords = 1 in
-    defm _V1 : MIMG_NoSampler_Src_Helper <op, asm, VGPR_32, 1>;
+    defm _V1 : MIMG_NoSampler_Src_Helper <op, asm, VGPR_32, 1, msaa>;
     let VDataDwords = 2 in
-    defm _V2 : MIMG_NoSampler_Src_Helper <op, asm, VReg_64, 0>;
+    defm _V2 : MIMG_NoSampler_Src_Helper <op, asm, VReg_64, 0, msaa>;
     let VDataDwords = 3 in
-    defm _V3 : MIMG_NoSampler_Src_Helper <op, asm, VReg_96, 0>;
+    defm _V3 : MIMG_NoSampler_Src_Helper <op, asm, VReg_96, 0, msaa>;
     let VDataDwords = 4 in
-    defm _V4 : MIMG_NoSampler_Src_Helper <op, asm, VReg_128, 0>;
+    defm _V4 : MIMG_NoSampler_Src_Helper <op, asm, VReg_128, 0, msaa>;
     let VDataDwords = 5 in
-    defm _V5 : MIMG_NoSampler_Src_Helper <op, asm, VReg_160, 0>;
+    defm _V5 : MIMG_NoSampler_Src_Helper <op, asm, VReg_160, 0, msaa>;
   }
 }
 
@@ -351,10 +387,24 @@ class MIMG_Store_Helper <mimgopc op, string asm,
                          string dns = "">
   : MIMG_gfx6789<op.BASE, (outs), dns> {
   let InOperandList = !con((ins data_rc:$vdata, addr_rc:$vaddr, SReg_256:$srsrc,
-                                DMask:$dmask, UNorm:$unorm, GLC:$glc, SLC:$slc,
+                                DMask:$dmask, UNorm:$unorm, SCCB_0:$sccb, GLC:$glc, SLC:$slc,
                                 R128A16:$r128, TFE:$tfe, LWE:$lwe, DA:$da),
                            !if(BaseOpcode.HasD16, (ins D16:$d16), (ins)));
-  let AsmString = asm#" $vdata, $vaddr, $srsrc$dmask$unorm$glc$slc$r128$tfe$lwe$da"
+  let AsmString = asm#" $vdata, $vaddr, $srsrc$dmask$unorm$sccb$glc$slc$r128$tfe$lwe$da"
+                      #!if(BaseOpcode.HasD16, "$d16", "");
+}
+
+class MIMG_Store_Helper_gfx90a <mimgopc op, string asm,
+                                RegisterClass data_rc,
+                                RegisterClass addr_rc,
+                                string dns = "">
+  : MIMG_gfx90a<op.BASE, (outs), dns> {
+  let InOperandList = !con((ins getLdStRegisterOperand<data_rc>.ret:$vdata,
+                                addr_rc:$vaddr, SReg_256:$srsrc,
+                                DMask:$dmask, UNorm:$unorm, SCCB_0:$sccb, GLC:$glc, SLC:$slc,
+                                R128A16:$r128, LWE:$lwe, DA:$da),
+                           !if(BaseOpcode.HasD16, (ins D16:$d16), (ins)));
+  let AsmString = asm#" $vdata, $vaddr, $srsrc$dmask$unorm$sccb$glc$slc$r128$lwe$da"
                       #!if(BaseOpcode.HasD16, "$d16", "");
 }
 
@@ -393,6 +443,8 @@ multiclass MIMG_Store_Addr_Helper <mimgopc op, string asm,
       if op.HAS_BASE then {
         def _V1 : MIMG_Store_Helper <op, asm, data_rc, VGPR_32,
                                      !if(enableDisasm, "AMDGPU", "")>;
+        def _V1_gfx90a : MIMG_Store_Helper_gfx90a <op, asm, data_rc, VGPR_32,
+                                     !if(enableDisasm, "GFX90A", "")>;
         def _V1_gfx10 : MIMG_Store_gfx10 <op, asm, data_rc, VGPR_32,
                                           !if(enableDisasm, "AMDGPU", "")>;
       }
@@ -400,6 +452,7 @@ multiclass MIMG_Store_Addr_Helper <mimgopc op, string asm,
     let VAddrDwords = 2 in {
       if op.HAS_BASE then {
         def _V2 : MIMG_Store_Helper <op, asm, data_rc, VReg_64>;
+        def _V2_gfx90a : MIMG_Store_Helper_gfx90a <op, asm, data_rc, VReg_64>;
         def _V2_gfx10 : MIMG_Store_gfx10 <op, asm, data_rc, VReg_64>;
         def _V2_nsa_gfx10 : MIMG_Store_nsa_gfx10 <op, asm, data_rc, 2>;
       }
@@ -407,6 +460,7 @@ multiclass MIMG_Store_Addr_Helper <mimgopc op, string asm,
     let VAddrDwords = 3 in {
       if op.HAS_BASE then {
         def _V3 : MIMG_Store_Helper <op, asm, data_rc, VReg_96>;
+        def _V3_gfx90a : MIMG_Store_Helper_gfx90a <op, asm, data_rc, VReg_96>;
         def _V3_gfx10 : MIMG_Store_gfx10 <op, asm, data_rc, VReg_96>;
         def _V3_nsa_gfx10 : MIMG_Store_nsa_gfx10 <op, asm, data_rc, 3>;
       }
@@ -414,6 +468,7 @@ multiclass MIMG_Store_Addr_Helper <mimgopc op, string asm,
     let VAddrDwords = 4 in {
       if op.HAS_BASE then {
         def _V4 : MIMG_Store_Helper <op, asm, data_rc, VReg_128>;
+        def _V4_gfx90a : MIMG_Store_Helper_gfx90a <op, asm, data_rc, VReg_128>;
         def _V4_gfx10 : MIMG_Store_gfx10 <op, asm, data_rc, VReg_128>;
         def _V4_nsa_gfx10 : MIMG_Store_nsa_gfx10 <op, asm, data_rc, 4,
                                                          !if(enableDisasm, "AMDGPU", "")>;
@@ -450,9 +505,22 @@ class MIMG_Atomic_gfx6789_base <bits<8> op, string asm, RegisterClass data_rc,
   let AsmMatchConverter = "cvtMIMGAtomic";
 
   let InOperandList = (ins data_rc:$vdata, addr_rc:$vaddr, SReg_256:$srsrc,
-                           DMask:$dmask, UNorm:$unorm, GLC:$glc, SLC:$slc,
+                           DMask:$dmask, UNorm:$unorm, SCCB_0:$sccb, GLC:$glc, SLC:$slc,
                            R128A16:$r128, TFE:$tfe, LWE:$lwe, DA:$da);
-  let AsmString = asm#" $vdst, $vaddr, $srsrc$dmask$unorm$glc$slc$r128$tfe$lwe$da";
+  let AsmString = asm#" $vdst, $vaddr, $srsrc$dmask$unorm$sccb$glc$slc$r128$tfe$lwe$da";
+}
+
+class MIMG_Atomic_gfx90a_base <bits<8> op, string asm, RegisterClass data_rc,
+                               RegisterClass addr_rc, string dns="">
+  : MIMG_gfx90a <op, (outs getLdStRegisterOperand<data_rc>.ret:$vdst), dns> {
+  let Constraints = "$vdst = $vdata";
+  let AsmMatchConverter = "cvtMIMGAtomic";
+
+  let InOperandList = (ins getLdStRegisterOperand<data_rc>.ret:$vdata,
+                           addr_rc:$vaddr, SReg_256:$srsrc,
+                           DMask:$dmask, UNorm:$unorm, SCCB_0:$sccb, GLC:$glc, SLC:$slc,
+                           R128A16:$r128, LWE:$lwe, DA:$da);
+  let AsmString = asm#" $vdst, $vaddr, $srsrc$dmask$unorm$sccb$glc$slc$r128$lwe$da";
 }
 
 class MIMG_Atomic_si<mimgopc op, string asm, RegisterClass data_rc,
@@ -465,10 +533,17 @@ class MIMG_Atomic_si<mimgopc op, string asm, RegisterClass data_rc,
 class MIMG_Atomic_vi<mimgopc op, string asm, RegisterClass data_rc,
                      RegisterClass addr_rc, bit enableDasm = 0>
   : MIMG_Atomic_gfx6789_base<op.VI, asm, data_rc, addr_rc, !if(enableDasm, "GFX8", "")> {
-  let AssemblerPredicate = isGFX8GFX9;
+  let AssemblerPredicate = isGFX8GFX9NotGFX90A;
   let MIMGEncoding = MIMGEncGfx8;
 }
 
+class MIMG_Atomic_gfx90a<mimgopc op, string asm, RegisterClass data_rc,
+                         RegisterClass addr_rc, bit enableDasm = 0>
+  : MIMG_Atomic_gfx90a_base<op.VI, asm, data_rc, addr_rc, !if(enableDasm, "GFX90A", "")> {
+  let AssemblerPredicate = isGFX90APlus;
+  let MIMGEncoding = MIMGEncGfx90a;
+}
+
 class MIMG_Atomic_gfx10<mimgopc op, string opcode,
                         RegisterClass DataRC, RegisterClass AddrRC,
                         bit enableDisasm = 0>
@@ -512,6 +587,7 @@ multiclass MIMG_Atomic_Addr_Helper_m <mimgopc op, string asm,
       }
       if op.HAS_VI then {
         def _V1_vi : MIMG_Atomic_vi <op, asm, data_rc, VGPR_32, enableDasm>;
+        def _V1_gfx90a : MIMG_Atomic_gfx90a <op, asm, data_rc, VGPR_32, enableDasm>;
       }
       if op.HAS_BASE then {
         def _V1_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VGPR_32, enableDasm>;
@@ -523,6 +599,7 @@ multiclass MIMG_Atomic_Addr_Helper_m <mimgopc op, string asm,
       }
       if op.HAS_VI then {
         def _V2_vi : MIMG_Atomic_vi <op, asm, data_rc, VReg_64, 0>;
+        def _V2_gfx90a : MIMG_Atomic_gfx90a <op, asm, data_rc, VReg_64, 0>;
       }
       if op.HAS_BASE then {
         def _V2_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VReg_64, 0>;
@@ -535,6 +612,7 @@ multiclass MIMG_Atomic_Addr_Helper_m <mimgopc op, string asm,
       }
       if op.HAS_VI then {
         def _V3_vi : MIMG_Atomic_vi <op, asm, data_rc, VReg_96, 0>;
+        def _V3_gfx90a : MIMG_Atomic_gfx90a <op, asm, data_rc, VReg_96, 0>;
       }
       if op.HAS_BASE then {
         def _V3_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VReg_96, 0>;
@@ -542,12 +620,13 @@ multiclass MIMG_Atomic_Addr_Helper_m <mimgopc op, string asm,
       }
     }
     let VAddrDwords = 4 in {
-      if op.HAS_VI then {
-        def _V4_vi : MIMG_Atomic_vi <op, asm, data_rc, VReg_128, 0>;
-      }
       if op.HAS_SI then {
         def _V4_si : MIMG_Atomic_si <op, asm, data_rc, VReg_128, 0>;
       }
+      if op.HAS_VI then {
+        def _V4_vi : MIMG_Atomic_vi <op, asm, data_rc, VReg_128, 0>;
+        def _V4_gfx90a : MIMG_Atomic_gfx90a <op, asm, data_rc, VReg_128, 0>;
+      }
       if op.HAS_BASE then {
         def _V4_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VReg_128, 0>;
         def _V4_nsa_gfx10 : MIMG_Atomic_nsa_gfx10 <op, asm, data_rc, 4, enableDasm>;
@@ -579,10 +658,21 @@ class MIMG_Sampler_Helper <mimgopc op, string asm, RegisterClass dst_rc,
                            RegisterClass src_rc, string dns="">
   : MIMG_gfx6789 <op.BASE, (outs dst_rc:$vdata), dns> {
   let InOperandList = !con((ins src_rc:$vaddr, SReg_256:$srsrc, SReg_128:$ssamp,
-                                DMask:$dmask, UNorm:$unorm, GLC:$glc, SLC:$slc,
+                                DMask:$dmask, UNorm:$unorm, SCCB_0:$sccb, GLC:$glc, SLC:$slc,
                                 R128A16:$r128, TFE:$tfe, LWE:$lwe, DA:$da),
                            !if(BaseOpcode.HasD16, (ins D16:$d16), (ins)));
-  let AsmString = asm#" $vdata, $vaddr, $srsrc, $ssamp$dmask$unorm$glc$slc$r128$tfe$lwe$da"
+  let AsmString = asm#" $vdata, $vaddr, $srsrc, $ssamp$dmask$unorm$sccb$glc$slc$r128$tfe$lwe$da"
+                      #!if(BaseOpcode.HasD16, "$d16", "");
+}
+
+class MIMG_Sampler_gfx90a<mimgopc op, string asm, RegisterClass dst_rc,
+                          RegisterClass src_rc, string dns="">
+  : MIMG_gfx90a<op.BASE, (outs getLdStRegisterOperand<dst_rc>.ret:$vdata), dns> {
+  let InOperandList = !con((ins src_rc:$vaddr, SReg_256:$srsrc, SReg_128:$ssamp,
+                                DMask:$dmask, UNorm:$unorm, SCCB_0:$sccb, GLC:$glc, SLC:$slc,
+                                R128A16:$r128, LWE:$lwe, DA:$da),
+                           !if(BaseOpcode.HasD16, (ins D16:$d16), (ins)));
+  let AsmString = asm#" $vdata, $vaddr, $srsrc, $ssamp$dmask$unorm$sccb$glc$slc$r128$lwe$da"
                       #!if(BaseOpcode.HasD16, "$d16", "");
 }
 
@@ -686,13 +776,18 @@ class MIMG_Sampler_AddrSizes<AMDGPUSampleVariant sample> {
 
 multiclass MIMG_Sampler_Src_Helper <mimgopc op, string asm,
                                     AMDGPUSampleVariant sample, RegisterClass dst_rc,
-                                    bit enableDisasm = 0> {
+                                    bit enableDisasm = 0,
+                                    bit ExtendedImageInst = 1> {
   foreach addr = MIMG_Sampler_AddrSizes<sample>.MachineInstrs in {
     let VAddrDwords = addr.NumWords in {
       if op.HAS_BASE then {
         def _V # addr.NumWords
           : MIMG_Sampler_Helper <op, asm, dst_rc, addr.RegClass,
                                  !if(!and(enableDisasm, addr.Disassemble), "AMDGPU", "")>;
+        foreach _ = BoolToList<!eq(ExtendedImageInst, 0)>.ret in
+        def _V # addr.NumWords # _gfx90a
+          : MIMG_Sampler_gfx90a <op, asm, dst_rc, addr.RegClass,
+                                 !if(!and(enableDisasm, addr.Disassemble), "GFX90A", "")>;
         def _V # addr.NumWords # _gfx10
           : MIMG_Sampler_gfx10 <op, asm, dst_rc, addr.RegClass,
                                  !if(!and(enableDisasm, addr.Disassemble), "AMDGPU", "")>;
@@ -721,7 +816,8 @@ class MIMG_Sampler_BaseOpcode<AMDGPUSampleVariant sample>
 
 multiclass MIMG_Sampler <mimgopc op, AMDGPUSampleVariant sample, bit wqm = 0,
                          bit isG16 = 0, bit isGetLod = 0,
-                         string asm = "image_sample"#sample.LowerCaseMod#!if(isG16, "_g16", "")> {
+                         string asm = "image_sample"#sample.LowerCaseMod#!if(isG16, "_g16", ""),
+                         bit ExtendedImageInst = !ne(sample.LowerCaseMod, "")> {
   def "" : MIMG_Sampler_BaseOpcode<sample> {
     let HasD16 = !not(isGetLod);
     let G16 = isG16;
@@ -730,15 +826,15 @@ multiclass MIMG_Sampler <mimgopc op, AMDGPUSampleVariant sample, bit wqm = 0,
   let BaseOpcode = !cast<MIMGBaseOpcode>(NAME), WQM = wqm,
       mayLoad = !not(isGetLod) in {
     let VDataDwords = 1 in
-    defm _V1 : MIMG_Sampler_Src_Helper<op, asm, sample, VGPR_32, 1>;
+    defm _V1 : MIMG_Sampler_Src_Helper<op, asm, sample, VGPR_32, 1, ExtendedImageInst>;
     let VDataDwords = 2 in
-    defm _V2 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_64>;
+    defm _V2 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_64, 0, ExtendedImageInst>;
     let VDataDwords = 3 in
-    defm _V3 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_96>;
+    defm _V3 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_96, 0, ExtendedImageInst>;
     let VDataDwords = 4 in
-    defm _V4 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_128>;
+    defm _V4 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_128, 0, ExtendedImageInst>;
     let VDataDwords = 5 in
-    defm _V5 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_160>;
+    defm _V5 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_160, 0, ExtendedImageInst>;
   }
 }
 
@@ -848,7 +944,9 @@ defm IMAGE_ATOMIC_DEC           : MIMG_Atomic <mimgopc<0x1c>, "image_atomic_dec"
 defm IMAGE_ATOMIC_FCMPSWAP      : MIMG_Atomic <mimgopc<0x1d, MIMG.NOP>, "image_atomic_fcmpswap", 0, 1>;
 defm IMAGE_ATOMIC_FMIN          : MIMG_Atomic <mimgopc<0x1e, MIMG.NOP>, "image_atomic_fmin", 0, 1>;
 defm IMAGE_ATOMIC_FMAX          : MIMG_Atomic <mimgopc<0x1f, MIMG.NOP>, "image_atomic_fmax", 0, 1>;
+
 defm IMAGE_SAMPLE               : MIMG_Sampler_WQM <mimgopc<0x20>, AMDGPUSample>;
+let OtherPredicates = [HasExtendedImageInsts] in {
 defm IMAGE_SAMPLE_CL            : MIMG_Sampler_WQM <mimgopc<0x21>, AMDGPUSample_cl>;
 defm IMAGE_SAMPLE_D             : MIMG_Sampler <mimgopc<0x22>, AMDGPUSample_d>;
 defm IMAGE_SAMPLE_D_CL          : MIMG_Sampler <mimgopc<0x23>, AMDGPUSample_d_cl>;
@@ -932,12 +1030,12 @@ defm IMAGE_SAMPLE_CD_O_G16      : MIMG_Sampler <mimgopc<0xec>, AMDGPUSample_cd_o
 defm IMAGE_SAMPLE_CD_CL_O_G16   : MIMG_Sampler <mimgopc<0xed>, AMDGPUSample_cd_cl_o, 0, 1>;
 defm IMAGE_SAMPLE_C_CD_O_G16    : MIMG_Sampler <mimgopc<0xee>, AMDGPUSample_c_cd_o, 0, 1>;
 defm IMAGE_SAMPLE_C_CD_CL_O_G16 : MIMG_Sampler <mimgopc<0xef>, AMDGPUSample_c_cd_cl_o, 0, 1>;
-
-//def IMAGE_RSRC256 : MIMG_NoPattern_RSRC256 <"image_rsrc256", mimgopc<0x7e>>;
-//def IMAGE_SAMPLER : MIMG_NoPattern_ <"image_sampler", mimgopc<0x7f>>;
+} // End OtherPredicates = [HasExtendedImageInsts]
+//def IMAGE_RSRC256 : MIMG_NoPattern_RSRC256 <"image_rsrc256", 0x0000007e>;
+//def IMAGE_SAMPLER : MIMG_NoPattern_ <"image_sampler", 0x0000007f>;
 
 let SubtargetPredicate = HasGFX10_BEncoding in
-defm IMAGE_MSAA_LOAD : MIMG_NoSampler <mimgopc<0x80>, "image_msaa_load", 1>;
+defm IMAGE_MSAA_LOAD : MIMG_NoSampler <mimgopc<0x80>, "image_msaa_load", 1, 0, 0, 1>;
 
 defm IMAGE_BVH_INTERSECT_RAY       : MIMG_IntersectRay<mimgopc<0xe6>, "image_bvh_intersect_ray", 11, 0>;
 defm IMAGE_BVH_INTERSECT_RAY_a16   : MIMG_IntersectRay<mimgopc<0xe6>, "image_bvh_intersect_ray", 8, 1>;

diff  --git a/llvm/lib/Target/AMDGPU/SIAddIMGInit.cpp b/llvm/lib/Target/AMDGPU/SIAddIMGInit.cpp
index 3b753cb66ead..7d3a60a9b20c 100644
--- a/llvm/lib/Target/AMDGPU/SIAddIMGInit.cpp
+++ b/llvm/lib/Target/AMDGPU/SIAddIMGInit.cpp
@@ -77,7 +77,7 @@ bool SIAddIMGInit::runOnMachineFunction(MachineFunction &MF) {
         if (!TFE && !LWE) // intersect_ray
           continue;
 
-        unsigned TFEVal = TFE->getImm();
+        unsigned TFEVal = TFE ? TFE->getImm() : 0;
         unsigned LWEVal = LWE->getImm();
         unsigned D16Val = D16 ? D16->getImm() : 0;
 

diff  --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h
index 9b4e1048ab5c..5efea77b1bbd 100644
--- a/llvm/lib/Target/AMDGPU/SIDefines.h
+++ b/llvm/lib/Target/AMDGPU/SIDefines.h
@@ -142,6 +142,8 @@ namespace AMDGPU {
     OPERAND_REG_IMM_FP16,
     OPERAND_REG_IMM_V2FP16,
     OPERAND_REG_IMM_V2INT16,
+    OPERAND_REG_IMM_V2INT32,
+    OPERAND_REG_IMM_V2FP32,
 
     /// Operands with register or inline constant
     OPERAND_REG_INLINE_C_INT16,
@@ -150,25 +152,30 @@ namespace AMDGPU {
     OPERAND_REG_INLINE_C_FP16,
     OPERAND_REG_INLINE_C_FP32,
     OPERAND_REG_INLINE_C_FP64,
-    OPERAND_REG_INLINE_C_V2FP16,
     OPERAND_REG_INLINE_C_V2INT16,
+    OPERAND_REG_INLINE_C_V2FP16,
+    OPERAND_REG_INLINE_C_V2INT32,
+    OPERAND_REG_INLINE_C_V2FP32,
 
     /// Operands with an AccVGPR register or inline constant
     OPERAND_REG_INLINE_AC_INT16,
     OPERAND_REG_INLINE_AC_INT32,
     OPERAND_REG_INLINE_AC_FP16,
     OPERAND_REG_INLINE_AC_FP32,
-    OPERAND_REG_INLINE_AC_V2FP16,
+    OPERAND_REG_INLINE_AC_FP64,
     OPERAND_REG_INLINE_AC_V2INT16,
+    OPERAND_REG_INLINE_AC_V2FP16,
+    OPERAND_REG_INLINE_AC_V2INT32,
+    OPERAND_REG_INLINE_AC_V2FP32,
 
     OPERAND_REG_IMM_FIRST = OPERAND_REG_IMM_INT32,
-    OPERAND_REG_IMM_LAST = OPERAND_REG_IMM_V2INT16,
+    OPERAND_REG_IMM_LAST = OPERAND_REG_IMM_V2FP32,
 
     OPERAND_REG_INLINE_C_FIRST = OPERAND_REG_INLINE_C_INT16,
-    OPERAND_REG_INLINE_C_LAST = OPERAND_REG_INLINE_AC_V2INT16,
+    OPERAND_REG_INLINE_C_LAST = OPERAND_REG_INLINE_AC_V2FP32,
 
     OPERAND_REG_INLINE_AC_FIRST = OPERAND_REG_INLINE_AC_INT16,
-    OPERAND_REG_INLINE_AC_LAST = OPERAND_REG_INLINE_AC_V2INT16,
+    OPERAND_REG_INLINE_AC_LAST = OPERAND_REG_INLINE_AC_V2FP32,
 
     OPERAND_SRC_FIRST = OPERAND_REG_IMM_INT32,
     OPERAND_SRC_LAST = OPERAND_REG_INLINE_C_LAST,
@@ -680,6 +687,8 @@ enum DppCtrl : unsigned {
   BCAST31           = 0x143,
   DPP_UNUSED8_FIRST = 0x144,
   DPP_UNUSED8_LAST  = 0x14F,
+  ROW_NEWBCAST_FIRST= 0x150,
+  ROW_NEWBCAST_LAST = 0x15F,
   ROW_SHARE_FIRST   = 0x150,
   ROW_SHARE_LAST    = 0x15F,
   ROW_XMASK_FIRST   = 0x160,

diff  --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index d5fa9afded27..a0be4940acf9 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -97,6 +97,9 @@ class SIFoldOperands : public MachineFunctionPass {
 
   std::pair<const MachineOperand *, int> isOMod(const MachineInstr &MI) const;
   bool tryFoldOMod(MachineInstr &MI);
+  bool tryFoldRegSeqence(MachineInstr &MI);
+  bool tryFoldLCSSAPhi(MachineInstr &MI);
+  bool tryFoldLoad(MachineInstr &MI);
 
 public:
   SIFoldOperands() : MachineFunctionPass(ID) {
@@ -135,6 +138,8 @@ static unsigned macToMad(unsigned Opc) {
     return AMDGPU::V_FMA_F16_gfx9_e64;
   case AMDGPU::V_FMAC_LEGACY_F32_e64:
     return AMDGPU::V_FMA_LEGACY_F32_e64;
+  case AMDGPU::V_FMAC_F64_e64:
+    return AMDGPU::V_FMA_F64_e64;
   }
   return AMDGPU::INSTRUCTION_LIST_END;
 }
@@ -531,8 +536,10 @@ static bool tryToFoldACImm(const SIInstrInfo *TII,
     return false;
 
   uint8_t OpTy = OpInfo[UseOpIdx].OperandType;
-  if (OpTy < AMDGPU::OPERAND_REG_INLINE_AC_FIRST ||
-      OpTy > AMDGPU::OPERAND_REG_INLINE_AC_LAST)
+  if ((OpTy < AMDGPU::OPERAND_REG_INLINE_AC_FIRST ||
+       OpTy > AMDGPU::OPERAND_REG_INLINE_AC_LAST) &&
+      (OpTy < AMDGPU::OPERAND_REG_INLINE_C_FIRST ||
+       OpTy > AMDGPU::OPERAND_REG_INLINE_C_LAST))
     return false;
 
   if (OpToFold.isImm() && TII->isInlineConstant(OpToFold, OpTy) &&
@@ -554,6 +561,19 @@ static bool tryToFoldACImm(const SIInstrInfo *TII,
     return false;
 
   MachineRegisterInfo &MRI = UseMI->getParent()->getParent()->getRegInfo();
+
+  // Maybe it is just a COPY of an immediate itself.
+  MachineInstr *Def = MRI.getUniqueVRegDef(UseReg);
+  MachineOperand &UseOp = UseMI->getOperand(UseOpIdx);
+  if (!UseOp.getSubReg() && Def && TII->isFoldableCopy(*Def)) {
+    MachineOperand &DefOp = Def->getOperand(1);
+    if (DefOp.isImm() && TII->isInlineConstant(DefOp, OpTy) &&
+        TII->isOperandLegal(*UseMI, UseOpIdx, &DefOp)) {
+      UseMI->getOperand(UseOpIdx).ChangeToImmediate(DefOp.getImm());
+      return true;
+    }
+  }
+
   SmallVector<std::pair<MachineOperand*, unsigned>, 32> Defs;
   if (!getRegSeqInit(Defs, UseReg, OpTy, TII, MRI))
     return false;
@@ -825,6 +845,10 @@ void SIFoldOperands::foldOperand(
       else if (TRI->isVGPR(*MRI, UseMI->getOperand(0).getReg()) &&
                TRI->isAGPR(*MRI, UseMI->getOperand(1).getReg()))
         UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_READ_B32_e64));
+      else if (ST->hasGFX90AInsts() &&
+               TRI->isAGPR(*MRI, UseMI->getOperand(0).getReg()) &&
+               TRI->isAGPR(*MRI, UseMI->getOperand(1).getReg()))
+        UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_MOV_B32));
       return;
     }
 
@@ -1502,6 +1526,194 @@ bool SIFoldOperands::tryFoldOMod(MachineInstr &MI) {
   return true;
 }
 
+// Try to fold a reg_sequence with vgpr output and agpr inputs into an
+// instruction which can take an agpr. So far that means a store.
+bool SIFoldOperands::tryFoldRegSeqence(MachineInstr &MI) {
+  assert(MI.isRegSequence());
+  auto Reg = MI.getOperand(0).getReg();
+
+  if (!ST->hasGFX90AInsts() || !TRI->isVGPR(*MRI, Reg) ||
+      !MRI->hasOneNonDBGUse(Reg))
+    return false;
+
+  SmallVector<std::pair<MachineOperand*, unsigned>, 32> Defs;
+  if (!getRegSeqInit(Defs, Reg, MCOI::OPERAND_REGISTER, TII, *MRI))
+    return false;
+
+  for (auto &Def : Defs) {
+    const auto *Op = Def.first;
+    if (!Op->isReg())
+      return false;
+    if (TRI->isAGPR(*MRI, Op->getReg()))
+      continue;
+    // Maybe this is a COPY from AREG
+    const MachineInstr *SubDef = MRI->getUniqueVRegDef(Op->getReg());
+    if (!SubDef || !SubDef->isCopy() || SubDef->getOperand(1).getSubReg())
+      return false;
+    if (!TRI->isAGPR(*MRI, SubDef->getOperand(1).getReg()))
+      return false;
+  }
+
+  MachineOperand *Op = &*MRI->use_nodbg_begin(Reg);
+  MachineInstr *UseMI = Op->getParent();
+  while (UseMI->isCopy() && !Op->getSubReg()) {
+    Reg = UseMI->getOperand(0).getReg();
+    if (!TRI->isVGPR(*MRI, Reg) || !MRI->hasOneNonDBGUse(Reg))
+      return false;
+    Op = &*MRI->use_nodbg_begin(Reg);
+    UseMI = Op->getParent();
+  }
+
+  if (Op->getSubReg())
+    return false;
+
+  unsigned OpIdx = Op - &UseMI->getOperand(0);
+  const MCInstrDesc &InstDesc = UseMI->getDesc();
+  const MCOperandInfo &OpInfo = InstDesc.OpInfo[OpIdx];
+  switch (OpInfo.RegClass) {
+  case AMDGPU::AV_32RegClassID:  LLVM_FALLTHROUGH;
+  case AMDGPU::AV_64RegClassID:  LLVM_FALLTHROUGH;
+  case AMDGPU::AV_96RegClassID:  LLVM_FALLTHROUGH;
+  case AMDGPU::AV_128RegClassID: LLVM_FALLTHROUGH;
+  case AMDGPU::AV_160RegClassID:
+    break;
+  default:
+    return false;
+  }
+
+  const auto *NewDstRC = TRI->getEquivalentAGPRClass(MRI->getRegClass(Reg));
+  auto Dst = MRI->createVirtualRegister(NewDstRC);
+  auto RS = BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
+                    TII->get(AMDGPU::REG_SEQUENCE), Dst);
+
+  for (unsigned I = 0; I < Defs.size(); ++I) {
+    MachineOperand *Def = Defs[I].first;
+    Def->setIsKill(false);
+    if (TRI->isAGPR(*MRI, Def->getReg())) {
+      RS.add(*Def);
+    } else { // This is a copy
+      MachineInstr *SubDef = MRI->getUniqueVRegDef(Def->getReg());
+      SubDef->getOperand(1).setIsKill(false);
+      RS.addReg(SubDef->getOperand(1).getReg(), 0, Def->getSubReg());
+    }
+    RS.addImm(Defs[I].second);
+  }
+
+  Op->setReg(Dst);
+  if (!TII->isOperandLegal(*UseMI, OpIdx, Op)) {
+    Op->setReg(Reg);
+    RS->eraseFromParent();
+    return false;
+  }
+
+  LLVM_DEBUG(dbgs() << "Folded " << *RS << " into " << *UseMI << '\n');
+
+  return true;
+}
+
+// Try to hoist an AGPR to VGPR copy out of the loop across a LCSSA PHI.
+// This should allow folding of an AGPR into a consumer which may support it.
+// I.e.:
+//
+// loop:                             // loop:
+//   %1:vreg = COPY %0:areg          // exit:
+// exit:                          => //   %1:areg = PHI %0:areg, %loop
+//   %2:vreg = PHI %1:vreg, %loop    //   %2:vreg = COPY %1:areg
+bool SIFoldOperands::tryFoldLCSSAPhi(MachineInstr &PHI) {
+  assert(PHI.isPHI());
+
+  if (PHI.getNumExplicitOperands() != 3) // Single input LCSSA PHI
+    return false;
+
+  Register PhiIn = PHI.getOperand(1).getReg();
+  Register PhiOut = PHI.getOperand(0).getReg();
+  if (PHI.getOperand(1).getSubReg() ||
+      !TRI->isVGPR(*MRI, PhiIn) || !TRI->isVGPR(*MRI, PhiOut))
+    return false;
+
+  // A single use should not matter for correctness, but if it has another use
+  // inside the loop we may perform copy twice in a worst case.
+  if (!MRI->hasOneNonDBGUse(PhiIn))
+    return false;
+
+  MachineInstr *Copy = MRI->getUniqueVRegDef(PhiIn);
+  if (!Copy || !Copy->isCopy())
+    return false;
+
+  Register CopyIn = Copy->getOperand(1).getReg();
+  if (!TRI->isAGPR(*MRI, CopyIn) || Copy->getOperand(1).getSubReg())
+    return false;
+
+  const TargetRegisterClass *ARC = MRI->getRegClass(CopyIn);
+  Register NewReg = MRI->createVirtualRegister(ARC);
+  PHI.getOperand(1).setReg(CopyIn);
+  PHI.getOperand(0).setReg(NewReg);
+
+  MachineBasicBlock *MBB = PHI.getParent();
+  BuildMI(*MBB, MBB->getFirstNonPHI(), Copy->getDebugLoc(),
+          TII->get(AMDGPU::COPY), PhiOut)
+    .addReg(NewReg, RegState::Kill);
+  Copy->eraseFromParent(); // We know this copy had a single use.
+
+  LLVM_DEBUG(dbgs() << "Folded " << PHI << '\n');
+
+  return true;
+}
+
+// Attempt to convert VGPR load to an AGPR load.
+bool SIFoldOperands::tryFoldLoad(MachineInstr &MI) {
+  assert(MI.mayLoad());
+  if (!ST->hasGFX90AInsts() || !MI.getNumOperands())
+    return false;
+
+  MachineOperand &Def = MI.getOperand(0);
+  if (!Def.isDef())
+    return false;
+
+  Register DefReg = Def.getReg();
+
+  if (DefReg.isPhysical() || !TRI->isVGPR(*MRI, DefReg))
+    return false;
+
+  SmallVector<const MachineInstr*, 8> Users;
+  SmallVector<Register, 8> MoveRegs;
+  for (const MachineInstr &I : MRI->use_nodbg_instructions(DefReg)) {
+    Users.push_back(&I);
+  }
+  if (Users.empty())
+    return false;
+
+  // Check that all uses a copy to an agpr or a reg_sequence producing an agpr.
+  while (!Users.empty()) {
+    const MachineInstr *I = Users.pop_back_val();
+    if (!I->isCopy() && !I->isRegSequence())
+      return false;
+    Register DstReg = I->getOperand(0).getReg();
+    if (TRI->isAGPR(*MRI, DstReg))
+      continue;
+    MoveRegs.push_back(DstReg);
+    for (const MachineInstr &U : MRI->use_nodbg_instructions(DstReg)) {
+      Users.push_back(&U);
+    }
+  }
+
+  const TargetRegisterClass *RC = MRI->getRegClass(DefReg);
+  MRI->setRegClass(DefReg, TRI->getEquivalentAGPRClass(RC));
+  if (!TII->isOperandLegal(MI, 0, &Def)) {
+    MRI->setRegClass(DefReg, RC);
+    return false;
+  }
+
+  while (!MoveRegs.empty()) {
+    Register Reg = MoveRegs.pop_back_val();
+    MRI->setRegClass(Reg, TRI->getEquivalentAGPRClass(MRI->getRegClass(Reg)));
+  }
+
+  LLVM_DEBUG(dbgs() << "Folded " << MI << '\n');
+
+  return true;
+}
+
 bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
   if (skipFunction(MF.getFunction()))
     return false;
@@ -1529,6 +1741,15 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
 
       tryFoldInst(TII, &MI);
 
+      if (MI.isRegSequence() && tryFoldRegSeqence(MI))
+        continue;
+
+      if (MI.isPHI() && tryFoldLCSSAPhi(MI))
+        continue;
+
+      if (MI.mayLoad() && tryFoldLoad(MI))
+        continue;
+
       if (!TII->isFoldableCopy(MI)) {
         // Saw an unknown clobber of m0, so we no longer know what it is.
         if (CurrentKnownM0Val && MI.modifiesRegister(AMDGPU::M0, TRI))

diff  --git a/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp b/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp
index b0ec5ac4a5b9..d874aa143f27 100644
--- a/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp
@@ -188,11 +188,24 @@ void SIFormMemoryClauses::forAllLanes(Register Reg, LaneBitmask LaneMask,
     return MaskA.getHighestLane() > MaskB.getHighestLane();
   });
 
+  MCRegister RepReg;
+  for (MCRegister R : *MRI->getRegClass(Reg)) {
+    if (!MRI->isReserved(R)) {
+      RepReg = R;
+      break;
+    }
+  }
+  if (!RepReg)
+    llvm_unreachable("Failed to find required allocatable register");
+
   for (unsigned Idx : CoveringSubregs) {
     LaneBitmask SubRegMask = TRI->getSubRegIndexLaneMask(Idx);
     if ((SubRegMask & ~LaneMask).any() || (SubRegMask & LaneMask).none())
       continue;
 
+    if (MRI->isReserved(TRI->getSubReg(RepReg, Idx)))
+      continue;
+
     Func(Idx);
     LaneMask &= ~SubRegMask;
     if (LaneMask.none())
@@ -261,7 +274,7 @@ bool SIFormMemoryClauses::checkPressure(const MachineInstr &MI,
   // tracking does not account for the alignment requirements for SGPRs, or the
   // fragmentation of registers the allocator will need to satisfy.
   if (Occupancy >= MFI->getMinAllowedOccupancy() &&
-      MaxPressure.getVGPRNum() <= MaxVGPRs / 2 &&
+      MaxPressure.getVGPRNum(ST->hasGFX90AInsts()) <= MaxVGPRs / 2 &&
       MaxPressure.getSGPRNum() <= MaxSGPRs / 2) {
     LastRecordedOccupancy = Occupancy;
     return true;

diff  --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
index 6548dde08065..8d5f5145e26b 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -138,6 +138,7 @@ static void buildPrologSpill(const GCNSubtarget &ST, LivePhysRegs &LiveRegs,
         .addImm(0) // glc
         .addImm(0) // slc
         .addImm(0) // dlc
+        .addImm(0) // scc
         .addMemOperand(MMO);
       return;
     }
@@ -152,6 +153,7 @@ static void buildPrologSpill(const GCNSubtarget &ST, LivePhysRegs &LiveRegs,
       .addImm(0) // tfe
       .addImm(0) // dlc
       .addImm(0) // swz
+      .addImm(0) // scc
       .addMemOperand(MMO);
     return;
   }
@@ -181,6 +183,7 @@ static void buildPrologSpill(const GCNSubtarget &ST, LivePhysRegs &LiveRegs,
         .addImm(0) // glc
         .addImm(0) // slc
         .addImm(0) // dlc
+        .addImm(0) // scc
         .addMemOperand(MMO);
 
     if (!HasOffsetReg) {
@@ -207,6 +210,7 @@ static void buildPrologSpill(const GCNSubtarget &ST, LivePhysRegs &LiveRegs,
           .addImm(0) // tfe
           .addImm(0) // dlc
           .addImm(0) // swz
+          .addImm(0) // scc
           .addMemOperand(MMO);
     } else {
       // No free register, use stack pointer and restore afterwards.
@@ -224,6 +228,7 @@ static void buildPrologSpill(const GCNSubtarget &ST, LivePhysRegs &LiveRegs,
           .addImm(0) // tfe
           .addImm(0) // dlc
           .addImm(0) // swz
+          .addImm(0) // scc
           .addMemOperand(MMO);
 
       BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_SUB_U32), SPReg)
@@ -257,6 +262,7 @@ static void buildEpilogReload(const GCNSubtarget &ST, LivePhysRegs &LiveRegs,
         .addImm(0) // glc
         .addImm(0) // slc
         .addImm(0) // dlc
+        .addImm(0) // scc
         .addMemOperand(MMO);
       return;
     }
@@ -275,6 +281,7 @@ static void buildEpilogReload(const GCNSubtarget &ST, LivePhysRegs &LiveRegs,
         .addImm(0) // glc
         .addImm(0) // slc
         .addImm(0) // dlc
+        .addImm(0) // scc
         .addMemOperand(MMO);
     return;
   }
@@ -290,6 +297,7 @@ static void buildEpilogReload(const GCNSubtarget &ST, LivePhysRegs &LiveRegs,
       .addImm(0) // tfe
       .addImm(0) // dlc
       .addImm(0) // swz
+      .addImm(0) // scc
       .addMemOperand(MMO);
     return;
   }
@@ -313,6 +321,7 @@ static void buildEpilogReload(const GCNSubtarget &ST, LivePhysRegs &LiveRegs,
     .addImm(0) // tfe
     .addImm(0) // dlc
     .addImm(0) // swz
+    .addImm(0) // scc
     .addMemOperand(MMO);
 }
 
@@ -1311,7 +1320,13 @@ void SIFrameLowering::determineCalleeSaves(MachineFunction &MF,
   const SIRegisterInfo *TRI = ST.getRegisterInfo();
 
   // Ignore the SGPRs the default implementation found.
-  SavedVGPRs.clearBitsNotInMask(TRI->getAllVGPRRegMask());
+  SavedVGPRs.clearBitsNotInMask(TRI->getAllVectorRegMask());
+
+  // Do not save AGPRs prior to GFX90A because there was no easy way to do so.
+  // In gfx908 there was do AGPR loads and stores and thus spilling also
+  // require a temporary VGPR.
+  if (!ST.hasGFX90AInsts())
+    SavedVGPRs.clearBitsInMask(TRI->getAllAGPRRegMask());
 
   // hasFP only knows about stack objects that already exist. We're now
   // determining the stack slots that will be created, so we have to predict
@@ -1366,7 +1381,7 @@ void SIFrameLowering::determineCalleeSavesSGPR(MachineFunction &MF,
   SavedRegs.reset(MFI->getStackPtrOffsetReg());
 
   const BitVector AllSavedRegs = SavedRegs;
-  SavedRegs.clearBitsInMask(TRI->getAllVGPRRegMask());
+  SavedRegs.clearBitsInMask(TRI->getAllVectorRegMask());
 
   // If clearing VGPRs changed the mask, we will have some CSR VGPR spills.
   const bool HaveAnyCSRVGPR = SavedRegs != AllSavedRegs;

diff  --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index ca2b50509bcd..c36ed0902d2f 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -718,6 +718,19 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
     setOperationAction(ISD::FEXP, MVT::v2f16, Custom);
     setOperationAction(ISD::SELECT, MVT::v4i16, Custom);
     setOperationAction(ISD::SELECT, MVT::v4f16, Custom);
+
+    if (Subtarget->hasPackedFP32Ops()) {
+      setOperationAction(ISD::FADD, MVT::v2f32, Legal);
+      setOperationAction(ISD::FMUL, MVT::v2f32, Legal);
+      setOperationAction(ISD::FMA,  MVT::v2f32, Legal);
+      setOperationAction(ISD::FNEG, MVT::v2f32, Legal);
+
+      for (MVT VT : { MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32 }) {
+        setOperationAction(ISD::FADD, VT, Custom);
+        setOperationAction(ISD::FMUL, VT, Custom);
+        setOperationAction(ISD::FMA, VT, Custom);
+      }
+    }
   }
 
   setOperationAction(ISD::FNEG, MVT::v4f16, Custom);
@@ -1128,17 +1141,6 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
                  MachineMemOperand::MOVolatile;
     return true;
   }
-  case Intrinsic::amdgcn_global_atomic_fadd: {
-    Info.opc = ISD::INTRINSIC_W_CHAIN;
-    Info.memVT = MVT::getVT(CI.getType());
-    Info.ptrVal = CI.getOperand(0);
-    Info.align.reset();
-    Info.flags = MachineMemOperand::MOLoad |
-                 MachineMemOperand::MOStore |
-                 MachineMemOperand::MODereferenceable |
-                 MachineMemOperand::MOVolatile;
-    return true;
-  }
   case Intrinsic::amdgcn_image_bvh_intersect_ray: {
     SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
     Info.opc = ISD::INTRINSIC_W_CHAIN;
@@ -1150,6 +1152,22 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
                  MachineMemOperand::MODereferenceable;
     return true;
   }
+  case Intrinsic::amdgcn_global_atomic_fadd:
+  case Intrinsic::amdgcn_global_atomic_fmin:
+  case Intrinsic::amdgcn_global_atomic_fmax:
+  case Intrinsic::amdgcn_flat_atomic_fadd:
+  case Intrinsic::amdgcn_flat_atomic_fmin:
+  case Intrinsic::amdgcn_flat_atomic_fmax: {
+    Info.opc = ISD::INTRINSIC_W_CHAIN;
+    Info.memVT = MVT::getVT(CI.getType());
+    Info.ptrVal = CI.getOperand(0);
+    Info.align.reset();
+    Info.flags = MachineMemOperand::MOLoad |
+                 MachineMemOperand::MOStore |
+                 MachineMemOperand::MODereferenceable |
+                 MachineMemOperand::MOVolatile;
+    return true;
+  }
   case Intrinsic::amdgcn_ds_gws_init:
   case Intrinsic::amdgcn_ds_gws_barrier:
   case Intrinsic::amdgcn_ds_gws_sema_v:
@@ -1191,6 +1209,9 @@ bool SITargetLowering::getAddrModeArguments(IntrinsicInst *II,
   case Intrinsic::amdgcn_ds_fmin:
   case Intrinsic::amdgcn_ds_fmax:
   case Intrinsic::amdgcn_global_atomic_fadd:
+  case Intrinsic::amdgcn_flat_atomic_fadd:
+  case Intrinsic::amdgcn_flat_atomic_fmin:
+  case Intrinsic::amdgcn_flat_atomic_fmax:
   case Intrinsic::amdgcn_global_atomic_csub: {
     Value *Ptr = II->getArgOperand(0);
     AccessTy = II->getType();
@@ -1799,23 +1820,37 @@ void SITargetLowering::allocateSpecialEntryInputVGPRs(CCState &CCInfo,
     MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
 
     CCInfo.AllocateReg(Reg);
-    Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg));
+    unsigned Mask = (Subtarget->hasPackedTID() &&
+                     Info.hasWorkItemIDY()) ? 0x3ff : ~0u;
+    Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
   }
 
   if (Info.hasWorkItemIDY()) {
-    Register Reg = AMDGPU::VGPR1;
-    MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
+    assert(Info.hasWorkItemIDX());
+    if (Subtarget->hasPackedTID()) {
+      Info.setWorkItemIDY(ArgDescriptor::createRegister(AMDGPU::VGPR0,
+                                                        0x3ff << 10));
+    } else {
+      unsigned Reg = AMDGPU::VGPR1;
+      MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
 
-    CCInfo.AllocateReg(Reg);
-    Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg));
+      CCInfo.AllocateReg(Reg);
+      Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg));
+    }
   }
 
   if (Info.hasWorkItemIDZ()) {
-    Register Reg = AMDGPU::VGPR2;
-    MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
+    assert(Info.hasWorkItemIDX() && Info.hasWorkItemIDY());
+    if (Subtarget->hasPackedTID()) {
+      Info.setWorkItemIDZ(ArgDescriptor::createRegister(AMDGPU::VGPR0,
+                                                        0x3ff << 20));
+    } else {
+      unsigned Reg = AMDGPU::VGPR2;
+      MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
 
-    CCInfo.AllocateReg(Reg);
-    Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg));
+      CCInfo.AllocateReg(Reg);
+      Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg));
+    }
   }
 }
 
@@ -4380,7 +4415,8 @@ SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op,
                                               SelectionDAG &DAG) const {
   unsigned Opc = Op.getOpcode();
   EVT VT = Op.getValueType();
-  assert(VT == MVT::v4i16 || VT == MVT::v4f16);
+  assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||
+         VT == MVT::v8f32 || VT == MVT::v16f32 || VT == MVT::v32f32);
 
   SDValue Lo0, Hi0;
   std::tie(Lo0, Hi0) = DAG.SplitVectorOperand(Op.getNode(), 0);
@@ -4401,7 +4437,8 @@ SDValue SITargetLowering::splitTernaryVectorOp(SDValue Op,
                                               SelectionDAG &DAG) const {
   unsigned Opc = Op.getOpcode();
   EVT VT = Op.getValueType();
-  assert(VT == MVT::v4i16 || VT == MVT::v4f16);
+  assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||
+         VT == MVT::v8f32 || VT == MVT::v16f32 || VT == MVT::v32f32);
 
   SDValue Lo0, Hi0;
   std::tie(Lo0, Hi0) = DAG.SplitVectorOperand(Op.getNode(), 0);
@@ -6168,6 +6205,8 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
   if (IsGFX10Plus)
     Ops.push_back(DAG.getTargetConstant(DimInfo->Encoding, DL, MVT::i32));
   Ops.push_back(Unorm);
+  if (!IsGFX10Plus)
+    Ops.push_back(DAG.getTargetConstant(0, SDLoc(), MVT::i1));
   if (IsGFX10Plus)
     Ops.push_back(DLC);
   Ops.push_back(GLC);
@@ -6176,8 +6215,12 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
                 ST->hasFeature(AMDGPU::FeatureR128A16) ? True : False);
   if (IsGFX10Plus)
     Ops.push_back(IsA16 ? True : False);
-  Ops.push_back(TFE);
-  Ops.push_back(LWE);
+  if (!Subtarget->hasGFX90AInsts()) {
+    Ops.push_back(TFE); //tfe
+  } else if (cast<ConstantSDNode>(TFE)->getZExtValue()) {
+    report_fatal_error("TFE is not supported on this GPU");
+  }
+  Ops.push_back(LWE); // lwe
   if (!IsGFX10Plus)
     Ops.push_back(DimInfo->DA ? True : False);
   if (BaseOpcode->HasD16)
@@ -6195,7 +6238,15 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
                                           : AMDGPU::MIMGEncGfx10Default,
                                    NumVDataDwords, NumVAddrDwords);
   } else {
-    if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
+    if (Subtarget->hasGFX90AInsts()) {
+      Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx90a,
+                                     NumVDataDwords, NumVAddrDwords);
+      if (Opcode == -1)
+        report_fatal_error(
+            "requested image instruction is not supported on this GPU");
+    }
+    if (Opcode == -1 &&
+        Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
       Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
                                      NumVDataDwords, NumVAddrDwords);
     if (Opcode == -1)
@@ -7062,7 +7113,7 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
       Opcode = AMDGPUISD::BUFFER_ATOMIC_XOR;
       break;
     case Intrinsic::amdgcn_buffer_atomic_fadd:
-      if (!Op.getValue(0).use_empty()) {
+      if (!Op.getValue(0).use_empty() && !Subtarget->hasGFX90AInsts()) {
         DiagnosticInfoUnsupported
           NoFpRet(DAG.getMachineFunction().getFunction(),
                   "return versions of fp atomics not supported",
@@ -7083,6 +7134,14 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
     return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD);
   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
     return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD);
+  case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
+    return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMIN);
+  case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
+    return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMIN);
+  case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
+    return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMAX);
+  case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
+    return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMAX);
   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
     return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SWAP);
   case Intrinsic::amdgcn_raw_buffer_atomic_add:
@@ -7208,27 +7267,6 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
     return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
                                    Op->getVTList(), Ops, VT, M->getMemOperand());
   }
-  case Intrinsic::amdgcn_global_atomic_fadd: {
-    if (!Op.getValue(0).use_empty()) {
-      DiagnosticInfoUnsupported
-        NoFpRet(DAG.getMachineFunction().getFunction(),
-                "return versions of fp atomics not supported",
-                DL.getDebugLoc(), DS_Error);
-      DAG.getContext()->diagnose(NoFpRet);
-      return SDValue();
-    }
-    MemSDNode *M = cast<MemSDNode>(Op);
-    SDValue Ops[] = {
-      M->getOperand(0), // Chain
-      M->getOperand(2), // Ptr
-      M->getOperand(3)  // Value
-    };
-
-    EVT VT = Op.getOperand(3).getValueType();
-    return DAG.getAtomic(ISD::ATOMIC_LOAD_FADD, DL, VT,
-                         DAG.getVTList(VT, MVT::Other), Ops,
-                         M->getMemOperand());
-  }
   case Intrinsic::amdgcn_image_bvh_intersect_ray: {
     SDLoc DL(Op);
     MemSDNode *M = cast<MemSDNode>(Op);
@@ -7299,7 +7337,55 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
     DAG.setNodeMemRefs(NewNode, {MemRef});
     return SDValue(NewNode, 0);
   }
+  case Intrinsic::amdgcn_global_atomic_fadd:
+    if (!Op.getValue(0).use_empty() && !Subtarget->hasGFX90AInsts()) {
+      DiagnosticInfoUnsupported
+        NoFpRet(DAG.getMachineFunction().getFunction(),
+                "return versions of fp atomics not supported",
+                DL.getDebugLoc(), DS_Error);
+      DAG.getContext()->diagnose(NoFpRet);
+      return SDValue();
+    }
+    LLVM_FALLTHROUGH;
+  case Intrinsic::amdgcn_global_atomic_fmin:
+  case Intrinsic::amdgcn_global_atomic_fmax:
+  case Intrinsic::amdgcn_flat_atomic_fadd:
+  case Intrinsic::amdgcn_flat_atomic_fmin:
+  case Intrinsic::amdgcn_flat_atomic_fmax: {
+    MemSDNode *M = cast<MemSDNode>(Op);
+    SDValue Ops[] = {
+      M->getOperand(0), // Chain
+      M->getOperand(2), // Ptr
+      M->getOperand(3)  // Value
+    };
+    unsigned Opcode = 0;
+    switch (IntrID) {
+    case Intrinsic::amdgcn_global_atomic_fadd:
+    case Intrinsic::amdgcn_flat_atomic_fadd: {
+      EVT VT = Op.getOperand(3).getValueType();
+      return DAG.getAtomic(ISD::ATOMIC_LOAD_FADD, DL, VT,
+                           DAG.getVTList(VT, MVT::Other), Ops,
+                           M->getMemOperand());
+    }
+    case Intrinsic::amdgcn_global_atomic_fmin:
+    case Intrinsic::amdgcn_flat_atomic_fmin: {
+      Opcode = AMDGPUISD::ATOMIC_LOAD_FMIN;
+      break;
+    }
+    case Intrinsic::amdgcn_global_atomic_fmax:
+    case Intrinsic::amdgcn_flat_atomic_fmax: {
+      Opcode = AMDGPUISD::ATOMIC_LOAD_FMAX;
+      break;
+    }
+    default:
+      llvm_unreachable("unhandled atomic opcode");
+    }
+    return DAG.getMemIntrinsicNode(Opcode, SDLoc(Op),
+                                   M->getVTList(), Ops, M->getMemoryVT(),
+                                   M->getMemOperand());
+  }
   default:
+
     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
             AMDGPU::getImageDimIntrinsicInfo(IntrID))
       return lowerImage(Op, ImageDimIntr, DAG, true);
@@ -10813,7 +10899,7 @@ SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
   unsigned NewDmask = 0;
   unsigned TFEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::tfe) - 1;
   unsigned LWEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::lwe) - 1;
-  bool UsesTFC = (Node->getConstantOperandVal(TFEIdx) ||
+  bool UsesTFC = ((int(TFEIdx) >= 0 && Node->getConstantOperandVal(TFEIdx)) ||
                   Node->getConstantOperandVal(LWEIdx)) ? 1 : 0;
   unsigned TFCLane = 0;
   bool HasChain = Node->getNumValues() > 1;
@@ -11768,26 +11854,39 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
     if (Ty->isHalfTy())
       return AtomicExpansionKind::None;
 
-    if (!Ty->isFloatTy())
+    if (!Ty->isFloatTy() && (!Subtarget->hasGFX90AInsts() || !Ty->isDoubleTy()))
       return AtomicExpansionKind::CmpXChg;
 
     // TODO: Do have these for flat. Older targets also had them for buffers.
     unsigned AS = RMW->getPointerAddressSpace();
 
-    if (AS == AMDGPUAS::GLOBAL_ADDRESS && Subtarget->hasAtomicFaddInsts()) {
+    if ((AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) &&
+         Subtarget->hasAtomicFaddInsts()) {
       if (!fpModeMatchesGlobalFPAtomicMode(RMW) ||
           RMW->getFunction()->getFnAttribute("amdgpu-unsafe-fp-atomics")
               .getValueAsString() != "true")
         return AtomicExpansionKind::CmpXChg;
 
+      if (Subtarget->hasGFX90AInsts())
+        return (Ty->isFloatTy() && AS == AMDGPUAS::FLAT_ADDRESS) ?
+          AtomicExpansionKind::CmpXChg : AtomicExpansionKind::None;
+
+      if (!Subtarget->hasGFX90AInsts() && AS != AMDGPUAS::GLOBAL_ADDRESS)
+        return AtomicExpansionKind::CmpXChg;
+
       return RMW->use_empty() ? AtomicExpansionKind::None :
                                 AtomicExpansionKind::CmpXChg;
     }
 
     // DS FP atomics do repect the denormal mode, but the rounding mode is fixed
     // to round-to-nearest-even.
-    return (AS == AMDGPUAS::LOCAL_ADDRESS && Subtarget->hasLDSFPAtomics()) ?
-      AtomicExpansionKind::None : AtomicExpansionKind::CmpXChg;
+    // The only exception is DS_ADD_F64 which never flushes regardless of mode.
+    if (AS == AMDGPUAS::LOCAL_ADDRESS && Subtarget->hasLDSFPAtomics()) {
+      return (Ty->isDoubleTy() && !fpModeMatchesGlobalFPAtomicMode(RMW)) ?
+        AtomicExpansionKind::CmpXChg : AtomicExpansionKind::None;
+    }
+
+    return AtomicExpansionKind::CmpXChg;
   }
   default:
     break;

diff  --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index c5110f8aa013..ffc651f3b86e 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -131,7 +131,8 @@ static const unsigned WaitEventMaskForInst[NUM_INST_CNTS] = {
 // We reserve a fixed number of VGPR slots in the scoring tables for
 // special tokens like SCMEM_LDS (needed for buffer load to LDS).
 enum RegisterMapping {
-  SQ_MAX_PGM_VGPRS = 256, // Maximum programmable VGPRs across all targets.
+  SQ_MAX_PGM_VGPRS = 512, // Maximum programmable VGPRs across all targets.
+  AGPR_OFFSET = 226, // Maximum programmable ArchVGPRs across all targets.
   SQ_MAX_PGM_SGPRS = 256, // Maximum programmable SGPRs across all targets.
   NUM_EXTRA_VGPRS = 1,    // A reserved slot for DS.
   EXTRA_VGPR_LDS = 0,     // This is a placeholder the Shader algorithm uses.
@@ -451,8 +452,7 @@ RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI,
                                             const SIRegisterInfo *TRI,
                                             unsigned OpNo) const {
   const MachineOperand &Op = MI->getOperand(OpNo);
-  assert(Op.isReg());
-  if (!TRI->isInAllocatableClass(Op.getReg()) || TRI->isAGPR(*MRI, Op.getReg()))
+  if (!TRI->isInAllocatableClass(Op.getReg()))
     return {-1, -1};
 
   // A use via a PW operand does not need a waitcnt.
@@ -463,9 +463,11 @@ RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI,
 
   unsigned Reg = TRI->getEncodingValue(AMDGPU::getMCReg(Op.getReg(), *ST));
 
-  if (TRI->isVGPR(*MRI, Op.getReg())) {
+  if (TRI->isVectorRegister(*MRI, Op.getReg())) {
     assert(Reg >= RegisterEncoding.VGPR0 && Reg <= RegisterEncoding.VGPRL);
     Result.first = Reg - RegisterEncoding.VGPR0;
+    if (TRI->isAGPR(*MRI, Op.getReg()))
+      Result.first += AGPR_OFFSET;
     assert(Result.first >= 0 && Result.first < SQ_MAX_PGM_VGPRS);
   } else if (TRI->isSGPRReg(*MRI, Op.getReg())) {
     assert(Reg >= RegisterEncoding.SGPR0 && Reg < SQ_MAX_PGM_SGPRS);
@@ -491,7 +493,7 @@ void WaitcntBrackets::setExpScore(const MachineInstr *MI,
                                   const MachineRegisterInfo *MRI, unsigned OpNo,
                                   unsigned Val) {
   RegInterval Interval = getRegInterval(MI, TII, MRI, TRI, OpNo);
-  assert(TRI->isVGPR(*MRI, MI->getOperand(OpNo).getReg()));
+  assert(TRI->isVectorRegister(*MRI, MI->getOperand(OpNo).getReg()));
   for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
     setRegScore(RegNo, EXP_CNT, Val);
   }
@@ -549,7 +551,8 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
                  Inst.getOpcode() != AMDGPU::DS_ORDERED_COUNT) {
         for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
           const MachineOperand &Op = Inst.getOperand(I);
-          if (Op.isReg() && !Op.isDef() && TRI->isVGPR(*MRI, Op.getReg())) {
+          if (Op.isReg() && !Op.isDef() &&
+              TRI->isVectorRegister(*MRI, Op.getReg())) {
             setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
           }
         }
@@ -606,7 +609,8 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
       }
       for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
         MachineOperand &MO = Inst.getOperand(I);
-        if (MO.isReg() && !MO.isDef() && TRI->isVGPR(*MRI, MO.getReg())) {
+        if (MO.isReg() && !MO.isDef() &&
+            TRI->isVectorRegister(*MRI, MO.getReg())) {
           setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
         }
       }
@@ -1003,7 +1007,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(
         RegInterval Interval =
             ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, I);
 
-        const bool IsVGPR = TRI->isVGPR(*MRI, Op.getReg());
+        const bool IsVGPR = TRI->isVectorRegister(*MRI, Op.getReg());
         for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
           if (IsVGPR) {
             // RAW always needs an s_waitcnt. WAW needs an s_waitcnt unless the
@@ -1208,6 +1212,10 @@ bool SIInsertWaitcnts::mayAccessLDSThroughFlat(const MachineInstr &MI) const {
   if (!TII->usesLGKM_CNT(MI))
     return false;
 
+  // If in tgsplit mode then there can be no use of LDS.
+  if (ST->isTgSplitEnabled())
+    return false;
+
   // If there are no memory operands then conservatively assume the flat
   // operation may access LDS.
   if (MI.memoperands_empty())

diff  --git a/llvm/lib/Target/AMDGPU/SIInstrFormats.td b/llvm/lib/Target/AMDGPU/SIInstrFormats.td
index 92d148c1677e..c0ec91d78839 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrFormats.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrFormats.td
@@ -278,7 +278,7 @@ class VINTRPe <bits<2> op> : Enc32 {
 }
 
 class MIMGe : Enc64 {
-  bits<8> vdata;
+  bits<10> vdata;
   bits<4> dmask;
   bits<1> unorm;
   bits<1> glc;
@@ -294,11 +294,10 @@ class MIMGe : Enc64 {
   let Inst{12} = unorm;
   let Inst{13} = glc;
   let Inst{15} = r128;
-  let Inst{16} = tfe;
   let Inst{17} = lwe;
   let Inst{25} = slc;
   let Inst{31-26} = 0x3c;
-  let Inst{47-40} = vdata;
+  let Inst{47-40} = vdata{7-0};
   let Inst{52-48} = srsrc{6-2};
   let Inst{57-53} = ssamp{6-2};
   let Inst{63} = d16;
@@ -307,9 +306,25 @@ class MIMGe : Enc64 {
 class MIMGe_gfx6789 <bits<8> op> : MIMGe {
   bits<8> vaddr;
   bits<1> da;
+  bits<1> sccb;
+
+  let Inst{0} = op{7};
+  let Inst{7} = sccb;
+  let Inst{14} = da;
+  let Inst{16} = tfe;
+  let Inst{24-18} = op{6-0};
+  let Inst{39-32} = vaddr;
+}
+
+class MIMGe_gfx90a <bits<8> op> : MIMGe {
+  bits<8> vaddr;
+  bits<1> da;
+  bits<1> sccb;
 
   let Inst{0} = op{7};
+  let Inst{7} = sccb;
   let Inst{14} = da;
+  let Inst{16} = vdata{9}; // ACC bit
   let Inst{24-18} = op{6-0};
   let Inst{39-32} = vaddr;
 }
@@ -325,6 +340,7 @@ class MIMGe_gfx10 <bits<8> op> : MIMGe {
   let Inst{2-1} = nsa;
   let Inst{5-3} = dim;
   let Inst{7} = dlc;
+  let Inst{16} = tfe;
   let Inst{24-18} = op{6-0};
   let Inst{39-32} = vaddr0;
   let Inst{62} = a16;

diff  --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 7b08c709368f..162bbd744d6b 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -576,15 +576,18 @@ static void indirectCopyToAGPR(const SIInstrInfo &TII,
   if (!Tmp)
     report_fatal_error("Cannot scavenge VGPR to copy to AGPR");
   RS.setRegUsed(Tmp);
-  // Only loop through if there are any free registers left, otherwise
-  // scavenger may report a fatal error without emergency spill slot
-  // or spill with the slot.
-  while (RegNo-- && RS.FindUnusedReg(&AMDGPU::VGPR_32RegClass)) {
-    Register Tmp2 = RS.scavengeRegister(&AMDGPU::VGPR_32RegClass, 0);
-    if (!Tmp2 || RI.getHWRegIndex(Tmp2) >= MaxVGPRs)
-      break;
-    Tmp = Tmp2;
-    RS.setRegUsed(Tmp);
+
+  if (!TII.getSubtarget().hasGFX90AInsts()) {
+    // Only loop through if there are any free registers left, otherwise
+    // scavenger may report a fatal error without emergency spill slot
+    // or spill with the slot.
+    while (RegNo-- && RS.FindUnusedReg(&AMDGPU::VGPR_32RegClass)) {
+      Register Tmp2 = RS.scavengeRegister(&AMDGPU::VGPR_32RegClass, 0);
+      if (!Tmp2 || RI.getHWRegIndex(Tmp2) >= MaxVGPRs)
+        break;
+      Tmp = Tmp2;
+      RS.setRegUsed(Tmp);
+    }
   }
 
   // Insert copy to temporary VGPR.
@@ -782,7 +785,6 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
     return;
   }
 
-
   if (RC == &AMDGPU::AGPR_32RegClass) {
     if (AMDGPU::VGPR_32RegClass.contains(SrcReg)) {
       BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
@@ -790,6 +792,12 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
       return;
     }
 
+    if (AMDGPU::AGPR_32RegClass.contains(SrcReg) && ST.hasGFX90AInsts()) {
+      BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_MOV_B32), DestReg)
+        .addReg(SrcReg, getKillRegState(KillSrc));
+      return;
+    }
+
     // FIXME: Pass should maintain scavenger to avoid scan through the block on
     // every AGPR spill.
     RegScavenger RS;
@@ -797,7 +805,8 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
     return;
   }
 
-  if (RI.getRegSizeInBits(*RC) == 16) {
+  const unsigned Size = RI.getRegSizeInBits(*RC);
+  if (Size == 16) {
     assert(AMDGPU::VGPR_LO16RegClass.contains(SrcReg) ||
            AMDGPU::VGPR_HI16RegClass.contains(SrcReg) ||
            AMDGPU::SReg_LO16RegClass.contains(SrcReg) ||
@@ -863,6 +872,24 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
     return;
   }
 
+  if (RC == &AMDGPU::VReg_64RegClass &&
+      !RI.hasAGPRs(RI.getPhysRegClass(SrcReg))) {
+    if (ST.hasPackedFP32Ops()) {
+      BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestReg)
+        .addImm(SISrcMods::OP_SEL_1)
+        .addReg(SrcReg)
+        .addImm(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1)
+        .addReg(SrcReg)
+        .addImm(0) // op_sel_lo
+        .addImm(0) // op_sel_hi
+        .addImm(0) // neg_lo
+        .addImm(0) // neg_hi
+        .addImm(0) // clamp
+        .addReg(SrcReg, getKillRegState(KillSrc) | RegState::Implicit);
+      return;
+    }
+  }
+
   const bool Forward = RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg);
   if (RI.isSGPRClass(RC)) {
     if (!RI.isSGPRClass(RI.getPhysRegClass(SrcReg))) {
@@ -873,12 +900,20 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
     return;
   }
 
+  unsigned EltSize = 4;
   unsigned Opcode = AMDGPU::V_MOV_B32_e32;
   if (RI.hasAGPRs(RC)) {
-    Opcode = RI.hasVGPRs(RI.getPhysRegClass(SrcReg)) ?
+    Opcode = (RI.hasVGPRs(RI.getPhysRegClass(SrcReg))) ?
       AMDGPU::V_ACCVGPR_WRITE_B32_e64 : AMDGPU::INSTRUCTION_LIST_END;
   } else if (RI.hasVGPRs(RC) && RI.hasAGPRs(RI.getPhysRegClass(SrcReg))) {
     Opcode = AMDGPU::V_ACCVGPR_READ_B32_e64;
+  } else if ((Size % 64 == 0) && RI.hasVGPRs(RC) &&
+             !RI.hasAGPRs(RI.getPhysRegClass(SrcReg))) {
+    // TODO: In 96-bit case, could do a 64-bit mov and then a 32-bit mov.
+    if (ST.hasPackedFP32Ops()) {
+      Opcode = AMDGPU::V_PK_MOV_B32;
+      EltSize = 8;
+    }
   }
 
   // For the cases where we need an intermediate instruction/temporary register
@@ -890,7 +925,7 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
   if (Opcode == AMDGPU::INSTRUCTION_LIST_END)
     RS.reset(new RegScavenger());
 
-  ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RC, 4);
+  ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RC, EltSize);
 
   // If there is an overlap, we can't kill the super-register on the last
   // instruction, since it will also kill the components made live by this def.
@@ -911,6 +946,23 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
       indirectCopyToAGPR(*this, MBB, MI, DL, RI.getSubReg(DestReg, SubIdx),
                          RI.getSubReg(SrcReg, SubIdx), UseKill, *RS,
                          ImpDefSuper, ImpUseSuper);
+    } else if (Opcode == AMDGPU::V_PK_MOV_B32) {
+      Register DstSubReg = RI.getSubReg(DestReg, SubIdx);
+      Register SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
+      MachineInstrBuilder MIB =
+        BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DstSubReg)
+        .addImm(SISrcMods::OP_SEL_1)
+        .addReg(SrcSubReg)
+        .addImm(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1)
+        .addReg(SrcSubReg)
+        .addImm(0) // op_sel_lo
+        .addImm(0) // op_sel_hi
+        .addImm(0) // neg_lo
+        .addImm(0) // neg_hi
+        .addImm(0) // clamp
+        .addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
+      if (Idx == 0)
+        MIB.addReg(DestReg, RegState::Define | RegState::Implicit);
     } else {
       MachineInstrBuilder Builder =
         BuildMI(MBB, MI, DL, get(Opcode), RI.getSubReg(DestReg, SubIdx))
@@ -1663,20 +1715,49 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
     assert(!SrcOp.isFPImm());
     if (SrcOp.isImm()) {
       APInt Imm(64, SrcOp.getImm());
-      BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
-        .addImm(Imm.getLoBits(32).getZExtValue())
-        .addReg(Dst, RegState::Implicit | RegState::Define);
-      BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
-        .addImm(Imm.getHiBits(32).getZExtValue())
-        .addReg(Dst, RegState::Implicit | RegState::Define);
+      APInt Lo(32, Imm.getLoBits(32).getZExtValue());
+      APInt Hi(32, Imm.getHiBits(32).getZExtValue());
+      if (ST.hasPackedFP32Ops() && Lo == Hi && isInlineConstant(Lo)) {
+        BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst)
+          .addImm(SISrcMods::OP_SEL_1)
+          .addImm(Lo.getSExtValue())
+          .addImm(SISrcMods::OP_SEL_1)
+          .addImm(Lo.getSExtValue())
+          .addImm(0)  // op_sel_lo
+          .addImm(0)  // op_sel_hi
+          .addImm(0)  // neg_lo
+          .addImm(0)  // neg_hi
+          .addImm(0); // clamp
+      } else {
+        BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
+          .addImm(Lo.getZExtValue())
+          .addReg(Dst, RegState::Implicit | RegState::Define);
+        BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
+          .addImm(Hi.getZExtValue())
+          .addReg(Dst, RegState::Implicit | RegState::Define);
+      }
     } else {
       assert(SrcOp.isReg());
-      BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
-        .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0))
-        .addReg(Dst, RegState::Implicit | RegState::Define);
-      BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
-        .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1))
-        .addReg(Dst, RegState::Implicit | RegState::Define);
+      if (ST.hasPackedFP32Ops() &&
+          !RI.isAGPR(MBB.getParent()->getRegInfo(), SrcOp.getReg())) {
+        BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst)
+          .addImm(SISrcMods::OP_SEL_1) // src0_mod
+          .addReg(SrcOp.getReg())
+          .addImm(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1) // src1_mod
+          .addReg(SrcOp.getReg())
+          .addImm(0)  // op_sel_lo
+          .addImm(0)  // op_sel_hi
+          .addImm(0)  // neg_lo
+          .addImm(0)  // neg_hi
+          .addImm(0); // clamp
+      } else {
+        BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
+          .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0))
+          .addReg(Dst, RegState::Implicit | RegState::Define);
+        BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
+          .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1))
+          .addReg(Dst, RegState::Implicit | RegState::Define);
+      }
     }
     MI.eraseFromParent();
     break;
@@ -1890,7 +1971,6 @@ SIInstrInfo::expandMovDPP64(MachineInstr &MI) const {
   unsigned Part = 0;
   MachineInstr *Split[2];
 
-
   for (auto Sub : { AMDGPU::sub0, AMDGPU::sub1 }) {
     auto MovDPP = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_dpp));
     if (Dst.isPhysical()) {
@@ -2609,6 +2689,7 @@ bool SIInstrInfo::isFoldableCopy(const MachineInstr &MI) const {
   case AMDGPU::COPY:
   case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
   case AMDGPU::V_ACCVGPR_READ_B32_e64:
+  case AMDGPU::V_ACCVGPR_MOV_B32:
     return true;
   default:
     return false;
@@ -2999,7 +3080,9 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB,
   unsigned Opc = MI.getOpcode();
   bool IsF16 = false;
   bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F32_e64 ||
-               Opc == AMDGPU::V_FMAC_F16_e32 || Opc == AMDGPU::V_FMAC_F16_e64;
+               Opc == AMDGPU::V_FMAC_F16_e32 || Opc == AMDGPU::V_FMAC_F16_e64 ||
+               Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64;
+  bool IsF64 = Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64;
 
   switch (Opc) {
   default:
@@ -3010,13 +3093,15 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB,
     LLVM_FALLTHROUGH;
   case AMDGPU::V_MAC_F32_e64:
   case AMDGPU::V_FMAC_F32_e64:
+  case AMDGPU::V_FMAC_F64_e64:
     break;
   case AMDGPU::V_MAC_F16_e32:
   case AMDGPU::V_FMAC_F16_e32:
     IsF16 = true;
     LLVM_FALLTHROUGH;
   case AMDGPU::V_MAC_F32_e32:
-  case AMDGPU::V_FMAC_F32_e32: {
+  case AMDGPU::V_FMAC_F32_e32:
+  case AMDGPU::V_FMAC_F64_e32: {
     int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
                                              AMDGPU::OpName::src0);
     const MachineOperand *Src0 = &MI.getOperand(Src0Idx);
@@ -3042,7 +3127,7 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB,
   const MachineOperand *Omod = getNamedOperand(MI, AMDGPU::OpName::omod);
   MachineInstrBuilder MIB;
 
-  if (!Src0Mods && !Src1Mods && !Clamp && !Omod &&
+  if (!Src0Mods && !Src1Mods && !Clamp && !Omod && !IsF64 &&
       // If we have an SGPR input, we will violate the constant bus restriction.
       (ST.getConstantBusLimit(Opc) > 1 || !Src0->isReg() ||
        !RI.isSGPRReg(MBB->getParent()->getRegInfo(), Src0->getReg()))) {
@@ -3090,7 +3175,9 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB,
     }
   }
 
-  unsigned NewOpc = IsFMA ? (IsF16 ? AMDGPU::V_FMA_F16_e64 : AMDGPU::V_FMA_F32_e64)
+  unsigned NewOpc = IsFMA ? (IsF16 ? AMDGPU::V_FMA_F16_e64
+                                   : IsF64 ? AMDGPU::V_FMA_F64_e64
+                                           : AMDGPU::V_FMA_F32_e64)
                           : (IsF16 ? AMDGPU::V_MAD_F16_e64 : AMDGPU::V_MAD_F32_e64);
   if (pseudoToMCOpcode(NewOpc) == -1)
     return nullptr;
@@ -3278,6 +3365,10 @@ bool SIInstrInfo::isInlineConstant(const MachineOperand &MO,
   case AMDGPU::OPERAND_REG_IMM_FP32:
   case AMDGPU::OPERAND_REG_INLINE_C_INT32:
   case AMDGPU::OPERAND_REG_INLINE_C_FP32:
+  case AMDGPU::OPERAND_REG_IMM_V2FP32:
+  case AMDGPU::OPERAND_REG_INLINE_C_V2FP32:
+  case AMDGPU::OPERAND_REG_IMM_V2INT32:
+  case AMDGPU::OPERAND_REG_INLINE_C_V2INT32:
   case AMDGPU::OPERAND_REG_INLINE_AC_INT32:
   case AMDGPU::OPERAND_REG_INLINE_AC_FP32: {
     int32_t Trunc = static_cast<int32_t>(Imm);
@@ -3287,6 +3378,7 @@ bool SIInstrInfo::isInlineConstant(const MachineOperand &MO,
   case AMDGPU::OPERAND_REG_IMM_FP64:
   case AMDGPU::OPERAND_REG_INLINE_C_INT64:
   case AMDGPU::OPERAND_REG_INLINE_C_FP64:
+  case AMDGPU::OPERAND_REG_INLINE_AC_FP64:
     return AMDGPU::isInlinableLiteral64(MO.getImm(),
                                         ST.hasInv2PiInlineImm());
   case AMDGPU::OPERAND_REG_IMM_INT16:
@@ -3398,6 +3490,10 @@ bool SIInstrInfo::isImmOperandLegal(const MachineInstr &MI, unsigned OpNo,
 }
 
 bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const {
+  // GFX90A does not have V_MUL_LEGACY_F32_e32.
+  if (Opcode == AMDGPU::V_MUL_LEGACY_F32_e64 && ST.hasGFX90AInsts())
+    return false;
+
   int Op32 = AMDGPU::getVOPe32(Opcode);
   if (Op32 == -1)
     return false;
@@ -3455,6 +3551,7 @@ bool SIInstrInfo::canShrink(const MachineInstr &MI,
       case AMDGPU::V_MAC_F16_e64:
       case AMDGPU::V_FMAC_F32_e64:
       case AMDGPU::V_FMAC_F16_e64:
+      case AMDGPU::V_FMAC_F64_e64:
         if (!Src2->isReg() || !RI.isVGPR(MRI, Src2->getReg()) ||
             hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers))
           return false;
@@ -3706,7 +3803,8 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
     case AMDGPU::OPERAND_REG_INLINE_AC_INT32:
     case AMDGPU::OPERAND_REG_INLINE_AC_FP32:
     case AMDGPU::OPERAND_REG_INLINE_AC_INT16:
-    case AMDGPU::OPERAND_REG_INLINE_AC_FP16: {
+    case AMDGPU::OPERAND_REG_INLINE_AC_FP16:
+    case AMDGPU::OPERAND_REG_INLINE_AC_FP64: {
       const MachineOperand &MO = MI.getOperand(i);
       if (!MO.isReg() && (!MO.isImm() || !isInlineConstant(MI, i))) {
         ErrInfo = "Illegal immediate value for operand.";
@@ -4203,12 +4301,68 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
     }
     if (DC >= DppCtrl::ROW_SHARE_FIRST && DC <= DppCtrl::ROW_XMASK_LAST &&
         ST.getGeneration() < AMDGPUSubtarget::GFX10) {
+      if (DC >= DppCtrl::ROW_NEWBCAST_FIRST &&
+          DC <= DppCtrl::ROW_NEWBCAST_LAST &&
+          !ST.hasGFX90AInsts()) {
+        ErrInfo = "Invalid dpp_ctrl value: "
+                  "row_newbroadcast/row_share is not supported before "
+                  "GFX90A/GFX10";
+        return false;
+      } else if (DC > DppCtrl::ROW_NEWBCAST_LAST || !ST.hasGFX90AInsts()) {
+        ErrInfo = "Invalid dpp_ctrl value: "
+                  "row_share and row_xmask are not supported before GFX10";
+        return false;
+      }
+    }
+
+    int DstIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdst);
+    int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
+
+    if (Opcode != AMDGPU::V_MOV_B64_DPP_PSEUDO &&
+        ((DstIdx >= 0 &&
+          Desc.OpInfo[DstIdx].RegClass == AMDGPU::VReg_64RegClassID) ||
+        ((Src0Idx >= 0 &&
+          Desc.OpInfo[Src0Idx].RegClass == AMDGPU::VReg_64RegClassID))) &&
+        !AMDGPU::isLegal64BitDPPControl(DC)) {
       ErrInfo = "Invalid dpp_ctrl value: "
-                "row_share and row_xmask are not supported before GFX10";
+                "64 bit dpp only support row_newbcast";
       return false;
     }
   }
 
+  if ((MI.mayStore() || MI.mayLoad()) && !isVGPRSpill(MI)) {
+    const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
+    uint16_t DataNameIdx = isDS(Opcode) ? AMDGPU::OpName::data0
+                                        : AMDGPU::OpName::vdata;
+    const MachineOperand *Data = getNamedOperand(MI, DataNameIdx);
+    const MachineOperand *Data2 = getNamedOperand(MI, AMDGPU::OpName::data1);
+    if (Data && !Data->isReg())
+      Data = nullptr;
+
+    if (ST.hasGFX90AInsts()) {
+      if (Dst && Data &&
+          (RI.isAGPR(MRI, Dst->getReg()) != RI.isAGPR(MRI, Data->getReg()))) {
+        ErrInfo = "Invalid register class: "
+                  "vdata and vdst should be both VGPR or AGPR";
+        return false;
+      }
+      if (Data && Data2 &&
+          (RI.isAGPR(MRI, Data->getReg()) != RI.isAGPR(MRI, Data2->getReg()))) {
+        ErrInfo = "Invalid register class: "
+                  "both data operands should be VGPR or AGPR";
+        return false;
+      }
+    } else {
+      if ((Dst && RI.isAGPR(MRI, Dst->getReg())) ||
+          (Data && RI.isAGPR(MRI, Data->getReg())) ||
+          (Data2 && RI.isAGPR(MRI, Data2->getReg()))) {
+        ErrInfo = "Invalid register class: "
+                  "agpr loads and stores not supported on this GPU";
+        return false;
+      }
+    }
+  }
+
   return true;
 }
 
@@ -4292,6 +4446,59 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
       "Unexpected scalar opcode without corresponding vector one!");
 }
 
+static unsigned adjustAllocatableRegClass(const GCNSubtarget &ST,
+                                          const MachineRegisterInfo &MRI,
+                                          const MCInstrDesc &TID,
+                                          unsigned RCID,
+                                          bool IsAllocatable) {
+  if ((IsAllocatable || !ST.hasGFX90AInsts() || !MRI.reservedRegsFrozen()) &&
+      (TID.mayLoad() || TID.mayStore() ||
+      (TID.TSFlags & (SIInstrFlags::DS | SIInstrFlags::MIMG)))) {
+    switch (RCID) {
+    case AMDGPU::AV_32RegClassID: return AMDGPU::VGPR_32RegClassID;
+    case AMDGPU::AV_64RegClassID: return AMDGPU::VReg_64RegClassID;
+    case AMDGPU::AV_96RegClassID: return AMDGPU::VReg_96RegClassID;
+    case AMDGPU::AV_128RegClassID: return AMDGPU::VReg_128RegClassID;
+    case AMDGPU::AV_160RegClassID: return AMDGPU::VReg_160RegClassID;
+    default:
+      break;
+    }
+  }
+  return RCID;
+}
+
+const TargetRegisterClass *SIInstrInfo::getRegClass(const MCInstrDesc &TID,
+    unsigned OpNum, const TargetRegisterInfo *TRI,
+    const MachineFunction &MF)
+  const {
+  if (OpNum >= TID.getNumOperands())
+    return nullptr;
+  auto RegClass = TID.OpInfo[OpNum].RegClass;
+  bool IsAllocatable = false;
+  if (TID.TSFlags & (SIInstrFlags::DS | SIInstrFlags::FLAT)) {
+    // vdst and vdata should be both VGPR or AGPR, same for the DS instructions
+    // with two data operands. Request register class constainted to VGPR only
+    // of both operands present as Machine Copy Propagation can not check this
+    // constraint and possibly other passes too.
+    //
+    // The check is limited to FLAT and DS because atomics in non-flat encoding
+    // have their vdst and vdata tied to be the same register.
+    const int VDstIdx = AMDGPU::getNamedOperandIdx(TID.Opcode,
+                                                   AMDGPU::OpName::vdst);
+    const int DataIdx = AMDGPU::getNamedOperandIdx(TID.Opcode,
+        (TID.TSFlags & SIInstrFlags::DS) ? AMDGPU::OpName::data0
+                                         : AMDGPU::OpName::vdata);
+    if (DataIdx != -1) {
+      IsAllocatable = VDstIdx != -1 ||
+                      AMDGPU::getNamedOperandIdx(TID.Opcode,
+                                                 AMDGPU::OpName::data1) != -1;
+    }
+  }
+  RegClass = adjustAllocatableRegClass(ST, MF.getRegInfo(), TID, RegClass,
+                                       IsAllocatable);
+  return RI.getRegClass(RegClass);
+}
+
 const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI,
                                                       unsigned OpNo) const {
   const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
@@ -4306,6 +4513,7 @@ const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI,
   }
 
   unsigned RCID = Desc.OpInfo[OpNo].RegClass;
+  RCID = adjustAllocatableRegClass(ST, MRI, Desc, RCID, true);
   return RI.getRegClass(RCID);
 }
 
@@ -4482,7 +4690,40 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
 
   if (MO->isReg()) {
     assert(DefinedRC);
-    return isLegalRegOperand(MRI, OpInfo, *MO);
+    if (!isLegalRegOperand(MRI, OpInfo, *MO))
+      return false;
+    bool IsAGPR = RI.isAGPR(MRI, MO->getReg());
+    if (IsAGPR && !ST.hasMAIInsts())
+      return false;
+    unsigned Opc = MI.getOpcode();
+    if (IsAGPR &&
+        (!ST.hasGFX90AInsts() || !MRI.reservedRegsFrozen()) &&
+        (MI.mayLoad() || MI.mayStore() || isDS(Opc) || isMIMG(Opc)))
+      return false;
+    // Atomics should have both vdst and vdata either vgpr or agpr.
+    const int VDstIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
+    const int DataIdx = AMDGPU::getNamedOperandIdx(Opc,
+        isDS(Opc) ? AMDGPU::OpName::data0 : AMDGPU::OpName::vdata);
+    if ((int)OpIdx == VDstIdx && DataIdx != -1 &&
+        MI.getOperand(DataIdx).isReg() &&
+        RI.isAGPR(MRI, MI.getOperand(DataIdx).getReg()) != IsAGPR)
+      return false;
+    if ((int)OpIdx == DataIdx) {
+      if (VDstIdx != -1 &&
+          RI.isAGPR(MRI, MI.getOperand(VDstIdx).getReg()) != IsAGPR)
+        return false;
+      // DS instructions with 2 src operands also must have tied RC.
+      const int Data1Idx = AMDGPU::getNamedOperandIdx(Opc,
+                                                      AMDGPU::OpName::data1);
+      if (Data1Idx != -1 && MI.getOperand(Data1Idx).isReg() &&
+          RI.isAGPR(MRI, MI.getOperand(Data1Idx).getReg()) != IsAGPR)
+        return false;
+    }
+    if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64 &&
+        (int)OpIdx == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) &&
+        RI.isSGPRReg(MRI, MO->getReg()))
+      return false;
+    return true;
   }
 
   // Handle non-register types that are treated like immediates.
@@ -5341,6 +5582,10 @@ SIInstrInfo::legalizeOperands(MachineInstr &MI,
                 getNamedOperand(MI, AMDGPU::OpName::dlc)) {
           MIB.addImm(DLC->getImm());
         }
+        if (const MachineOperand *SCCB =
+                getNamedOperand(MI, AMDGPU::OpName::sccb)) {
+          MIB.addImm(SCCB->getImm());
+        }
 
         MIB.addImm(getNamedImmOperand(MI, AMDGPU::OpName::slc));
 
@@ -7085,7 +7330,8 @@ enum SIEncodingFamily {
   GFX80 = 4,
   GFX9 = 5,
   GFX10 = 6,
-  SDWA10 = 7
+  SDWA10 = 7,
+  GFX90A = 8
 };
 
 static SIEncodingFamily subtargetEncodingFamily(const GCNSubtarget &ST) {
@@ -7157,6 +7403,15 @@ int SIInstrInfo::pseudoToMCOpcode(int Opcode) const {
   if (MCOp == -1)
     return Opcode;
 
+  if (ST.hasGFX90AInsts()) {
+    uint16_t NMCOp = (uint16_t)-1;
+      NMCOp = AMDGPU::getMCOpcode(Opcode, SIEncodingFamily::GFX90A);
+    if (NMCOp == (uint16_t)-1)
+      NMCOp = AMDGPU::getMCOpcode(Opcode, SIEncodingFamily::GFX9);
+    if (NMCOp != (uint16_t)-1)
+      MCOp = NMCOp;
+  }
+
   // (uint16_t)-1 means that Opcode is a pseudo instruction that has
   // no encoding in the given subtarget generation.
   if (MCOp == (uint16_t)-1)

diff  --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 9bcda2ec4af3..49cfae590e4b 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -171,6 +171,10 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
     return RI;
   }
 
+  const GCNSubtarget &getSubtarget() const {
+    return ST;
+  }
+
   bool isReallyTriviallyReMaterializable(const MachineInstr &MI,
                                          AAResults *AA) const override;
 
@@ -1085,11 +1089,7 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
   const TargetRegisterClass *getRegClass(const MCInstrDesc &TID, unsigned OpNum,
                                          const TargetRegisterInfo *TRI,
                                          const MachineFunction &MF)
-    const override {
-    if (OpNum >= TID.getNumOperands())
-      return nullptr;
-    return RI.getRegClass(TID.OpInfo[OpNum].RegClass);
-  }
+    const override;
 
   void fixImplicitOperands(MachineInstr &MI) const;
 

diff  --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 8b557cccf7d7..b616228095f7 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -28,6 +28,7 @@ def SIEncodingFamily {
   int GFX9 = 5;
   int GFX10 = 6;
   int SDWA10 = 7;
+  int GFX90A = 8;
 }
 
 //===----------------------------------------------------------------------===//
@@ -186,6 +187,8 @@ def SIbuffer_atomic_inc : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_INC">;
 def SIbuffer_atomic_dec : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_DEC">;
 def SIbuffer_atomic_csub : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_CSUB">;
 def SIbuffer_atomic_fadd : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_FADD">;
+def SIbuffer_atomic_fmin : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_FMIN">;
+def SIbuffer_atomic_fmax : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_FMAX">;
 
 def SIbuffer_atomic_cmpswap : SDNode <"AMDGPUISD::BUFFER_ATOMIC_CMPSWAP",
   SDTypeProfile<1, 9,
@@ -265,21 +268,25 @@ class isFloatType<ValueType SrcVT> {
                 !eq(SrcVT.Value, v2f16.Value),
                 !eq(SrcVT.Value, v4f16.Value),
                 !eq(SrcVT.Value, v2f32.Value),
-                !eq(SrcVT.Value, v2f64.Value));
+                !eq(SrcVT.Value, v2f64.Value),
+                !eq(SrcVT.Value, v4f64.Value));
 }
 
 class isIntType<ValueType SrcVT> {
   bit ret = !or(!eq(SrcVT.Value, i16.Value),
                 !eq(SrcVT.Value, i32.Value),
-                !eq(SrcVT.Value, i64.Value));
+                !eq(SrcVT.Value, i64.Value),
+                !eq(SrcVT.Value, v2i32.Value));
 }
 
 class isPackedType<ValueType SrcVT> {
   bit ret = !or(!eq(SrcVT.Value, v2i16.Value),
                 !eq(SrcVT.Value, v2f16.Value),
-                !eq(SrcVT.Value, v4f16.Value));
+                !eq(SrcVT.Value, v4f16.Value),
+                !eq(SrcVT.Value, v2f32.Value));
 }
 
+
 //===----------------------------------------------------------------------===//
 // PatFrags for global memory operations
 //===----------------------------------------------------------------------===//
@@ -822,6 +829,10 @@ def extract_swz : SDNodeXForm<timm, [{
   return CurDAG->getTargetConstant((N->getZExtValue() >> 3) & 1, SDLoc(N), MVT::i8);
 }]>;
 
+def extract_sccb : SDNodeXForm<timm, [{
+  return CurDAG->getTargetConstant((N->getZExtValue() >> 4) & 1, SDLoc(N), MVT::i8);
+}]>;
+
 //===----------------------------------------------------------------------===//
 // Custom Operands
 //===----------------------------------------------------------------------===//
@@ -1097,6 +1108,9 @@ def clampmod : NamedOperandBit<"ClampSI", NamedMatchClass<"ClampSI">>;
 def clampmod0 : NamedOperandBit_0<"ClampSI", NamedMatchClass<"ClampSI">>;
 def highmod : NamedOperandBit<"High", NamedMatchClass<"High">>;
 
+def SCCB : NamedOperandBit<"SCCB", NamedMatchClass<"SCCB">>;
+def SCCB_0 : NamedOperandBit_0<"SCCB", NamedMatchClass<"SCCB">>;
+
 def DLC : NamedOperandBit<"DLC", NamedMatchClass<"DLC">>;
 def DLC_0 : NamedOperandBit_0<"DLC", NamedMatchClass<"DLC">>;
 
@@ -1243,7 +1257,7 @@ def FP32SDWAInputMods : FPSDWAInputMods<FP32SDWAInputModsMatchClass>;
 def FPVRegInputModsMatchClass : AsmOperandClass {
   let Name = "VRegWithFPInputMods";
   let ParserMethod = "parseRegWithFPInputMods";
-  let PredicateMethod = "isVReg32";
+  let PredicateMethod = "isVRegWithInputMods";
 }
 
 def FPVRegInputMods : InputMods <FPVRegInputModsMatchClass> {
@@ -1270,7 +1284,7 @@ def Int32SDWAInputMods : IntSDWAInputMods<Int32SDWAInputModsMatchClass>;
 def IntVRegInputModsMatchClass : AsmOperandClass {
   let Name = "VRegWithIntInputMods";
   let ParserMethod = "parseRegWithIntInputMods";
-  let PredicateMethod = "isVReg32";
+  let PredicateMethod = "isVRegWithInputMods";
 }
 
 def IntVRegInputMods : InputMods <IntVRegInputModsMatchClass> {
@@ -1507,8 +1521,12 @@ class getVOP3SrcForVT<ValueType VT> {
      VSrc_128,
      !if(!eq(VT.Size, 64),
         !if(isFP,
-           VSrc_f64,
-           VSrc_b64),
+           !if(!eq(VT.Value, v2f32.Value),
+               VSrc_v2f32,
+               VSrc_f64),
+           !if(!eq(VT.Value, v2i32.Value),
+               VSrc_v2b32,
+           VSrc_b64)),
         !if(!eq(VT.Value, i1.Value),
            SSrc_i1,
            !if(isFP,
@@ -1541,7 +1559,9 @@ class isModifierType<ValueType SrcVT> {
                 !eq(SrcVT.Value, f32.Value),
                 !eq(SrcVT.Value, f64.Value),
                 !eq(SrcVT.Value, v2f16.Value),
-                !eq(SrcVT.Value, v2i16.Value));
+                !eq(SrcVT.Value, v2i16.Value),
+                !eq(SrcVT.Value, v2f32.Value),
+                !eq(SrcVT.Value, v2i32.Value));
 }
 
 // Return type of input modifiers operand for specified input operand
@@ -1972,14 +1992,29 @@ class getAsmSDWA9 <bit HasDst, bit HasOMod, int NumSrcArgs,
   string ret = dst#args#sdwa;
 }
 
+class getHas64BitOps <int NumSrcArgs, ValueType DstVT, ValueType Src0VT,
+                      ValueType Src1VT> {
+  bit ret = !if(!eq(NumSrcArgs, 3),
+                0,
+                !if(!eq(DstVT.Size, 64),
+                    1,
+                    !if(!eq(Src0VT.Size, 64),
+                        1,
+                        !if(!eq(Src1VT.Size, 64),
+                            1,
+                            0
+                        )
+                    )
+                )
+            );
+}
 
-// Function that checks if instruction supports DPP and SDWA
-class getHasExt <int NumSrcArgs, ValueType DstVT = i32, ValueType Src0VT = i32,
-                 ValueType Src1VT = i32> {
+class getHasSDWA <int NumSrcArgs, ValueType DstVT = i32, ValueType Src0VT = i32,
+                  ValueType Src1VT = i32> {
   bit ret = !if(!eq(NumSrcArgs, 3),
-                0, // NumSrcArgs == 3 - No DPP or SDWA for VOP3
+                0, // NumSrcArgs == 3 - No SDWA for VOP3
                 !if(!eq(DstVT.Size, 64),
-                    0, // 64-bit dst - No DPP or SDWA for 64-bit operands
+                    0, // 64-bit dst - No SDWA for 64-bit operands
                     !if(!eq(Src0VT.Size, 64),
                         0, // 64-bit src0
                         !if(!eq(Src1VT.Size, 64),
@@ -1993,8 +2028,42 @@ class getHasExt <int NumSrcArgs, ValueType DstVT = i32, ValueType Src0VT = i32,
 
 class getHasDPP <int NumSrcArgs, ValueType DstVT = i32, ValueType Src0VT = i32,
                  ValueType Src1VT = i32> {
-  bit ret = !if(!eq(NumSrcArgs, 0), 0,
-                getHasExt<NumSrcArgs, DstVT, Src0VT, Src1VT>.ret);
+  bit ret = !if(!eq(NumSrcArgs, 3),
+                0, // NumSrcArgs == 3 - No DPP for VOP3
+                1);
+}
+
+class getHasExt64BitDPP <int NumSrcArgs, ValueType DstVT = i32, ValueType Src0VT = i32,
+                 ValueType Src1VT = i32> {
+  bit ret = !and(getHasDPP<NumSrcArgs, DstVT, Src0VT, Src1VT>.ret,
+                 getHas64BitOps<NumSrcArgs, DstVT, Src0VT, Src1VT>.ret);
+}
+
+// Function that checks if instruction supports DPP and SDWA
+class getHasExt <int NumSrcArgs, ValueType DstVT = i32, ValueType Src0VT = i32,
+                 ValueType Src1VT = i32> {
+  bit ret = !or(getHasDPP<NumSrcArgs, DstVT, Src0VT, Src1VT>.ret,
+                getHasSDWA<NumSrcArgs, DstVT, Src0VT, Src1VT>.ret);
+}
+
+// Return an AGPR+VGPR operand class for the given VGPR register class.
+class getLdStRegisterOperand<RegisterClass RC> {
+  RegisterOperand ret =
+    !if(!eq(RC.Size, 32), AVLdSt_32,
+      !if(!eq(RC.Size, 64), AVLdSt_64,
+        !if(!eq(RC.Size, 96), AVLdSt_96,
+          !if(!eq(RC.Size, 128), AVLdSt_128,
+            !if(!eq(RC.Size, 160), AVLdSt_160,
+              RegisterOperand<VReg_1> // invalid register
+    )))));
+}
+
+class BitOr<bit a, bit b> {
+  bit ret = !if(a, 1, !if(b, 1, 0));
+}
+
+class BitAnd<bit a, bit b> {
+  bit ret = !if(a, !if(b, 1, 0), 0);
 }
 
 def PatGenMode {
@@ -2077,8 +2146,9 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableF32SrcMods = 0,
 
   field bit HasExt = getHasExt<NumSrcArgs, DstVT, Src0VT, Src1VT>.ret;
   field bit HasExtDPP = getHasDPP<NumSrcArgs, DstVT, Src0VT, Src1VT>.ret;
-  field bit HasExtSDWA = HasExt;
-  field bit HasExtSDWA9 = HasExt;
+  field bit HasExt64BitDPP = getHasExt64BitDPP<NumSrcArgs, DstVT, Src0VT, Src1VT>.ret;
+  field bit HasExtSDWA = getHasSDWA<NumSrcArgs, DstVT, Src0VT, Src1VT>.ret;
+  field bit HasExtSDWA9 = HasExtSDWA;
   field int NeedPatGen = PatGenMode.NoPattern;
 
   field bit IsMAI = 0;
@@ -2144,6 +2214,7 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableF32SrcMods = 0,
 class VOP_NO_EXT <VOPProfile p> : VOPProfile <p.ArgVT> {
   let HasExt = 0;
   let HasExtDPP = 0;
+  let HasExt64BitDPP = 0;
   let HasExtSDWA = 0;
   let HasExtSDWA9 = 0;
 }
@@ -2191,6 +2262,7 @@ def VOP_I32_F64 : VOPProfile <[i32, f64, untyped, untyped]>;
 def VOP_I32_I32 : VOPProfile <[i32, i32, untyped, untyped]>;
 def VOP_F16_F32 : VOPProfile <[f16, f32, untyped, untyped]>;
 def VOP_F32_F16 : VOPProfile <[f32, f16, untyped, untyped]>;
+def VOP_I64_I64 : VOPProfile <[i64, i64, untyped, untyped]>;
 
 def VOP_F32_F32_F16 : VOPProfile <[f32, f32, f16, untyped]>;
 def VOP_F32_F32_F32 : VOPProfile <[f32, f32, f32, untyped]>;
@@ -2234,6 +2306,16 @@ def VOP_V4I32_I32_I32_V4I32       : VOPProfile <[v4i32,  i32,   i32,   v4i32]>;
 def VOP_V16I32_I32_I32_V16I32     : VOPProfile <[v16i32, i32,   i32,   v16i32]>;
 def VOP_V32I32_I32_I32_V32I32     : VOPProfile <[v32i32, i32,   i32,   v32i32]>;
 
+def VOP_V4F64_F64_F64_V4F64 : VOPProfile <[v4f64, f64, f64, v4f64]>;
+def VOP_V1F64_F64_F64_V1F64 : VOPProfile <[v1f64, f64, f64, v1f64]>;
+
+def VOP_V2F32_V2F32_V2F32_V2F32   : VOPProfile <[v2f32,  v2f32, v2f32, v2f32]>;
+def VOP_V2F32_V2F32_V2F32         : VOPProfile <[v2f32,  v2f32, v2f32, untyped]>;
+def VOP_V2I32_V2I32_V2I32         : VOPProfile <[v2i32,  v2i32, v2i32, untyped]>;
+def VOP_V4F32_V4I16_V4I16_V4F32   : VOPProfile <[v4f32,  v4i16, v4i16, v4f32]>;
+def VOP_V16F32_V4I16_V4I16_V16F32 : VOPProfile <[v16f32, v4i16, v4i16, v16f32]>;
+def VOP_V32F32_V4I16_V4I16_V32F32 : VOPProfile <[v32f32, v4i16, v4i16, v32f32]>;
+
 class Commutable_REV <string revOp, bit isOrig> {
   string RevOp = revOp;
   bit IsOrig = isOrig;
@@ -2372,7 +2454,8 @@ def getMCOpcodeGen : InstrMapping {
                    [!cast<string>(SIEncodingFamily.GFX80)],
                    [!cast<string>(SIEncodingFamily.GFX9)],
                    [!cast<string>(SIEncodingFamily.GFX10)],
-                   [!cast<string>(SIEncodingFamily.SDWA10)]];
+                   [!cast<string>(SIEncodingFamily.SDWA10)],
+                   [!cast<string>(SIEncodingFamily.GFX90A)]];
 }
 
 // Get equivalent SOPK instruction.

diff  --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index bdc6b94ca6d6..2482649f58c6 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -41,18 +41,21 @@ multiclass V_INTERP_P1_F32_m : VINTRP_m <
                    (i32 timm:$attrchan), (i32 timm:$attr), M0))]
 >;
 
-let OtherPredicates = [has32BankLDS] in {
+let OtherPredicates = [has32BankLDS, isNotGFX90APlus] in {
 
 defm V_INTERP_P1_F32 : V_INTERP_P1_F32_m;
 
-} // End OtherPredicates = [has32BankLDS]
+} // End OtherPredicates = [has32BankLDS, isNotGFX90APlus]
 
-let OtherPredicates = [has16BankLDS], Constraints = "@earlyclobber $vdst", isAsmParserOnly=1 in {
+let OtherPredicates = [has16BankLDS, isNotGFX90APlus],
+    Constraints = "@earlyclobber $vdst", isAsmParserOnly=1 in {
 
 defm V_INTERP_P1_F32_16bank : V_INTERP_P1_F32_m;
 
-} // End OtherPredicates = [has32BankLDS], Constraints = "@earlyclobber $vdst", isAsmParserOnly=1
+} // End OtherPredicates = [has32BankLDS, isNotGFX90APlus],
+  //     Constraints = "@earlyclobber $vdst", isAsmParserOnly=1
 
+let OtherPredicates = [isNotGFX90APlus] in {
 let DisableEncoding = "$src0", Constraints = "$src0 = $vdst" in {
 
 defm V_INTERP_P2_F32 : VINTRP_m <
@@ -73,6 +76,8 @@ defm V_INTERP_MOV_F32 : VINTRP_m <
   [(set f32:$vdst, (int_amdgcn_interp_mov (i32 timm:$vsrc),
                    (i32 timm:$attrchan), (i32 timm:$attr), M0))]>;
 
+} // End OtherPredicates = [isNotGFX90APlus]
+
 } // End Uses = [MODE, M0, EXEC]
 
 //===----------------------------------------------------------------------===//
@@ -86,11 +91,6 @@ def ATOMIC_FENCE : SPseudoInstSI<
   let maybeAtomic = 1;
 }
 
-def VOP_I64_I64_DPP : VOPProfile <[i64, i64, untyped, untyped]> {
-  let HasExt = 1;
-  let HasExtDPP = 1;
-}
-
 let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] in {
 
 // For use in patterns
@@ -107,7 +107,7 @@ def V_MOV_B64_PSEUDO : VPseudoInstSI <(outs VReg_64:$vdst),
                                       (ins VSrc_b64:$src0)>;
 
 // 64-bit vector move with dpp. Expanded post-RA.
-def V_MOV_B64_DPP_PSEUDO : VOP_DPP_Pseudo <"v_mov_b64_dpp", VOP_I64_I64_DPP> {
+def V_MOV_B64_DPP_PSEUDO : VOP_DPP_Pseudo <"v_mov_b64_dpp", VOP_I64_I64> {
   let Size = 16; // Requires two 8-byte v_mov_b32_dpp to complete.
 }
 
@@ -1373,6 +1373,19 @@ def : GCNPat <
 //     sub1)
 // >;
 
+// COPY_TO_REGCLASS is needed to avoid using SCC from S_XOR_B32 instead
+// of the real value.
+def : GCNPat <
+  (fneg (v2f32 SReg_64:$src)),
+  (v2f32 (REG_SEQUENCE SReg_64,
+         (f32 (COPY_TO_REGCLASS (S_XOR_B32 (i32 (EXTRACT_SUBREG $src, sub0)),
+                                           (i32 (S_MOV_B32 (i32 0x80000000)))),
+                                 SReg_32)), sub0,
+         (f32 (COPY_TO_REGCLASS (S_XOR_B32 (i32 (EXTRACT_SUBREG $src, sub1)),
+                                           (i32 (S_MOV_B32 (i32 0x80000000)))),
+                                 SReg_32)), sub1))
+>;
+
 } // End let AddedComplexity = 1
 
 def : GCNPat <
@@ -1437,6 +1450,15 @@ def : GCNPat <
     sub1)
 >;
 
+def : GCNPat <
+  (getDivergentFrag<fneg>.ret (v2f32 VReg_64:$src)),
+  (V_PK_ADD_F32 11 /* OP_SEL_1 | NEG_LO | HEG_HI */, VReg_64:$src,
+                11 /* OP_SEL_1 | NEG_LO | HEG_HI */, 0,
+                0, 0, 0, 0, 0)
+> {
+  let SubtargetPredicate = HasPackedFP32Ops;
+}
+
 def : GCNPat <
   (fcopysign f16:$src0, f16:$src1),
   (V_BFI_B32_e64 (S_MOV_B32 (i32 0x00007fff)), $src0, $src1)
@@ -1556,9 +1578,16 @@ def : GCNPat <
 /********** Intrinsic Patterns **********/
 /********** ================== **********/
 
+let OtherPredicates = [isNotGFX90APlus] in
 // FIXME: Should use _e64 and select source modifiers.
 def : POW_Common <V_LOG_F32_e32, V_EXP_F32_e32, V_MUL_LEGACY_F32_e32>;
 
+let OtherPredicates = [isGFX90APlus] in
+def : GCNPat <
+  (fpow f32:$src0, f32:$src1),
+  (V_EXP_F32_e32 (V_MUL_LEGACY_F32_e64 0, f32:$src1, SRCMODS.NONE, (V_LOG_F32_e32 f32:$src0), 0, 0))
+>;
+
 def : GCNPat <
   (i32 (sext i1:$src0)),
   (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
@@ -2167,6 +2196,17 @@ def : GCNPat <
                   SRCMODS.NONE, $src2)
 >;
 
+let SubtargetPredicate = isGFX90APlus in
+def : GCNPat <
+  (fma (f64 (VOP3Mods0 f64:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod)),
+       (f64 (VOP3Mods f64:$src1, i32:$src1_modifiers)),
+       (f64 (VOP3NoMods f64:$src2))),
+  (V_FMAC_F64_e64 $src0_modifiers, $src0, $src1_modifiers, $src1,
+                  SRCMODS.NONE, $src2, $clamp, $omod)
+>;
+
+// COPY is workaround tablegen bug from multiple outputs
+// from S_LSHL_B32's multiple outputs from implicit scc def.
 def : GCNPat <
   (v2i16 (build_vector (i16 0), (i16 SReg_32:$src1))),
   (S_LSHL_B32 SReg_32:$src1, (i16 16))
@@ -2652,6 +2692,8 @@ def G_AMDGPU_BUFFER_ATOMIC_XOR : BufferAtomicGenericInstruction;
 def G_AMDGPU_BUFFER_ATOMIC_INC : BufferAtomicGenericInstruction;
 def G_AMDGPU_BUFFER_ATOMIC_DEC : BufferAtomicGenericInstruction;
 def G_AMDGPU_BUFFER_ATOMIC_FADD : BufferAtomicGenericInstruction;
+def G_AMDGPU_BUFFER_ATOMIC_FMIN : BufferAtomicGenericInstruction;
+def G_AMDGPU_BUFFER_ATOMIC_FMAX : BufferAtomicGenericInstruction;
 
 def G_AMDGPU_BUFFER_ATOMIC_CMPSWAP : AMDGPUGenericInstruction {
   let OutOperandList = (outs type0:$dst);

diff  --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
index b959bb3eb9c6..4545994726e5 100644
--- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -107,6 +107,7 @@ class SILoadStoreOptimizer : public MachineFunctionPass {
     bool GLC;
     bool SLC;
     bool DLC;
+    bool SCCB; // vmem only.
     bool UseST64;
     int AddrIdx[MaxAddressRegs];
     const MachineOperand *AddrReg[MaxAddressRegs];
@@ -199,6 +200,7 @@ class SILoadStoreOptimizer : public MachineFunctionPass {
                                                      const CombineInfo &Paired);
   const TargetRegisterClass *getTargetRegisterClass(const CombineInfo &CI,
                                                     const CombineInfo &Paired);
+  const TargetRegisterClass *getDataRegClass(const MachineInstr &MI) const;
 
   bool checkAndPrepareMerge(CombineInfo &CI, CombineInfo &Paired,
                             SmallVectorImpl<MachineInstr *> &InstsToMove);
@@ -304,6 +306,16 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
     return 2;
   case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
     return 4;
+  case AMDGPU::DS_READ_B32:      LLVM_FALLTHROUGH;
+  case AMDGPU::DS_READ_B32_gfx9: LLVM_FALLTHROUGH;
+  case AMDGPU::DS_WRITE_B32:     LLVM_FALLTHROUGH;
+  case AMDGPU::DS_WRITE_B32_gfx9:
+    return 1;
+  case AMDGPU::DS_READ_B64:      LLVM_FALLTHROUGH;
+  case AMDGPU::DS_READ_B64_gfx9: LLVM_FALLTHROUGH;
+  case AMDGPU::DS_WRITE_B64:     LLVM_FALLTHROUGH;
+  case AMDGPU::DS_WRITE_B64_gfx9:
+    return 2;
   default:
     return 0;
   }
@@ -526,6 +538,9 @@ void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI,
       SLC = TII.getNamedOperand(*I, AMDGPU::OpName::slc)->getImm();
     }
     DLC = TII.getNamedOperand(*I, AMDGPU::OpName::dlc)->getImm();
+    if (InstClass != S_BUFFER_LOAD_IMM) {
+      SCCB = TII.getNamedOperand(*I, AMDGPU::OpName::sccb)->getImm();
+    }
   }
 
   AddressRegs Regs = getRegs(Opc, TII);
@@ -784,7 +799,8 @@ bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,
     return (EltOffset0 + CI.Width == EltOffset1 ||
             EltOffset1 + Paired.Width == EltOffset0) &&
            CI.GLC == Paired.GLC && CI.DLC == Paired.DLC &&
-           (CI.InstClass == S_BUFFER_LOAD_IMM || CI.SLC == Paired.SLC);
+           (CI.InstClass == S_BUFFER_LOAD_IMM ||
+               (CI.SLC == Paired.SLC && CI.SCCB == Paired.SCCB));
   }
 
   // If the offset in elements doesn't fit in 8-bits, we might be able to use
@@ -864,6 +880,26 @@ bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM,
   }
 }
 
+const TargetRegisterClass *
+SILoadStoreOptimizer::getDataRegClass(const MachineInstr &MI) const {
+  if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
+    return TRI->getRegClassForReg(*MRI, Dst->getReg());
+  }
+  if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::vdata)) {
+    return TRI->getRegClassForReg(*MRI, Src->getReg());
+  }
+  if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0)) {
+    return TRI->getRegClassForReg(*MRI, Src->getReg());
+  }
+  if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst)) {
+    return TRI->getRegClassForReg(*MRI, Dst->getReg());
+  }
+  if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::sdata)) {
+    return TRI->getRegClassForReg(*MRI, Src->getReg());
+  }
+  return nullptr;
+}
+
 /// This function assumes that CI comes before Paired in a basic block.
 bool SILoadStoreOptimizer::checkAndPrepareMerge(
     CombineInfo &CI, CombineInfo &Paired,
@@ -896,6 +932,9 @@ bool SILoadStoreOptimizer::checkAndPrepareMerge(
   DenseSet<Register> PhysRegUsesToMove;
   addDefsUsesToList(*CI.I, RegDefsToMove, PhysRegUsesToMove);
 
+  const TargetRegisterClass *DataRC = getDataRegClass(*CI.I);
+  bool IsAGPR = TRI->hasAGPRs(DataRC);
+
   MachineBasicBlock::iterator E = std::next(Paired.I);
   MachineBasicBlock::iterator MBBI = std::next(CI.I);
   MachineBasicBlock::iterator MBBE = CI.I->getParent()->end();
@@ -964,6 +1003,17 @@ bool SILoadStoreOptimizer::checkAndPrepareMerge(
       continue;
 
     if (&*MBBI == &*Paired.I) {
+      if (TRI->hasAGPRs(getDataRegClass(*MBBI)) != IsAGPR)
+        return false;
+      // FIXME: nothing is illegal in a ds_write2 opcode with two AGPR data
+      //        operands. However we are reporting that ds_write2 shall have
+      //        only VGPR data so that machine copy propagation does not
+      //        create an illegal instruction with a VGPR and AGPR sources.
+      //        Consequenctially if we create such instruction the verifier
+      //        will complain.
+      if (IsAGPR && CI.InstClass == DS_WRITE)
+        return false;
+
       // We need to go through the list of instructions that we plan to
       // move and make sure they are all safe to move down past the merged
       // instruction.
@@ -1037,8 +1087,7 @@ SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
 
   const MCInstrDesc &Read2Desc = TII->get(Opc);
 
-  const TargetRegisterClass *SuperRC =
-      (CI.EltSize == 4) ? &AMDGPU::VReg_64RegClass : &AMDGPU::VReg_128RegClass;
+  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
   Register DestReg = MRI->createVirtualRegister(SuperRC);
 
   DebugLoc DL = CI.I->getDebugLoc();
@@ -1317,6 +1366,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair(
         .addImm(0)            // tfe
         .addImm(CI.DLC)      // dlc
         .addImm(0)            // swz
+        .addImm(CI.SCCB)     // scc
         .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
 
   std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
@@ -1384,6 +1434,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair(
           .addImm(0)            // tfe
           .addImm(CI.DLC)      // dlc
           .addImm(0)            // swz
+          .addImm(CI.SCCB)     // scc
           .addMemOperand(
               combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
 
@@ -1464,6 +1515,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair(
           .addImm(0)                                // tfe
           .addImm(CI.DLC)                          // dlc
           .addImm(0)                                // swz
+          .addImm(CI.SCCB)                         // scc
           .addMemOperand(
               combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
 
@@ -1559,18 +1611,27 @@ SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI,
     case 16:
       return &AMDGPU::SGPR_512RegClass;
     }
-  } else {
-    switch (CI.Width + Paired.Width) {
-    default:
-      return nullptr;
-    case 2:
-      return &AMDGPU::VReg_64RegClass;
-    case 3:
-      return &AMDGPU::VReg_96RegClass;
-    case 4:
-      return &AMDGPU::VReg_128RegClass;
-    }
   }
+  const TargetRegisterClass *RC = nullptr;
+
+  switch (CI.Width + Paired.Width) {
+  default:
+    return nullptr;
+  case 2:
+    RC = &AMDGPU::VReg_64RegClass;
+    break;
+  case 3:
+    RC = &AMDGPU::VReg_96RegClass;
+    break;
+  case 4:
+    RC = &AMDGPU::VReg_128RegClass;
+    break;
+  }
+
+  if (TRI->hasAGPRs(getDataRegClass(*CI.I)))
+    return TRI->getEquivalentAGPRClass(RC);
+
+  return RC;
 }
 
 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair(
@@ -1624,6 +1685,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair(
         .addImm(0)            // tfe
         .addImm(CI.DLC)      // dlc
         .addImm(0)            // swz
+        .addImm(CI.SCCB)     // scc
         .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
 
   moveInstsAfter(MIB, InstsToMove);

diff  --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
index e7158a72fc0e..d4dd5a381a71 100644
--- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -300,6 +300,20 @@ class SICacheControl {
                                      SIAtomicScope Scope,
                                      SIAtomicAddrSpace AddrSpace) const = 0;
 
+  /// Update \p MI memory store instruction to bypass any caches up to
+  /// the \p Scope memory scope for address spaces \p
+  /// AddrSpace. Return true iff the instruction was modified.
+  virtual bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
+                                      SIAtomicScope Scope,
+                                      SIAtomicAddrSpace AddrSpace) const = 0;
+
+  /// Update \p MI memory read-modify-write instruction to bypass any caches up
+  /// to the \p Scope memory scope for address spaces \p AddrSpace. Return true
+  /// iff the instruction was modified.
+  virtual bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
+                                    SIAtomicScope Scope,
+                                    SIAtomicAddrSpace AddrSpace) const = 0;
+
   /// Update \p MI memory instruction of kind \p Op associated with address
   /// spaces \p AddrSpace to indicate it is volatile and/or nontemporal. Return
   /// true iff the instruction was modified.
@@ -372,6 +386,14 @@ class SIGfx6CacheControl : public SICacheControl {
                              SIAtomicScope Scope,
                              SIAtomicAddrSpace AddrSpace) const override;
 
+  bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
+                              SIAtomicScope Scope,
+                              SIAtomicAddrSpace AddrSpace) const override;
+
+  bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
+                            SIAtomicScope Scope,
+                            SIAtomicAddrSpace AddrSpace) const override;
+
   bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                       SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                       bool IsVolatile,
@@ -408,6 +430,55 @@ class SIGfx7CacheControl : public SIGfx6CacheControl {
 
 };
 
+class SIGfx90ACacheControl : public SIGfx7CacheControl {
+protected:
+
+  /// Sets SCC bit to "true" if present in \p MI. Returns true if \p MI
+  /// is modified, false otherwise.
+  bool enableSCCBit(const MachineBasicBlock::iterator &MI) const {
+    return enableNamedBit<AMDGPU::OpName::sccb>(MI);
+  }
+
+public:
+
+  SIGfx90ACacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {};
+
+  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
+                             SIAtomicScope Scope,
+                             SIAtomicAddrSpace AddrSpace) const override;
+
+  bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
+                              SIAtomicScope Scope,
+                              SIAtomicAddrSpace AddrSpace) const override;
+
+  bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
+                            SIAtomicScope Scope,
+                            SIAtomicAddrSpace AddrSpace) const override;
+
+  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
+                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
+                                      bool IsVolatile,
+                                      bool IsNonTemporal) const override;
+
+  bool insertWait(MachineBasicBlock::iterator &MI,
+                  SIAtomicScope Scope,
+                  SIAtomicAddrSpace AddrSpace,
+                  SIMemOp Op,
+                  bool IsCrossAddrSpaceOrdering,
+                  Position Pos) const override;
+
+  bool insertAcquire(MachineBasicBlock::iterator &MI,
+                     SIAtomicScope Scope,
+                     SIAtomicAddrSpace AddrSpace,
+                     Position Pos) const override;
+
+  bool insertRelease(MachineBasicBlock::iterator &MI,
+                     SIAtomicScope Scope,
+                     SIAtomicAddrSpace AddrSpace,
+                     bool IsCrossAddrSpaceOrdering,
+                     Position Pos) const override;
+};
+
 class SIGfx10CacheControl : public SIGfx7CacheControl {
 protected:
 
@@ -717,6 +788,8 @@ SICacheControl::SICacheControl(const GCNSubtarget &ST) : ST(ST) {
 /* static */
 std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) {
   GCNSubtarget::Generation Generation = ST.getGeneration();
+  if (ST.hasGFX90AInsts())
+    return std::make_unique<SIGfx90ACacheControl>(ST);
   if (Generation <= AMDGPUSubtarget::SOUTHERN_ISLANDS)
     return std::make_unique<SIGfx6CacheControl>(ST);
   if (Generation < AMDGPUSubtarget::GFX10)
@@ -757,6 +830,32 @@ bool SIGfx6CacheControl::enableLoadCacheBypass(
   return Changed;
 }
 
+bool SIGfx6CacheControl::enableStoreCacheBypass(
+    const MachineBasicBlock::iterator &MI,
+    SIAtomicScope Scope,
+    SIAtomicAddrSpace AddrSpace) const {
+  assert(!MI->mayLoad() && MI->mayStore());
+  bool Changed = false;
+
+  /// The L1 cache is write through so does not need to be bypassed. There is no
+  /// bypass control for the L2 cache at the isa level.
+
+  return Changed;
+}
+
+bool SIGfx6CacheControl::enableRMWCacheBypass(
+    const MachineBasicBlock::iterator &MI,
+    SIAtomicScope Scope,
+    SIAtomicAddrSpace AddrSpace) const {
+  assert(MI->mayLoad() && MI->mayStore());
+  bool Changed = false;
+
+  /// The L1 cache is write through so does not need to be bypassed. There is no
+  /// bypass control for the L2 cache at the isa level.
+
+  return Changed;
+}
+
 bool SIGfx6CacheControl::enableVolatileAndOrNonTemporal(
     MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
     bool IsVolatile, bool IsNonTemporal) const {
@@ -1000,6 +1099,300 @@ bool SIGfx7CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
   return Changed;
 }
 
+bool SIGfx90ACacheControl::enableLoadCacheBypass(
+    const MachineBasicBlock::iterator &MI,
+    SIAtomicScope Scope,
+    SIAtomicAddrSpace AddrSpace) const {
+  assert(MI->mayLoad() && !MI->mayStore());
+  bool Changed = false;
+
+  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
+    switch (Scope) {
+    case SIAtomicScope::SYSTEM:
+      Changed |= enableSCCBit(MI);
+      Changed |= enableGLCBit(MI);
+      break;
+    case SIAtomicScope::AGENT:
+      Changed |= enableGLCBit(MI);
+      break;
+    case SIAtomicScope::WORKGROUP:
+      // In threadgroup split mode the waves of a work-group can be executing on
+      // 
diff erent CUs. Therefore need to bypass the L1 which is per CU.
+      // Otherwise in non-threadgroup split mode all waves of a work-group are
+      // on the same CU, and so the L1 does not need to be bypassed.
+      if (ST.isTgSplitEnabled()) Changed |= enableGLCBit(MI);
+      break;
+    case SIAtomicScope::WAVEFRONT:
+    case SIAtomicScope::SINGLETHREAD:
+      // No cache to bypass.
+      break;
+    default:
+      llvm_unreachable("Unsupported synchronization scope");
+    }
+  }
+
+  /// The scratch address space does not need the global memory caches
+  /// to be bypassed as all memory operations by the same thread are
+  /// sequentially consistent, and no other thread can access scratch
+  /// memory.
+
+  /// Other address spaces do not have a cache.
+
+  return Changed;
+}
+
+bool SIGfx90ACacheControl::enableStoreCacheBypass(
+    const MachineBasicBlock::iterator &MI,
+    SIAtomicScope Scope,
+    SIAtomicAddrSpace AddrSpace) const {
+  assert(!MI->mayLoad() && MI->mayStore());
+  bool Changed = false;
+
+  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
+    switch (Scope) {
+    case SIAtomicScope::SYSTEM:
+      Changed |= enableSCCBit(MI);
+      LLVM_FALLTHROUGH;
+    case SIAtomicScope::AGENT:
+      /// Do not set glc for store atomic operations as they implicitly write
+      /// through the L1 cache.
+      break;
+    case SIAtomicScope::WORKGROUP:
+    case SIAtomicScope::WAVEFRONT:
+    case SIAtomicScope::SINGLETHREAD:
+      // No cache to bypass. Store atomics implicitly write through the L1
+      // cache.
+      break;
+    default:
+      llvm_unreachable("Unsupported synchronization scope");
+    }
+  }
+
+  /// The scratch address space does not need the global memory caches
+  /// to be bypassed as all memory operations by the same thread are
+  /// sequentially consistent, and no other thread can access scratch
+  /// memory.
+
+  /// Other address spaces do not have a cache.
+
+  return Changed;
+}
+
+bool SIGfx90ACacheControl::enableRMWCacheBypass(
+    const MachineBasicBlock::iterator &MI,
+    SIAtomicScope Scope,
+    SIAtomicAddrSpace AddrSpace) const {
+  assert(MI->mayLoad() && MI->mayStore());
+  bool Changed = false;
+
+  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
+    switch (Scope) {
+    case SIAtomicScope::SYSTEM:
+      Changed |= enableSCCBit(MI);
+      LLVM_FALLTHROUGH;
+    case SIAtomicScope::AGENT:
+      /// Do not set glc for RMW atomic operations as they implicitly bypass
+      /// the L1 cache, and the glc bit is instead used to indicate if they are
+      /// return or no-return.
+      break;
+    case SIAtomicScope::WORKGROUP:
+    case SIAtomicScope::WAVEFRONT:
+    case SIAtomicScope::SINGLETHREAD:
+      // No cache to bypass. RMW atomics implicitly bypass the L1 cache.
+      break;
+    default:
+      llvm_unreachable("Unsupported synchronization scope");
+    }
+  }
+
+  return Changed;
+}
+
+bool SIGfx90ACacheControl::enableVolatileAndOrNonTemporal(
+    MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
+    bool IsVolatile, bool IsNonTemporal) const {
+  // Only handle load and store, not atomic read-modify-write insructions. The
+  // latter use glc to indicate if the atomic returns a result and so must not
+  // be used for cache control.
+  assert(MI->mayLoad() ^ MI->mayStore());
+
+  // Only update load and store, not LLVM IR atomic read-modify-write
+  // instructions. The latter are always marked as volatile so cannot sensibly
+  // handle it as do not want to pessimize all atomics. Also they do not support
+  // the nontemporal attribute.
+  assert( Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
+
+  bool Changed = false;
+
+  if (IsVolatile) {
+    if (Op == SIMemOp::LOAD) {
+      Changed |= enableGLCBit(MI);
+    }
+    Changed |= enableSCCBit(MI);
+
+    // Ensure operation has completed at system scope to cause all volatile
+    // operations to be visible outside the program in a global order. Do not
+    // request cross address space as only the global address space can be
+    // observable outside the program, so no need to cause a waitcnt for LDS
+    // address space operations.
+    Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
+                          Position::AFTER);
+
+    return Changed;
+  }
+
+  if (IsNonTemporal) {
+    // Request L1 MISS_EVICT and L2 STREAM for load and store instructions.
+    Changed |= enableGLCBit(MI);
+    Changed |= enableSLCBit(MI);
+    return Changed;
+  }
+
+  return Changed;
+}
+
+bool SIGfx90ACacheControl::insertWait(MachineBasicBlock::iterator &MI,
+                                      SIAtomicScope Scope,
+                                      SIAtomicAddrSpace AddrSpace,
+                                      SIMemOp Op,
+                                      bool IsCrossAddrSpaceOrdering,
+                                      Position Pos) const {
+  if (ST.isTgSplitEnabled()) {
+    // In threadgroup split mode the waves of a work-group can be executing on
+    // 
diff erent CUs. Therefore need to wait for global or GDS memory operations
+    // to complete to ensure they are visible to waves in the other CUs.
+    // Otherwise in non-threadgroup split mode all waves of a work-group are on
+    // the same CU, so no need to wait for global memory as all waves in the
+    // work-group access the same the L1, nor wait for GDS as access are ordered
+    // on a CU.
+    if (((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH |
+                       SIAtomicAddrSpace::GDS)) != SIAtomicAddrSpace::NONE) &&
+        (Scope == SIAtomicScope::WORKGROUP)) {
+      // Same as GFX7 using agent scope.
+      Scope = SIAtomicScope::AGENT;
+    }
+    // In threadgroup split mode LDS cannot be allocated so no need to wait for
+    // LDS memory operations.
+    AddrSpace &= ~SIAtomicAddrSpace::LDS;
+  }
+  return SIGfx7CacheControl::insertWait(MI, Scope, AddrSpace, Op,
+                                        IsCrossAddrSpaceOrdering, Pos);
+}
+
+bool SIGfx90ACacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
+                                         SIAtomicScope Scope,
+                                         SIAtomicAddrSpace AddrSpace,
+                                         Position Pos) const {
+  if (!InsertCacheInv)
+    return false;
+
+  bool Changed = false;
+
+  MachineBasicBlock &MBB = *MI->getParent();
+  DebugLoc DL = MI->getDebugLoc();
+
+  if (Pos == Position::AFTER)
+    ++MI;
+
+  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
+    switch (Scope) {
+    case SIAtomicScope::SYSTEM:
+      // Ensures that following loads will not see stale remote VMEM data or
+      // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and
+      // CC will never be stale due to the local memory probes.
+      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INVL2));
+      // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the
+      // hardware does not reorder memory operations by the same wave with
+      // respect to a preceding "BUFFER_INVL2". The invalidate is guaranteed to
+      // remove any cache lines of earlier writes by the same wave and ensures
+      // later reads by the same wave will refetch the cache lines.
+      Changed = true;
+      break;
+    case SIAtomicScope::AGENT:
+      // Same as GFX7.
+      break;
+    case SIAtomicScope::WORKGROUP:
+      // In threadgroup split mode the waves of a work-group can be executing on
+      // 
diff erent CUs. Therefore need to invalidate the L1 which is per CU.
+      // Otherwise in non-threadgroup split mode all waves of a work-group are
+      // on the same CU, and so the L1 does not need to be invalidated.
+      if (ST.isTgSplitEnabled()) {
+        // Same as GFX7 using agent scope.
+        Scope = SIAtomicScope::AGENT;
+      }
+      break;
+    case SIAtomicScope::WAVEFRONT:
+    case SIAtomicScope::SINGLETHREAD:
+      // Same as GFX7.
+      break;
+    default:
+      llvm_unreachable("Unsupported synchronization scope");
+    }
+  }
+
+  /// The scratch address space does not need the global memory cache
+  /// to be flushed as all memory operations by the same thread are
+  /// sequentially consistent, and no other thread can access scratch
+  /// memory.
+
+  /// Other address spaces do not have a cache.
+
+  if (Pos == Position::AFTER)
+    --MI;
+
+  Changed |= SIGfx7CacheControl::insertAcquire(MI, Scope, AddrSpace, Pos);
+
+  return Changed;
+}
+
+bool SIGfx90ACacheControl::insertRelease(MachineBasicBlock::iterator &MI,
+                                         SIAtomicScope Scope,
+                                         SIAtomicAddrSpace AddrSpace,
+                                         bool IsCrossAddrSpaceOrdering,
+                                         Position Pos) const {
+  bool Changed = false;
+
+  MachineBasicBlock &MBB = *MI->getParent();
+  DebugLoc DL = MI->getDebugLoc();
+
+  if (Pos == Position::AFTER)
+    ++MI;
+
+  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
+    switch (Scope) {
+    case SIAtomicScope::SYSTEM:
+      // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the
+      // hardware does not reorder memory operations by the same wave with
+      // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed
+      // to initiate writeback of any dirty cache lines of earlier writes by the
+      // same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the
+      // writeback has completed.
+      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2));
+      // Followed by same as GFX7, which will ensure the necessary "S_WAITCNT
+      // vmcnt(0)" needed by the "BUFFER_WBL2".
+      Changed = true;
+      break;
+    case SIAtomicScope::AGENT:
+    case SIAtomicScope::WORKGROUP:
+    case SIAtomicScope::WAVEFRONT:
+    case SIAtomicScope::SINGLETHREAD:
+      // Same as GFX7.
+      break;
+    default:
+      llvm_unreachable("Unsupported synchronization scope");
+    }
+  }
+
+  if (Pos == Position::AFTER)
+    --MI;
+
+  Changed |=
+      SIGfx7CacheControl::insertRelease(MI, Scope, AddrSpace,
+                                        IsCrossAddrSpaceOrdering, Pos);
+
+  return Changed;
+}
+
 bool SIGfx10CacheControl::enableLoadCacheBypass(
     const MachineBasicBlock::iterator &MI,
     SIAtomicScope Scope,
@@ -1324,6 +1717,13 @@ bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
   bool Changed = false;
 
   if (MOI.isAtomic()) {
+    if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
+        MOI.getOrdering() == AtomicOrdering::Release ||
+        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
+      Changed |= CC->enableStoreCacheBypass(MI, MOI.getScope(),
+                                            MOI.getOrderingAddrSpace());
+    }
+
     if (MOI.getOrdering() == AtomicOrdering::Release ||
         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
       Changed |= CC->insertRelease(MI, MOI.getScope(),
@@ -1392,6 +1792,15 @@ bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
   bool Changed = false;
 
   if (MOI.isAtomic()) {
+    if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
+        MOI.getOrdering() == AtomicOrdering::Acquire ||
+        MOI.getOrdering() == AtomicOrdering::Release ||
+        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
+        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
+      Changed |= CC->enableRMWCacheBypass(MI, MOI.getScope(),
+                                          MOI.getInstrAddrSpace());
+    }
+
     if (MOI.getOrdering() == AtomicOrdering::Release ||
         MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||

diff  --git a/llvm/lib/Target/AMDGPU/SIProgramInfo.h b/llvm/lib/Target/AMDGPU/SIProgramInfo.h
index 9b72d0829d80..b13afceba20e 100644
--- a/llvm/lib/Target/AMDGPU/SIProgramInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIProgramInfo.h
@@ -41,10 +41,13 @@ struct SIProgramInfo {
     uint32_t ScratchBlocks = 0;
 
     uint64_t ComputePGMRSrc2 = 0;
+    uint64_t ComputePGMRSrc3GFX90A = 0;
 
     uint32_t NumVGPR = 0;
     uint32_t NumArchVGPR = 0;
     uint32_t NumAccVGPR = 0;
+    uint32_t AccumOffset = 0;
+    uint32_t TgSplit = 0;
     uint32_t NumSGPR = 0;
     uint32_t LDSSize = 0;
     bool FlatUsed = false;

diff  --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index 44d23ebc4f36..bb2ac725cab9 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -122,7 +122,9 @@ const MCPhysReg *SIRegisterInfo::getCalleeSavedRegs(
   case CallingConv::Fast:
   case CallingConv::Cold:
   case CallingConv::AMDGPU_Gfx:
-    return CSR_AMDGPU_HighRegs_SaveList;
+    return MF->getSubtarget<GCNSubtarget>().hasGFX90AInsts()
+        ? CSR_AMDGPU_HighRegs_With_AGPRs_SaveList
+        : CSR_AMDGPU_HighRegs_SaveList;
   default: {
     // Dummy to not crash RegisterClassInfo.
     static const MCPhysReg NoCalleeSavedReg = AMDGPU::NoRegister;
@@ -143,7 +145,9 @@ const uint32_t *SIRegisterInfo::getCallPreservedMask(const MachineFunction &MF,
   case CallingConv::Fast:
   case CallingConv::Cold:
   case CallingConv::AMDGPU_Gfx:
-    return CSR_AMDGPU_HighRegs_RegMask;
+    return MF.getSubtarget<GCNSubtarget>().hasGFX90AInsts()
+        ? CSR_AMDGPU_HighRegs_With_AGPRs_RegMask
+        : CSR_AMDGPU_HighRegs_RegMask;
   default:
     return nullptr;
   }
@@ -181,6 +185,14 @@ const uint32_t *SIRegisterInfo::getAllVGPRRegMask() const {
   return CSR_AMDGPU_AllVGPRs_RegMask;
 }
 
+const uint32_t *SIRegisterInfo::getAllAGPRRegMask() const {
+  return CSR_AMDGPU_AllAGPRs_RegMask;
+}
+
+const uint32_t *SIRegisterInfo::getAllVectorRegMask() const {
+  return CSR_AMDGPU_AllVectorRegs_RegMask;
+}
+
 const uint32_t *SIRegisterInfo::getAllAllocatableSRegMask() const {
   return CSR_AMDGPU_AllAllocatableSRegs_RegMask;
 }
@@ -263,6 +275,12 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
   }
 
   unsigned MaxNumVGPRs = ST.getMaxNumVGPRs(MF);
+  // TODO: In an entry function without calls and AGPRs used it is possible
+  //       to use the whole register budget for VGPRs. Even more it shall
+  //       be possible to estimate maximum AGPR/VGPR pressure and split
+  //       register file accordingly.
+  if (ST.hasGFX90AInsts())
+    MaxNumVGPRs /= 2;
   unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();
   for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i) {
     unsigned Reg = AMDGPU::VGPR_32RegClass.getRegister(i);
@@ -327,6 +345,13 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
     reserveRegisterTuples(Reserved, Reg);
   }
 
+  if (ST.hasGFX90AInsts())
+    for (const TargetRegisterClass *RC : this->regclasses())
+      if (getRegSizeInBits(*RC) > 32 && hasVectorRegisters(RC))
+        for (unsigned Reg : *RC)
+          if (getEncodingValue(Reg) & 1)
+            Reserved.set(Reg);
+
   // FIXME: Stop using reserved registers for this.
   for (MCPhysReg Reg : MFI->getAGPRSpillVGPRs())
     reserveRegisterTuples(Reserved, Reg);
@@ -730,6 +755,7 @@ static bool buildMUBUFOffsetLoadStore(const GCNSubtarget &ST,
           .addImm(0) // tfe
           .addImm(0) // dlc
           .addImm(0) // swz
+          .addImm(0) // scc
           .cloneMemRefs(*MI);
 
   const MachineOperand *VDataIn = TII->getNamedOperand(*MI,
@@ -798,7 +824,8 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI,
   MCRegister SOffset = ScratchOffsetReg;
 
   const TargetRegisterClass *RC = getRegClassForReg(MF->getRegInfo(), ValueReg);
-  const bool IsAGPR = hasAGPRs(RC);
+  // On gfx90a+ AGPR is a regular VGPR acceptable for loads and stores.
+  const bool IsAGPR = !ST.hasGFX90AInsts() && hasAGPRs(RC);
   const unsigned RegWidth = AMDGPU::getRegBitWidth(RC->getID()) / 8;
 
   // Always use 4 byte operations for AGPRs because we need to scavenge
@@ -996,6 +1023,7 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI,
     if (!IsFlat)
       MIB.addImm(0) // dlc
          .addImm(0); // swz
+    MIB.addImm(0); // scc
     MIB.addMemOperand(NewMMO);
 
     if (!IsAGPR && NeedSuperRegDef)
@@ -2055,6 +2083,13 @@ bool SIRegisterInfo::shouldCoalesce(MachineInstr *MI,
   unsigned DstSize = getRegSizeInBits(*DstRC);
   unsigned NewSize = getRegSizeInBits(*NewRC);
 
+  // Do not allow coalescing between an odd and an even lanes as it will
+  // result in misaligned tuple access.
+  if (ST.hasGFX90AInsts() && !isSGPRClass(NewRC) &&
+      (getChannelFromSubReg(DstSubReg) & 1) !=
+      (getChannelFromSubReg(SubReg) & 1))
+    return false;
+
   // Do not increase size of registers beyond dword, we would need to allocate
   // adjacent registers and constraint regalloc more than needed.
 

diff  --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
index 963da9b3536b..fedac6ab4fcf 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -279,6 +279,8 @@ class SIRegisterInfo final : public AMDGPUGenRegisterInfo {
                                 LiveIntervals *LIS) const;
 
   const uint32_t *getAllVGPRRegMask() const;
+  const uint32_t *getAllAGPRRegMask() const;
+  const uint32_t *getAllVectorRegMask() const;
   const uint32_t *getAllAllocatableSRegMask() const;
 
   // \returns number of 32 bit registers covered by a \p LM

diff  --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
index 5163bdab0e8a..816e844be58f 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
@@ -847,21 +847,36 @@ def VS_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
   let isAllocatable = 0;
 }
 
-def VS_64 : RegisterClass<"AMDGPU", [i64, f64], 32, (add VReg_64, SReg_64)> {
+def VS_64 : RegisterClass<"AMDGPU", [i64, f64, v2f32], 32, (add VReg_64, SReg_64)> {
   let isAllocatable = 0;
 }
 
-def AV_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
+def AV_32 : RegisterClass<"AMDGPU", VGPR_32.RegTypes, 32,
                           (add AGPR_32, VGPR_32)> {
   let isAllocatable = 0;
 }
 
-def AV_64 : RegisterClass<"AMDGPU", [i64, f64, v4f16], 32,
+def AV_64 : RegisterClass<"AMDGPU", VReg_64.RegTypes, 32,
                           (add AReg_64, VReg_64)> {
   let isAllocatable = 0;
 }
 } // End GeneratePressureSet = 0
 
+def AV_96 : RegisterClass<"AMDGPU", VReg_96.RegTypes, 32,
+                          (add AReg_96, VReg_96)> {
+  let isAllocatable = 0;
+}
+
+def AV_128 : RegisterClass<"AMDGPU", VReg_128.RegTypes, 32,
+                          (add AReg_128, VReg_128)> {
+  let isAllocatable = 0;
+}
+
+def AV_160 : RegisterClass<"AMDGPU", VReg_160.RegTypes, 32,
+                          (add AReg_160, VReg_160)> {
+  let isAllocatable = 0;
+}
+
 //===----------------------------------------------------------------------===//
 //  Register operands
 //===----------------------------------------------------------------------===//
@@ -912,21 +927,38 @@ multiclass SIRegOperand32 <string rc, string MatchName, string opType,
   }
 }
 
-multiclass SIRegOperand <string rc, string MatchName, string opType> :
-  SIRegOperand32<rc, MatchName, opType> {
+multiclass SIRegOperand64 <string rc, string MatchName, string opType,
+                           string rc_suffix = "_64", bit Vectors = 1> {
   let OperandNamespace = "AMDGPU" in {
-    def _b64 : RegisterOperand<!cast<RegisterClass>(rc#"_64")> {
+    def _b64 : RegisterOperand<!cast<RegisterClass>(rc#rc_suffix)> {
       let OperandType = opType#"_INT64";
       let ParserMatchClass = RegImmMatcher<MatchName#"B64">;
     }
 
-    def _f64 : RegisterOperand<!cast<RegisterClass>(rc#"_64")> {
+    def _f64 : RegisterOperand<!cast<RegisterClass>(rc#rc_suffix)> {
       let OperandType = opType#"_FP64";
       let ParserMatchClass = RegImmMatcher<MatchName#"F64">;
     }
+
+    foreach _ = BoolToList<Vectors>.ret in
+    def _v2f32 : RegisterOperand<!cast<RegisterClass>(rc#rc_suffix)> {
+      let OperandType = opType#"_V2FP32";
+      let ParserMatchClass = RegImmMatcher<MatchName#"V2FP32">;
+      let DecoderMethod = "decodeOperand_VSrcV232";
+    }
+    foreach _ = BoolToList<Vectors>.ret in
+    def _v2b32 : RegisterOperand<!cast<RegisterClass>(rc#rc_suffix)> {
+      let OperandType = opType#"_V2INT32";
+      let ParserMatchClass = RegImmMatcher<MatchName#"V2INT32">;
+      let DecoderMethod = "decodeOperand_VSrcV232";
+    }
   }
 }
 
+multiclass SIRegOperand <string rc, string MatchName, string opType> :
+  SIRegOperand32<rc, MatchName, opType>,
+  SIRegOperand64<rc, MatchName, opType>;
+
 // FIXME: 64-bit sources can sometimes use 32-bit constants.
 multiclass RegImmOperand <string rc, string MatchName>
   : SIRegOperand<rc, MatchName, "OPERAND_REG_IMM">;
@@ -938,10 +970,18 @@ multiclass RegInlineOperand32 <string rc, string MatchName,
                                string rc_suffix = "_32">
   : SIRegOperand32<rc, MatchName, "OPERAND_REG_INLINE_C", rc_suffix>;
 
+multiclass RegInlineOperand64 <string rc, string MatchName,
+                               string rc_suffix = "_64">
+  : SIRegOperand64<rc, MatchName, "OPERAND_REG_INLINE_C", rc_suffix>;
+
 multiclass RegInlineOperandAC <string rc, string MatchName,
                                string rc_suffix = "_32">
   : SIRegOperand32<rc, MatchName, "OPERAND_REG_INLINE_AC", rc_suffix>;
 
+multiclass RegInlineOperandAC64 <string rc, string MatchName,
+                                 string rc_suffix = "_64">
+  : SIRegOperand64<rc, MatchName, "OPERAND_REG_INLINE_AC", rc_suffix, 0>;
+
 //===----------------------------------------------------------------------===//
 //  SSrc_* Operands with an SGPR or a 32-bit immediate
 //===----------------------------------------------------------------------===//
@@ -1001,6 +1041,13 @@ defm VCSrc : RegInlineOperand<"VS", "VCSrc">;
 //===----------------------------------------------------------------------===//
 
 defm VISrc : RegInlineOperand32<"VGPR", "VISrc">;
+let DecoderMethod = "decodeOperand_VReg_64" in
+defm VISrc_64   : RegInlineOperand64<"VReg", "VISrc_64",   "_64">;
+defm VISrc_128  : RegInlineOperandAC<"VReg", "VISrc_128",  "_128">;
+let DecoderMethod = "decodeOperand_VReg_256" in
+defm VISrc_256  : RegInlineOperand64<"VReg", "VISrc_256",  "_256">;
+defm VISrc_512  : RegInlineOperandAC<"VReg", "VISrc_512",  "_512">;
+defm VISrc_1024 : RegInlineOperandAC<"VReg", "VISrc_1024", "_1024">;
 
 //===----------------------------------------------------------------------===//
 //  AVSrc_* Operands with an AGPR or VGPR
@@ -1016,6 +1063,31 @@ def AVSrc_64 : RegisterOperand<AV_64> {
   let EncoderMethod = "getAVOperandEncoding";
 }
 
+def AVLdSt_32 : RegisterOperand<AV_32> {
+  let DecoderMethod = "DecodeAVLdSt_32RegisterClass";
+  let EncoderMethod = "getAVOperandEncoding";
+}
+
+def AVLdSt_64 : RegisterOperand<AV_64> {
+  let DecoderMethod = "DecodeAVLdSt_64RegisterClass";
+  let EncoderMethod = "getAVOperandEncoding";
+}
+
+def AVLdSt_96 : RegisterOperand<AV_96> {
+  let DecoderMethod = "DecodeAVLdSt_96RegisterClass";
+  let EncoderMethod = "getAVOperandEncoding";
+}
+
+def AVLdSt_128 : RegisterOperand<AV_128> {
+  let DecoderMethod = "DecodeAVLdSt_128RegisterClass";
+  let EncoderMethod = "getAVOperandEncoding";
+}
+
+def AVLdSt_160 : RegisterOperand<AV_160> {
+  let DecoderMethod = "DecodeAVLdSt_160RegisterClass";
+  let EncoderMethod = "getAVOperandEncoding";
+}
+
 //===----------------------------------------------------------------------===//
 //  ACSrc_* Operands with an AGPR or an inline constant
 //===----------------------------------------------------------------------===//
@@ -1024,3 +1096,8 @@ defm AISrc      : RegInlineOperandAC<"AGPR", "AISrc">;
 defm AISrc_128  : RegInlineOperandAC<"AReg", "AISrc_128",  "_128">;
 defm AISrc_512  : RegInlineOperandAC<"AReg", "AISrc_512",  "_512">;
 defm AISrc_1024 : RegInlineOperandAC<"AReg", "AISrc_1024", "_1024">;
+
+let DecoderMethod = "decodeOperand_AReg_64" in
+defm AISrc_64   : RegInlineOperandAC64<"AReg", "AISrc_64",   "_64">;
+let DecoderMethod = "decodeOperand_AReg_256" in
+defm AISrc_256  : RegInlineOperandAC64<"AReg", "AISrc_256",  "_256">;

diff  --git a/llvm/lib/Target/AMDGPU/SISchedule.td b/llvm/lib/Target/AMDGPU/SISchedule.td
index db4a009e08d7..adb79dd93d25 100644
--- a/llvm/lib/Target/AMDGPU/SISchedule.td
+++ b/llvm/lib/Target/AMDGPU/SISchedule.td
@@ -54,10 +54,15 @@ def WriteTrans64    : SchedWrite;
 // Half rate 64-bit instructions.
 def Write64Bit : SchedWrite;
 
+// Integer multiplications.
+def WriteIntMul : SchedWrite;
+
 // mAI multipass instructions.
 def Write2PassMAI  : SchedWrite;
 def Write8PassMAI  : SchedWrite;
 def Write16PassMAI : SchedWrite;
+def Write4PassDGEMM : SchedWrite;
+def Write8PassDGEMM : SchedWrite;
 
 // FIXME: Should there be a class for instructions which are VALU
 // instructions and have VALU rates, but write to the SALU (i.e. VOPC
@@ -80,6 +85,7 @@ class SISchedMachineModel : SchedMachineModel {
 
 def SIFullSpeedModel : SISchedMachineModel;
 def SIQuarterSpeedModel : SISchedMachineModel;
+def SIDPFullSpeedModel : SISchedMachineModel;
 def GFX10SpeedModel : SISchedMachineModel;
 
 // XXX: Are the resource counts correct?
@@ -137,11 +143,13 @@ multiclass SICommonWriteRes {
   def : HWWriteRes<WriteBarrier, [HWBranch], 500>; // XXX: Guessed ???
 
   def : HWVALUWriteRes<Write32Bit,         1>;
-  def : HWVALUWriteRes<Write64Bit,         2>;
   def : HWVALUWriteRes<WriteFloatCvt,      4>;
   def : HWVALUWriteRes<WriteTrans32,       4>;
   def : HWVALUWriteRes<WriteQuarterRate32, 4>;
 
+  def : HWVALUWriteRes<Write4PassDGEMM,    4>;
+  def : HWVALUWriteRes<Write8PassDGEMM,   16>;
+
   let ResourceCycles = [2] in
   def : HWWriteRes<Write2PassMAI,  [HWXDL], 2>;
   let ResourceCycles = [8] in
@@ -150,7 +158,6 @@ multiclass SICommonWriteRes {
   def : HWWriteRes<Write16PassMAI, [HWXDL], 16>;
 
   def : ReadAdvance<MIVGPRRead, -2>;
-  def : InstRW<[Write64Bit, MIReadVGPR], (instregex "^V_ACCVGPR_WRITE_B32_e64$")>;
 
   // Technically mfma reads can be from 0 to 4 cycles but that does not make
   // sense to model because its register setup is huge. In particular if we
@@ -159,10 +166,6 @@ multiclass SICommonWriteRes {
   // need to consume 2 or 4 more vgprs to be initialized before the acc
   // write sequence. Just assume worst case here.
   def : ReadAdvance<MIMFMARead, -4>;
-
-  def : InstRW<[Write2PassMAI,  MIMFMARead], (instregex "^V_MFMA_..._4X4X")>;
-  def : InstRW<[Write8PassMAI,  MIMFMARead], (instregex "^V_MFMA_..._16X16X")>;
-  def : InstRW<[Write16PassMAI, MIMFMARead], (instregex "^V_MFMA_..._32X32X")>;
 }
 
 def PredIsVGPR32Copy : SchedPredicate<[{TII->isVGPRCopy(*MI) && TII->getOpSize(*MI, 0) <= 32}]>;
@@ -176,11 +179,13 @@ let SchedModel = SIFullSpeedModel in {
 
 defm : SICommonWriteRes;
 
-def : HWVALUWriteRes<WriteFloatFMA,   1>;
-def : HWVALUWriteRes<WriteDouble,     4>;
-def : HWVALUWriteRes<WriteDoubleAdd,  2>;
-def : HWVALUWriteRes<WriteDoubleCvt,  4>;
-def : HWVALUWriteRes<WriteTrans64,    4>;
+def : HWVALUWriteRes<Write64Bit,       2>;
+def : HWVALUWriteRes<WriteIntMul,      4>;
+def : HWVALUWriteRes<WriteFloatFMA,    1>;
+def : HWVALUWriteRes<WriteDouble,      4>;
+def : HWVALUWriteRes<WriteDoubleAdd,   2>;
+def : HWVALUWriteRes<WriteDoubleCvt,   4>;
+def : HWVALUWriteRes<WriteTrans64,     4>;
 
 def : InstRW<[WriteCopy], (instrs COPY)>;
 
@@ -190,16 +195,44 @@ let SchedModel = SIQuarterSpeedModel in {
 
 defm : SICommonWriteRes;
 
-def : HWVALUWriteRes<WriteFloatFMA, 16>;
-def : HWVALUWriteRes<WriteDouble,   16>;
-def : HWVALUWriteRes<WriteDoubleAdd, 8>;
-def : HWVALUWriteRes<WriteDoubleCvt, 4>;
-def : HWVALUWriteRes<WriteTrans64,  16>;
+def : HWVALUWriteRes<Write64Bit,       2>;
+def : HWVALUWriteRes<WriteIntMul,      4>;
+def : HWVALUWriteRes<WriteFloatFMA,    16>;
+def : HWVALUWriteRes<WriteDouble,      16>;
+def : HWVALUWriteRes<WriteDoubleAdd,    8>;
+def : HWVALUWriteRes<WriteDoubleCvt,    4>;
+def : HWVALUWriteRes<WriteTrans64,     16>;
 
 def : InstRW<[WriteCopy], (instrs COPY)>;
+def : InstRW<[Write64Bit, MIReadVGPR], (instregex "^V_ACCVGPR_WRITE_B32_e64$")>;
+def : InstRW<[Write2PassMAI,  MIMFMARead], (instregex "^V_MFMA_..._4X4X")>;
+def : InstRW<[Write8PassMAI,  MIMFMARead], (instregex "^V_MFMA_..._16X16X")>;
+def : InstRW<[Write16PassMAI, MIMFMARead], (instregex "^V_MFMA_..._32X32X")>;
 
 }  // End SchedModel = SIQuarterSpeedModel
 
+let SchedModel = SIDPFullSpeedModel in {
+
+defm : SICommonWriteRes;
+
+def : HWVALUWriteRes<WriteFloatFMA,    1>;
+def : HWVALUWriteRes<WriteDouble,      1>;
+def : HWVALUWriteRes<WriteDoubleAdd,   1>;
+def : HWVALUWriteRes<WriteDoubleCvt,   1>;
+def : HWVALUWriteRes<WriteTrans64,     4>;
+def : HWVALUWriteRes<WriteIntMul,      1>;
+def : HWVALUWriteRes<Write64Bit,       1>;
+
+def : InstRW<[WriteCopy], (instrs COPY)>;
+def : InstRW<[Write64Bit], (instregex "^V_ACCVGPR_WRITE_B32_e64$")>;
+def : InstRW<[Write2PassMAI,   MIMFMARead], (instregex "^V_MFMA_.32_4X4X")>;
+def : InstRW<[Write8PassMAI,   MIMFMARead], (instregex "^V_MFMA_.32_16X16X")>;
+def : InstRW<[Write16PassMAI,  MIMFMARead], (instregex "^V_MFMA_.32_32X32X")>;
+def : InstRW<[Write4PassDGEMM, MIMFMARead], (instregex "^V_MFMA_.64_4X4X")>;
+def : InstRW<[Write8PassDGEMM, MIMFMARead], (instregex "^V_MFMA_.64_16X16X")>;
+
+} // End SchedModel = SIDPFullSpeedModel
+
 let SchedModel = GFX10SpeedModel in {
 
 // The latency values are 1 / (operations / cycle).
@@ -213,6 +246,7 @@ def : HWWriteRes<WriteFloatFMA,      [HWVALU, HWRC],   5>;
 def : HWWriteRes<WriteDouble,        [HWVALU, HWRC],   22>;
 def : HWWriteRes<WriteDoubleAdd,     [HWVALU, HWRC],   22>;
 def : HWWriteRes<WriteDoubleCvt,     [HWVALU, HWRC],   22>;
+def : HWWriteRes<WriteIntMul,        [HWVALU, HWRC],   8>;
 def : HWWriteRes<WriteTrans64,       [HWVALU, HWRC],   24>;
 
 def : HWWriteRes<WriteBranch,        [HWBranch],       32>;

diff  --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
index 1deef5080318..7c62282012f9 100644
--- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
+++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -407,6 +407,9 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
       char Flags = 0;
 
       if (TII->isWQM(Opcode)) {
+        // If LOD is not supported WQM is not needed.
+        if (!ST->hasExtendedImageInsts())
+          continue;
         // Sampling instructions don't need to produce results for all pixels
         // in a quad, they just require all inputs of a quad to have been
         // computed for derivatives.

diff  --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 4c1e4dec7ecb..f384719144e5 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -345,7 +345,7 @@ void streamIsaVersion(const MCSubtargetInfo *STI, raw_ostream &Stream) {
          << "gfx"
          << Version.Major
          << Version.Minor
-         << Version.Stepping;
+         << hexdigit(Version.Stepping, true);
 
   if (hasXNACK(*STI))
     Stream << "+xnack";
@@ -402,6 +402,8 @@ unsigned getMinWavesPerEU(const MCSubtargetInfo *STI) {
 
 unsigned getMaxWavesPerEU(const MCSubtargetInfo *STI) {
   // FIXME: Need to take scratch memory into account.
+  if (isGFX90A(*STI))
+    return 8;
   if (!isGFX10Plus(*STI))
     return 10;
   return hasGFX10_3Insts(*STI) ? 16 : 20;
@@ -531,6 +533,9 @@ unsigned getNumSGPRBlocks(const MCSubtargetInfo *STI, unsigned NumSGPRs) {
 
 unsigned getVGPRAllocGranule(const MCSubtargetInfo *STI,
                              Optional<bool> EnableWavefrontSize32) {
+  if (STI->getFeatureBits().test(FeatureGFX90AInsts))
+    return 8;
+
   bool IsWave32 = EnableWavefrontSize32 ?
       *EnableWavefrontSize32 :
       STI->getFeatureBits().test(FeatureWavefrontSize32);
@@ -543,6 +548,8 @@ unsigned getVGPRAllocGranule(const MCSubtargetInfo *STI,
 
 unsigned getVGPREncodingGranule(const MCSubtargetInfo *STI,
                                 Optional<bool> EnableWavefrontSize32) {
+  if (STI->getFeatureBits().test(FeatureGFX90AInsts))
+    return 8;
 
   bool IsWave32 = EnableWavefrontSize32 ?
       *EnableWavefrontSize32 :
@@ -552,12 +559,16 @@ unsigned getVGPREncodingGranule(const MCSubtargetInfo *STI,
 }
 
 unsigned getTotalNumVGPRs(const MCSubtargetInfo *STI) {
+  if (STI->getFeatureBits().test(FeatureGFX90AInsts))
+    return 512;
   if (!isGFX10Plus(*STI))
     return 256;
   return STI->getFeatureBits().test(FeatureWavefrontSize32) ? 1024 : 512;
 }
 
 unsigned getAddressableNumVGPRs(const MCSubtargetInfo *STI) {
+  if (STI->getFeatureBits().test(FeatureGFX90AInsts))
+    return 512;
   return 256;
 }
 
@@ -653,6 +664,11 @@ amdhsa::kernel_descriptor_t getDefaultAmdhsaKernelDescriptor(
     AMDHSA_BITS_SET(KD.compute_pgm_rsrc1,
                     amdhsa::COMPUTE_PGM_RSRC1_MEM_ORDERED, 1);
   }
+  if (AMDGPU::isGFX90A(*STI)) {
+    AMDHSA_BITS_SET(KD.compute_pgm_rsrc3,
+                    amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT,
+                    STI->getFeatureBits().test(FeatureTgSplit) ? 1 : 0);
+  }
   return KD;
 }
 
@@ -1267,6 +1283,10 @@ bool hasGFX10_3Insts(const MCSubtargetInfo &STI) {
   return STI.getFeatureBits()[AMDGPU::FeatureGFX10_3Insts];
 }
 
+bool isGFX90A(const MCSubtargetInfo &STI) {
+  return STI.getFeatureBits()[AMDGPU::FeatureGFX90AInsts];
+}
+
 bool isSGPR(unsigned Reg, const MCRegisterInfo* TRI) {
   const MCRegisterClass SGPRClass = TRI->getRegClass(AMDGPU::SReg_32RegClassID);
   const unsigned FirstSubReg = TRI->getSubReg(Reg, AMDGPU::sub0);
@@ -1374,6 +1394,9 @@ bool isSISrcFPOperand(const MCInstrDesc &Desc, unsigned OpNo) {
   case AMDGPU::OPERAND_REG_INLINE_AC_FP16:
   case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16:
   case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16:
+  case AMDGPU::OPERAND_REG_IMM_V2FP32:
+  case AMDGPU::OPERAND_REG_INLINE_C_V2FP32:
+  case AMDGPU::OPERAND_REG_INLINE_AC_FP64:
     return true;
   default:
     return false;
@@ -1418,16 +1441,19 @@ unsigned getRegBitWidth(unsigned RCID) {
   case AMDGPU::SReg_96RegClassID:
   case AMDGPU::VReg_96RegClassID:
   case AMDGPU::AReg_96RegClassID:
+  case AMDGPU::AV_96RegClassID:
     return 96;
   case AMDGPU::SGPR_128RegClassID:
   case AMDGPU::SReg_128RegClassID:
   case AMDGPU::VReg_128RegClassID:
   case AMDGPU::AReg_128RegClassID:
+  case AMDGPU::AV_128RegClassID:
     return 128;
   case AMDGPU::SGPR_160RegClassID:
   case AMDGPU::SReg_160RegClassID:
   case AMDGPU::VReg_160RegClassID:
   case AMDGPU::AReg_160RegClassID:
+  case AMDGPU::AV_160RegClassID:
     return 160;
   case AMDGPU::SGPR_192RegClassID:
   case AMDGPU::SReg_192RegClassID:

diff  --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index f9378693cf48..246132ef27ec 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -703,6 +703,7 @@ bool isGFX10Plus(const MCSubtargetInfo &STI);
 bool isGCN3Encoding(const MCSubtargetInfo &STI);
 bool isGFX10_BEncoding(const MCSubtargetInfo &STI);
 bool hasGFX10_3Insts(const MCSubtargetInfo &STI);
+bool isGFX90A(const MCSubtargetInfo &STI);
 
 /// Is Reg - scalar register
 bool isSGPR(unsigned Reg, const MCRegisterInfo* TRI);
@@ -746,12 +747,17 @@ inline unsigned getOperandSize(const MCOperandInfo &OpInfo) {
   case AMDGPU::OPERAND_REG_INLINE_C_FP32:
   case AMDGPU::OPERAND_REG_INLINE_AC_INT32:
   case AMDGPU::OPERAND_REG_INLINE_AC_FP32:
+  case AMDGPU::OPERAND_REG_IMM_V2INT32:
+  case AMDGPU::OPERAND_REG_IMM_V2FP32:
+  case AMDGPU::OPERAND_REG_INLINE_C_V2INT32:
+  case AMDGPU::OPERAND_REG_INLINE_C_V2FP32:
     return 4;
 
   case AMDGPU::OPERAND_REG_IMM_INT64:
   case AMDGPU::OPERAND_REG_IMM_FP64:
   case AMDGPU::OPERAND_REG_INLINE_C_INT64:
   case AMDGPU::OPERAND_REG_INLINE_C_FP64:
+  case AMDGPU::OPERAND_REG_INLINE_AC_FP64:
     return 8;
 
   case AMDGPU::OPERAND_REG_IMM_INT16:
@@ -847,6 +853,11 @@ bool splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset, uint32_t &ImmOffset,
                       const GCNSubtarget *Subtarget,
                       Align Alignment = Align(4));
 
+LLVM_READNONE
+inline bool isLegal64BitDPPControl(unsigned DC) {
+  return DC >= DPP::ROW_NEWBCAST_FIRST && DC <= DPP::ROW_NEWBCAST_LAST;
+}
+
 /// \returns true if the intrinsic is divergent
 bool isIntrinsicSourceOfDivergence(unsigned IntrID);
 

diff  --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
index f1e470031982..5ba0723ba24b 100644
--- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
@@ -351,12 +351,12 @@ let SubtargetPredicate = isGFX6GFX7 in {
       VOP1Inst<"v_rsq_legacy_f32", VOP_F32_F32, int_amdgcn_rsq_legacy>;
   } // End TRANS = 1, SchedRW = [WriteTrans32]
 
-  let SchedRW = [WriteDouble] in {
+  let SchedRW = [WriteTrans64] in {
     defm V_RCP_CLAMP_F64 :
       VOP1Inst<"v_rcp_clamp_f64", VOP_F64_F64>;
     defm V_RSQ_CLAMP_F64 :
       VOP1Inst<"v_rsq_clamp_f64", VOP_F64_F64, AMDGPUrsq_clamp>;
-  } // End SchedRW = [WriteDouble]
+  } // End SchedRW = [WriteTrans64]
 } // End SubtargetPredicate = isGFX6GFX7
 
 let SubtargetPredicate = isGFX7GFX8GFX9 in {
@@ -461,6 +461,18 @@ let SubtargetPredicate = isGFX10Plus in {
   } // End Uses = [M0]
 } // End SubtargetPredicate = isGFX10Plus
 
+def VOPProfileAccMov : VOP_NO_EXT<VOP_I32_I32> {
+  let DstRC = RegisterOperand<AGPR_32>;
+  let Src0RC32 = RegisterOperand<AGPR_32>;
+  let Asm32 = " $vdst, $src0";
+}
+
+def V_ACCVGPR_MOV_B32 : VOP1_Pseudo<"v_accvgpr_mov_b32", VOPProfileAccMov, [], 1> {
+  let SubtargetPredicate = isGFX90APlus;
+  let isReMaterializable = 1;
+  let isAsCheapAsAMove = 1;
+}
+
 //===----------------------------------------------------------------------===//
 // Target-specific instruction encodings.
 //===----------------------------------------------------------------------===//
@@ -823,6 +835,8 @@ defm V_SAT_PK_U8_I16     : VOP1_Real_vi<0x4f>;
 defm V_CVT_NORM_I16_F16  : VOP1_Real_vi<0x4d>;
 defm V_CVT_NORM_U16_F16  : VOP1_Real_vi<0x4e>;
 
+defm V_ACCVGPR_MOV_B32   : VOP1Only_Real_vi<0x52>;
+
 // Copy of v_mov_b32 with $vdst as a use operand for use with VGPR
 // indexing mode. vdst can't be treated as a def for codegen purposes,
 // and an implicit use and def of the super register should be added.

diff  --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
index 7a334eaadaed..edcd8595f9e1 100644
--- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
@@ -289,28 +289,30 @@ class VOP_MADMK <ValueType vt> : VOPProfile <[vt, vt, vt, vt]> {
 def VOP_MADMK_F16 : VOP_MADMK <f16>;
 def VOP_MADMK_F32 : VOP_MADMK <f32>;
 
+class getRegisterOperandForVT<ValueType VT> {
+  RegisterOperand ret = RegisterOperand<getVregSrcForVT<VT>.ret>;
+}
+
 // FIXME: Remove src2_modifiers. It isn't used, so is wasting memory
 // and processing time but it makes it easier to convert to mad.
 class VOP_MAC <ValueType vt0, ValueType vt1=vt0> : VOPProfile <[vt0, vt1, vt1, vt0]> {
-  let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1, VGPR_32:$src2);
-  let Ins64 = getIns64<Src0RC64, Src1RC64, RegisterOperand<VGPR_32>, 3,
+  let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1, getVregSrcForVT<Src2VT>.ret:$src2);
+  let Ins64 = getIns64<Src0RC64, Src1RC64, getRegisterOperandForVT<Src2VT>.ret, 3,
                        0, HasModifiers, HasModifiers, HasOMod,
                        Src0Mod, Src1Mod, Src2Mod>.ret;
   let InsDPP = (ins Src0ModDPP:$src0_modifiers, Src0DPP:$src0,
                     Src1ModDPP:$src1_modifiers, Src1DPP:$src1,
-                    VGPR_32:$src2, // stub argument
+                    getVregSrcForVT<Src2VT>.ret:$src2, // stub argument
                     dpp_ctrl:$dpp_ctrl, row_mask:$row_mask,
                     bank_mask:$bank_mask, bound_ctrl:$bound_ctrl);
   let InsDPP16 = !con(InsDPP, (ins FI:$fi));
-
   let InsDPP8 = (ins Src0ModDPP:$src0_modifiers, Src0DPP:$src0,
                      Src1ModDPP:$src1_modifiers, Src1DPP:$src1,
-                     VGPR_32:$src2, // stub argument
+                     getVregSrcForVT<Src2VT>.ret:$src2, // stub argument
                      dpp8:$dpp8, FI:$fi);
-
   let InsSDWA = (ins Src0ModSDWA:$src0_modifiers, Src0SDWA:$src0,
                      Src1ModSDWA:$src1_modifiers, Src1SDWA:$src1,
-                     VGPR_32:$src2, // stub argument
+                     getVregSrcForVT<Src2VT>.ret:$src2, // stub argument
                      clampmod:$clamp, omod:$omod,
                      dst_sel:$dst_sel, dst_unused:$dst_unused,
                      src0_sel:$src0_sel, src1_sel:$src1_sel);
@@ -335,6 +337,8 @@ def VOP_MAC_F16 : VOP_MAC <f16>;
 def VOP_MAC_F32 : VOP_MAC <f32>;
 let HasExtDPP = 0 in
 def VOP_MAC_LEGACY_F32 : VOP_MAC <f32>;
+let HasExtSDWA = 0, HasExt64BitDPP = 1 in
+def VOP_MAC_F64 : VOP_MAC <f64>;
 
 class VOP_DOT_ACC<ValueType vt0, ValueType vt1> : VOP_MAC<vt0, vt1> {
   let HasClamp = 0;
@@ -448,6 +452,7 @@ def VOP_READLANE : VOPProfile<[i32, i32, i32]> {
 
   let HasExt = 0;
   let HasExtDPP = 0;
+  let HasExt64BitDPP = 0;
   let HasExtSDWA = 0;
   let HasExtSDWA9 = 0;
 }
@@ -464,6 +469,7 @@ def VOP_WRITELANE : VOPProfile<[i32, i32, i32, i32]> {
 
   let HasExt = 0;
   let HasExtDPP = 0;
+  let HasExt64BitDPP = 0;
   let HasExtSDWA = 0;
   let HasExtSDWA9 = 0;
 }
@@ -692,6 +698,14 @@ defm V_FMAC_LEGACY_F32 : VOP2Inst <"v_fmac_legacy_f32", VOP_MAC_LEGACY_F32>;
 
 } // End SubtargetPredicate = HasFmaLegacy32
 
+let SubtargetPredicate = isGFX90APlus,
+    Constraints = "$vdst = $src2",
+    DisableEncoding="$src2",
+    isConvertibleToThreeAddress = 1,
+    isCommutable = 1,
+    SchedRW = [WriteDoubleAdd] in
+defm V_FMAC_F64 : VOP2Inst <"v_fmac_f64", VOP_MAC_F64>;
+
 let Constraints = "$vdst = $src2",
       DisableEncoding="$src2",
       isConvertibleToThreeAddress = 1,
@@ -1525,6 +1539,7 @@ defm V_CNDMASK_B32        : VOP2_Real_e32e64_vi <0x0>;
 defm V_ADD_F32            : VOP2_Real_e32e64_vi <0x1>;
 defm V_SUB_F32            : VOP2_Real_e32e64_vi <0x2>;
 defm V_SUBREV_F32         : VOP2_Real_e32e64_vi <0x3>;
+let AssemblerPredicate = isGCN3ExcludingGFX90A in
 defm V_MUL_LEGACY_F32     : VOP2_Real_e32e64_vi <0x4>;
 defm V_MUL_F32            : VOP2_Real_e32e64_vi <0x5>;
 defm V_MUL_I32_I24        : VOP2_Real_e32e64_vi <0x6>;
@@ -1641,6 +1656,40 @@ defm V_XNOR_B32 : VOP2_Real_e32e64_vi <0x3d>;
 
 } // End SubtargetPredicate = HasDLInsts
 
+let AssemblerPredicate = isGFX90APlus, DecoderNamespace = "GFX90A" in {
+  multiclass VOP2_Real_e32_gfx90a <bits<6> op> {
+    def _e32_gfx90a :
+      VOP2_Real<!cast<VOP2_Pseudo>(NAME#"_e32"), SIEncodingFamily.GFX90A>,
+      VOP2e<op{5-0}, !cast<VOP2_Pseudo>(NAME#"_e32").Pfl>;
+  }
+
+  multiclass VOP2_Real_e64_gfx90a <bits<10> op> {
+    def _e64_gfx90a :
+      VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.GFX90A>,
+      VOP3e_vi <op, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl>;
+  }
+
+  multiclass Base_VOP2_Real_e32e64_gfx90a <bits<6> op> :
+    VOP2_Real_e32_gfx90a<op>,
+    VOP2_Real_e64_gfx90a<{0, 1, 0, 0, op{5-0}}>;
+
+  multiclass VOP2_Real_e32e64_gfx90a <bits<6> op> :
+    Base_VOP2_Real_e32e64_gfx90a<op> {
+
+    foreach _ = BoolToList<!cast<VOP2_Pseudo>(NAME#"_e32").Pfl.HasExtDPP>.ret in
+      def _dpp_gfx90a :
+        VOP_DPP_Real<!cast<VOP2_DPP_Pseudo>(NAME#"_dpp"), SIEncodingFamily.GFX90A>,
+        VOP2_DPPe<op, !cast<VOP2_DPP_Pseudo>(NAME#"_dpp")> {
+          let DecoderNamespace = "SDWA9";
+        }
+  }
+} // End AssemblerPredicate = isGFX90APlus, DecoderNamespace = "GFX90A"
+
+let SubtargetPredicate = isGFX90APlus in {
+  defm V_FMAC_F64       : VOP2_Real_e32e64_gfx90a <0x4>;
+  defm V_MUL_LEGACY_F32 : VOP2_Real_e64_gfx90a <0x2a1>;
+} // End SubtargetPredicate = isGFX90APlus
+
 multiclass VOP2_Real_DOT_ACC_gfx9<bits<6> op> : VOP2_Real_e32_vi<op> {
   def _dpp_vi : VOP2_DPP<op, !cast<VOP2_DPP_Pseudo>(NAME#"_dpp")>;
 }

diff  --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index 42dc995609f0..6fd9ae235b35 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -325,12 +325,12 @@ defm V_MIN_F64 : VOP3Inst <"v_min_f64", VOP3_Profile<VOP_F64_F64_F64>, fminnum_l
 defm V_MAX_F64 : VOP3Inst <"v_max_f64", VOP3_Profile<VOP_F64_F64_F64>, fmaxnum_like, 1>;
 } // End SchedRW = [WriteDoubleAdd]
 
-let SchedRW = [WriteQuarterRate32] in {
+let SchedRW = [WriteIntMul] in {
 defm V_MUL_LO_U32 : VOP3Inst <"v_mul_lo_u32", VOP3_Profile<VOP_I32_I32_I32>, mul>;
 defm V_MUL_HI_U32 : VOP3Inst <"v_mul_hi_u32", VOP3_Profile<VOP_I32_I32_I32>, mulhu>;
 defm V_MUL_LO_I32 : VOP3Inst <"v_mul_lo_i32", VOP3_Profile<VOP_I32_I32_I32>>;
 defm V_MUL_HI_I32 : VOP3Inst <"v_mul_hi_i32", VOP3_Profile<VOP_I32_I32_I32>, mulhs>;
-} // End SchedRW = [WriteQuarterRate32]
+} // End SchedRW = [WriteIntMul]
 
 let Uses = [MODE, VCC, EXEC] in {
 // v_div_fmas_f32:
@@ -447,10 +447,10 @@ defm V_MQSAD_U32_U8 : VOP3Inst <"v_mqsad_u32_u8", VOP3_Profile<VOP_V4I32_I64_I32
 } // End Constraints = "@earlyclobber $vdst", SchedRW = [WriteQuarterRate32]
 
 let isCommutable = 1 in {
-let SchedRW = [WriteQuarterRate32, WriteSALU] in {
+let SchedRW = [WriteIntMul, WriteSALU] in {
 defm V_MAD_U64_U32 : VOP3Inst <"v_mad_u64_u32", VOP3b_I64_I1_I32_I32_I64>;
 defm V_MAD_I64_I32 : VOP3Inst <"v_mad_i64_i32", VOP3b_I64_I1_I32_I32_I64>;
-} // End SchedRW = [WriteQuarterRate32, WriteSALU]
+} // End SchedRW = [WriteIntMul, WriteSALU]
 } // End isCommutable = 1
 
 } // End SubtargetPredicate = isGFX7Plus
@@ -476,6 +476,7 @@ let renamedInGFX9 = 1 in {
   let FPDPRounding = 1 in {
     defm V_MAD_F16 : VOP3Inst <"v_mad_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, fmad>;
     let Uses = [MODE, M0, EXEC] in {
+    let OtherPredicates = [isNotGFX90APlus] in
     // For some reason the intrinsic operands are in a 
diff erent order
     // from the instruction operands.
     def V_INTERP_P2_F16 : VOP3Interp <"v_interp_p2_f16", VOP3_INTERP16<[f16, f32, i32, f32]>,
@@ -497,24 +498,24 @@ let SubtargetPredicate = isGFX9Only, FPDPRounding = 1 in {
 let SubtargetPredicate = isGFX9Plus in {
 defm V_MAD_U16_gfx9   : VOP3Inst <"v_mad_u16_gfx9", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_OPSEL>>;
 defm V_MAD_I16_gfx9   : VOP3Inst <"v_mad_i16_gfx9", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_OPSEL>>;
+let OtherPredicates = [isNotGFX90APlus] in
 def V_INTERP_P2_F16_gfx9 : VOP3Interp <"v_interp_p2_f16_gfx9", VOP3_INTERP16<[f16, f32, i32, f32]>>;
 } // End SubtargetPredicate = isGFX9Plus
 
-let Uses = [MODE, M0, EXEC], FPDPRounding = 1 in {
+// This predicate should only apply to the selection pattern. The
+// instruction still exists and should decode on subtargets with
+// other bank counts.
+let OtherPredicates = [isNotGFX90APlus, has32BankLDS], Uses = [MODE, M0, EXEC], FPDPRounding = 1 in {
 def V_INTERP_P1LL_F16 : VOP3Interp <"v_interp_p1ll_f16", VOP3_INTERP16<[f32, f32, i32, untyped]>,
        [(set f32:$vdst, (int_amdgcn_interp_p1_f16 (VOP3Mods f32:$src0, i32:$src0_modifiers),
                                                   (i32 timm:$attrchan),
                                                   (i32 timm:$attr),
-                                                  (i1 timm:$high), M0))]> {
-  // This predicate should only apply to the selection pattern. The
-  // instruction still exists and should decode on subtargets with
-  // other bank counts.
-  let OtherPredicates = [has32BankLDS];
-}
-
+                                                  (i1 timm:$high), M0))]>;
+} // End OtherPredicates = [isNotGFX90APlus, has32BankLDS], Uses = [MODE, M0, EXEC], FPDPRounding = 1
 
+let OtherPredicates = [isNotGFX90APlus], Uses = [MODE, M0, EXEC], FPDPRounding = 1 in {
 def V_INTERP_P1LV_F16 : VOP3Interp <"v_interp_p1lv_f16", VOP3_INTERP16<[f32, f32, i32, f16]>>;
-} // End Uses = [MODE, M0, EXEC], FPDPRounding = 1
+} // End OtherPredicates = [isNotGFX90APlus], Uses = [MODE, M0, EXEC], FPDPRounding = 1
 
 } // End SubtargetPredicate = Has16BitInsts, isCommutable = 1
 
@@ -527,11 +528,11 @@ def : GCNPat<
       ), VGPR_32)), sub1)
 >;
 
-let SubtargetPredicate = isGFX8Plus, Uses = [MODE, M0, EXEC] in {
+let SubtargetPredicate = isGFX8Plus, Uses = [MODE, M0, EXEC], OtherPredicates = [isNotGFX90APlus] in {
 def V_INTERP_P1_F32_e64  : VOP3Interp <"v_interp_p1_f32", VOP3_INTERP>;
 def V_INTERP_P2_F32_e64  : VOP3Interp <"v_interp_p2_f32", VOP3_INTERP>;
 def V_INTERP_MOV_F32_e64 : VOP3Interp <"v_interp_mov_f32", VOP3_INTERP_MOV>;
-} // End SubtargetPredicate = isGFX8Plus, Uses = [MODE, M0, EXEC]
+} // End SubtargetPredicate = isGFX8Plus, Uses = [MODE, M0, EXEC], OtherPredicates = [isNotGFX90APlus]
 
 let Predicates = [Has16BitInsts, isGFX6GFX7GFX8GFX9] in {
 

diff  --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index b06addf4c927..073e4dff4578 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -340,9 +340,16 @@ foreach Type = ["U", "I"] in
     (!cast<VOP3PInst>("V_DOT8_"#Type#"32_"#Type#4) (i32 8), $src0, (i32 8), $src1, (i32 8), $src2, (i1 0))>;
 
 def ADst_32   : VOPDstOperand<AGPR_32>;
+def ADst_64   : VOPDstOperand<AReg_64>;
 def ADst_128  : VOPDstOperand<AReg_128>;
+def ADst_256  : VOPDstOperand<AReg_256>;
 def ADst_512  : VOPDstOperand<AReg_512>;
 def ADst_1024 : VOPDstOperand<AReg_1024>;
+def VDst_64   : VOPDstOperand<VReg_64>;
+def VDst_128  : VOPDstOperand<VReg_128>;
+def VDst_256  : VOPDstOperand<VReg_256>;
+def VDst_512  : VOPDstOperand<VReg_512>;
+def VDst_1024 : VOPDstOperand<VReg_1024>;
 
 def VOPProfileAccRead : VOP3_Profile<VOP_I32_I32, VOP3_MAI> {
   let Src0RC64 = ARegSrc_32;
@@ -362,6 +369,9 @@ class VOPProfileMAI<VOPProfile P, RegisterOperand _SrcRC, RegisterOperand _DstRC
   let Src2RC64 = _SrcRC;
   let HasOpSel = 0;
   let HasClamp = 0;
+  let HasIntClamp = 0;
+  let HasOMod = 0;
+  let HasModifiers = 0;
   let Asm64 = " $vdst, $src0, $src1, $src2$cbsz$abid$blgp";
   let Ins64 = (ins Src0RC64:$src0, Src1RC64:$src1, Src2RC64:$src2, cbsz:$cbsz, abid:$abid, blgp:$blgp);
 }
@@ -378,6 +388,29 @@ def VOPProfileMAI_F32_V2I16_X32 : VOPProfileMAI<VOP_V32F32_V2I16_V2I16_V32F32, A
 def VOPProfileMAI_F32_V4F16_X4  : VOPProfileMAI<VOP_V4F32_V4F16_V4F16_V4F32,   AISrc_128_b32,  ADst_128,  AVSrc_64>;
 def VOPProfileMAI_F32_V4F16_X16 : VOPProfileMAI<VOP_V16F32_V4F16_V4F16_V16F32, AISrc_512_b32,  ADst_512,  AVSrc_64>;
 def VOPProfileMAI_F32_V4F16_X32 : VOPProfileMAI<VOP_V32F32_V4F16_V4F16_V32F32, AISrc_1024_b32, ADst_1024, AVSrc_64>;
+def VOPProfileMAI_F32_V4I16_X4  : VOPProfileMAI<VOP_V4F32_V4I16_V4I16_V4F32,   AISrc_128_b32,  ADst_128,  AVSrc_64>;
+def VOPProfileMAI_F32_V4I16_X16 : VOPProfileMAI<VOP_V16F32_V4I16_V4I16_V16F32, AISrc_512_b32,  ADst_512,  AVSrc_64>;
+def VOPProfileMAI_F32_V4I16_X32 : VOPProfileMAI<VOP_V32F32_V4I16_V4I16_V32F32, AISrc_1024_b32, ADst_1024, AVSrc_64>;
+def VOPProfileMAI_F64_16X16X4F64 : VOPProfileMAI<VOP_V4F64_F64_F64_V4F64,      AISrc_256_f64,  ADst_256,  AVSrc_64>;
+def VOPProfileMAI_F64_4X4X4F64   : VOPProfileMAI<VOP_F64_F64_F64_F64,          AISrc_64_f64,   ADst_64,   AVSrc_64>;
+
+def VOPProfileMAI_F32_F32_X4_VCD     : VOPProfileMAI<VOP_V4F32_F32_F32_V4F32,       VISrc_128_f32,  VDst_128>;
+def VOPProfileMAI_F32_F32_X16_VCD    : VOPProfileMAI<VOP_V16F32_F32_F32_V16F32,     VISrc_512_f32,  VDst_512>;
+def VOPProfileMAI_F32_F32_X32_VCD    : VOPProfileMAI<VOP_V32F32_F32_F32_V32F32,     VISrc_1024_f32, VDst_1024>;
+def VOPProfileMAI_I32_I32_X4_VCD     : VOPProfileMAI<VOP_V4I32_I32_I32_V4I32,       VISrc_128_b32,  VDst_128>;
+def VOPProfileMAI_I32_I32_X16_VCD    : VOPProfileMAI<VOP_V16I32_I32_I32_V16I32,     VISrc_512_b32,  VDst_512>;
+def VOPProfileMAI_I32_I32_X32_VCD    : VOPProfileMAI<VOP_V32I32_I32_I32_V32I32,     VISrc_1024_b32, VDst_1024>;
+def VOPProfileMAI_F32_V2I16_X4_VCD   : VOPProfileMAI<VOP_V4F32_V2I16_V2I16_V4F32,   VISrc_128_b32,  VDst_128>;
+def VOPProfileMAI_F32_V2I16_X16_VCD  : VOPProfileMAI<VOP_V16F32_V2I16_V2I16_V16F32, VISrc_512_b32,  VDst_512>;
+def VOPProfileMAI_F32_V2I16_X32_VCD  : VOPProfileMAI<VOP_V32F32_V2I16_V2I16_V32F32, VISrc_1024_b32, VDst_1024>;
+def VOPProfileMAI_F32_V4F16_X4_VCD   : VOPProfileMAI<VOP_V4F32_V4F16_V4F16_V4F32,   VISrc_128_b32,  VDst_128,  AVSrc_64>;
+def VOPProfileMAI_F32_V4F16_X16_VCD  : VOPProfileMAI<VOP_V16F32_V4F16_V4F16_V16F32, VISrc_512_b32,  VDst_512,  AVSrc_64>;
+def VOPProfileMAI_F32_V4F16_X32_VCD  : VOPProfileMAI<VOP_V32F32_V4F16_V4F16_V32F32, VISrc_1024_b32, VDst_1024, AVSrc_64>;
+def VOPProfileMAI_F32_V4I16_X4_VCD   : VOPProfileMAI<VOP_V4F32_V4I16_V4I16_V4F32,   VISrc_128_b32,  VDst_128,  AVSrc_64>;
+def VOPProfileMAI_F32_V4I16_X16_VCD  : VOPProfileMAI<VOP_V16F32_V4I16_V4I16_V16F32, VISrc_512_b32,  VDst_512,  AVSrc_64>;
+def VOPProfileMAI_F32_V4I16_X32_VCD  : VOPProfileMAI<VOP_V32F32_V4I16_V4I16_V32F32, VISrc_1024_b32, VDst_1024, AVSrc_64>;
+def VOPProfileMAI_F64_16X16X4F64_VCD : VOPProfileMAI<VOP_V4F64_F64_F64_V4F64,       VISrc_256_f64,  VDst_256,  AVSrc_64>;
+def VOPProfileMAI_F64_4X4X4F64_VCD   : VOPProfileMAI<VOP_F64_F64_F64_F64,           VISrc_64_f64,   VDst_64,   AVSrc_64>;
 
 let Predicates = [HasMAIInsts] in {
 
@@ -388,32 +421,57 @@ let isAsCheapAsAMove = 1, isReMaterializable = 1 in {
   } // End isMoveImm = 1
 } // End isAsCheapAsAMove = 1, isReMaterializable = 1
 
-// FP32 denorm mode is respected, rounding mode is not. Exceptions are not supported.
-let isConvergent = 1, mayRaiseFPException = 0, ReadsModeReg = 1 in {
-defm V_MFMA_F32_4X4X1F32    : VOP3Inst<"v_mfma_f32_4x4x1f32",    VOPProfileMAI_F32_F32_X4,    int_amdgcn_mfma_f32_4x4x1f32>;
-defm V_MFMA_F32_4X4X4F16    : VOP3Inst<"v_mfma_f32_4x4x4f16",    VOPProfileMAI_F32_V4F16_X4,  int_amdgcn_mfma_f32_4x4x4f16>;
-defm V_MFMA_I32_4X4X4I8     : VOP3Inst<"v_mfma_i32_4x4x4i8",     VOPProfileMAI_I32_I32_X4,    int_amdgcn_mfma_i32_4x4x4i8>;
-defm V_MFMA_F32_4X4X2BF16   : VOP3Inst<"v_mfma_f32_4x4x2bf16",   VOPProfileMAI_F32_V2I16_X4,  int_amdgcn_mfma_f32_4x4x2bf16>;
-defm V_MFMA_F32_16X16X1F32  : VOP3Inst<"v_mfma_f32_16x16x1f32",  VOPProfileMAI_F32_F32_X16,   int_amdgcn_mfma_f32_16x16x1f32>;
-defm V_MFMA_F32_16X16X4F32  : VOP3Inst<"v_mfma_f32_16x16x4f32",  VOPProfileMAI_F32_F32_X4,    int_amdgcn_mfma_f32_16x16x4f32>;
-defm V_MFMA_F32_16X16X4F16  : VOP3Inst<"v_mfma_f32_16x16x4f16",  VOPProfileMAI_F32_V4F16_X16, int_amdgcn_mfma_f32_16x16x4f16>;
-defm V_MFMA_F32_16X16X16F16 : VOP3Inst<"v_mfma_f32_16x16x16f16", VOPProfileMAI_F32_V4F16_X4,  int_amdgcn_mfma_f32_16x16x16f16>;
-defm V_MFMA_I32_16X16X4I8   : VOP3Inst<"v_mfma_i32_16x16x4i8",   VOPProfileMAI_I32_I32_X16,   int_amdgcn_mfma_i32_16x16x4i8>;
-defm V_MFMA_I32_16X16X16I8  : VOP3Inst<"v_mfma_i32_16x16x16i8",  VOPProfileMAI_I32_I32_X4,    int_amdgcn_mfma_i32_16x16x16i8>;
-defm V_MFMA_F32_16X16X2BF16 : VOP3Inst<"v_mfma_f32_16x16x2bf16", VOPProfileMAI_F32_V2I16_X16, int_amdgcn_mfma_f32_16x16x2bf16>;
-defm V_MFMA_F32_16X16X8BF16 : VOP3Inst<"v_mfma_f32_16x16x8bf16", VOPProfileMAI_F32_V2I16_X4,  int_amdgcn_mfma_f32_16x16x8bf16>;
-defm V_MFMA_F32_32X32X1F32  : VOP3Inst<"v_mfma_f32_32x32x1f32",  VOPProfileMAI_F32_F32_X32,   int_amdgcn_mfma_f32_32x32x1f32>;
-defm V_MFMA_F32_32X32X2F32  : VOP3Inst<"v_mfma_f32_32x32x2f32",  VOPProfileMAI_F32_F32_X16,   int_amdgcn_mfma_f32_32x32x2f32>;
-defm V_MFMA_F32_32X32X4F16  : VOP3Inst<"v_mfma_f32_32x32x4f16",  VOPProfileMAI_F32_V4F16_X32, int_amdgcn_mfma_f32_32x32x4f16>;
-defm V_MFMA_F32_32X32X8F16  : VOP3Inst<"v_mfma_f32_32x32x8f16",  VOPProfileMAI_F32_V4F16_X16, int_amdgcn_mfma_f32_32x32x8f16>;
-defm V_MFMA_I32_32X32X4I8   : VOP3Inst<"v_mfma_i32_32x32x4i8",   VOPProfileMAI_I32_I32_X32,   int_amdgcn_mfma_i32_32x32x4i8>;
-defm V_MFMA_I32_32X32X8I8   : VOP3Inst<"v_mfma_i32_32x32x8i8",   VOPProfileMAI_I32_I32_X16,   int_amdgcn_mfma_i32_32x32x8i8>;
-defm V_MFMA_F32_32X32X2BF16 : VOP3Inst<"v_mfma_f32_32x32x2bf16", VOPProfileMAI_F32_V2I16_X32, int_amdgcn_mfma_f32_32x32x2bf16>;
-defm V_MFMA_F32_32X32X4BF16 : VOP3Inst<"v_mfma_f32_32x32x4bf16", VOPProfileMAI_F32_V2I16_X16, int_amdgcn_mfma_f32_32x32x4bf16>;
-} // End isConvergent = 1, mayRaiseFPException = 0, ReadsModeReg = 1
+multiclass MAIInst<string OpName, string P, SDPatternOperator node> {
+  let isConvergent = 1, mayRaiseFPException = 0, ReadsModeReg = 1 in {
+    // FP32 denorm mode is respected, rounding mode is not. Exceptions are not supported.
+    defm "" : VOP3Inst<OpName, !cast<VOPProfileMAI>("VOPProfileMAI_" # P), node>;
+
+    let SubtargetPredicate = isGFX90APlus, Mnemonic = OpName in
+    defm _vgprcd : VOP3Inst<OpName # "_vgprcd", !cast<VOPProfileMAI>("VOPProfileMAI_" # P # "_VCD")>;
+  } // End isConvergent = 1, mayRaiseFPException = 0, ReadsModeReg = 1
+}
+
+defm V_MFMA_F32_4X4X1F32    : MAIInst<"v_mfma_f32_4x4x1f32",    "F32_F32_X4",    int_amdgcn_mfma_f32_4x4x1f32>;
+defm V_MFMA_F32_4X4X4F16    : MAIInst<"v_mfma_f32_4x4x4f16",    "F32_V4F16_X4",  int_amdgcn_mfma_f32_4x4x4f16>;
+defm V_MFMA_I32_4X4X4I8     : MAIInst<"v_mfma_i32_4x4x4i8",     "I32_I32_X4",    int_amdgcn_mfma_i32_4x4x4i8>;
+defm V_MFMA_F32_16X16X1F32  : MAIInst<"v_mfma_f32_16x16x1f32",  "F32_F32_X16",   int_amdgcn_mfma_f32_16x16x1f32>;
+defm V_MFMA_F32_16X16X4F32  : MAIInst<"v_mfma_f32_16x16x4f32",  "F32_F32_X4",    int_amdgcn_mfma_f32_16x16x4f32>;
+defm V_MFMA_F32_16X16X4F16  : MAIInst<"v_mfma_f32_16x16x4f16",  "F32_V4F16_X16", int_amdgcn_mfma_f32_16x16x4f16>;
+defm V_MFMA_F32_16X16X16F16 : MAIInst<"v_mfma_f32_16x16x16f16", "F32_V4F16_X4",  int_amdgcn_mfma_f32_16x16x16f16>;
+defm V_MFMA_I32_16X16X4I8   : MAIInst<"v_mfma_i32_16x16x4i8",   "I32_I32_X16",   int_amdgcn_mfma_i32_16x16x4i8>;
+defm V_MFMA_F32_32X32X1F32  : MAIInst<"v_mfma_f32_32x32x1f32",  "F32_F32_X32",   int_amdgcn_mfma_f32_32x32x1f32>;
+defm V_MFMA_F32_32X32X2F32  : MAIInst<"v_mfma_f32_32x32x2f32",  "F32_F32_X16",   int_amdgcn_mfma_f32_32x32x2f32>;
+defm V_MFMA_F32_32X32X4F16  : MAIInst<"v_mfma_f32_32x32x4f16",  "F32_V4F16_X32", int_amdgcn_mfma_f32_32x32x4f16>;
+defm V_MFMA_F32_32X32X8F16  : MAIInst<"v_mfma_f32_32x32x8f16",  "F32_V4F16_X16", int_amdgcn_mfma_f32_32x32x8f16>;
+defm V_MFMA_I32_32X32X4I8   : MAIInst<"v_mfma_i32_32x32x4i8",   "I32_I32_X32",   int_amdgcn_mfma_i32_32x32x4i8>;
+defm V_MFMA_I32_16X16X16I8  : MAIInst<"v_mfma_i32_16x16x16i8",  "I32_I32_X4",    int_amdgcn_mfma_i32_16x16x16i8>;
+defm V_MFMA_I32_32X32X8I8   : MAIInst<"v_mfma_i32_32x32x8i8",   "I32_I32_X16",   int_amdgcn_mfma_i32_32x32x8i8>;
+defm V_MFMA_F32_4X4X2BF16   : MAIInst<"v_mfma_f32_4x4x2bf16",   "F32_V2I16_X4",  int_amdgcn_mfma_f32_4x4x2bf16>;
+defm V_MFMA_F32_16X16X2BF16 : MAIInst<"v_mfma_f32_16x16x2bf16", "F32_V2I16_X16", int_amdgcn_mfma_f32_16x16x2bf16>;
+defm V_MFMA_F32_16X16X8BF16 : MAIInst<"v_mfma_f32_16x16x8bf16", "F32_V2I16_X4",  int_amdgcn_mfma_f32_16x16x8bf16>;
+defm V_MFMA_F32_32X32X2BF16 : MAIInst<"v_mfma_f32_32x32x2bf16", "F32_V2I16_X32", int_amdgcn_mfma_f32_32x32x2bf16>;
+defm V_MFMA_F32_32X32X4BF16 : MAIInst<"v_mfma_f32_32x32x4bf16", "F32_V2I16_X16", int_amdgcn_mfma_f32_32x32x4bf16>;
 
 } // End SubtargetPredicate = HasMAIInsts
 
+let Predicates = [isGFX90APlus] in {
+  defm V_MFMA_F32_32X32X4BF16_1K  : MAIInst<"v_mfma_f32_32x32x4bf16_1k",  "F32_V4I16_X32",  int_amdgcn_mfma_f32_32x32x4bf16_1k>;
+  defm V_MFMA_F32_16X16X4BF16_1K  : MAIInst<"v_mfma_f32_16x16x4bf16_1k",  "F32_V4I16_X16",  int_amdgcn_mfma_f32_16x16x4bf16_1k>;
+  defm V_MFMA_F32_4X4X4BF16_1K    : MAIInst<"v_mfma_f32_4x4x4bf16_1k",    "F32_V4I16_X4",   int_amdgcn_mfma_f32_4x4x4bf16_1k>;
+  defm V_MFMA_F32_32X32X8BF16_1K  : MAIInst<"v_mfma_f32_32x32x8bf16_1k",  "F32_V4I16_X16",  int_amdgcn_mfma_f32_32x32x8bf16_1k>;
+  defm V_MFMA_F32_16X16X16BF16_1K : MAIInst<"v_mfma_f32_16x16x16bf16_1k", "F32_V4I16_X4",   int_amdgcn_mfma_f32_16x16x16bf16_1k>;
+
+  defm V_MFMA_F64_16X16X4F64      : MAIInst<"v_mfma_f64_16x16x4f64",      "F64_16X16X4F64", int_amdgcn_mfma_f64_16x16x4f64>;
+  defm V_MFMA_F64_4X4X4F64        : MAIInst<"v_mfma_f64_4x4x4f64",        "F64_4X4X4F64",   int_amdgcn_mfma_f64_4x4x4f64>;
+} // End Predicates = [isGFX90APlus]
+
+let SubtargetPredicate = HasPackedFP32Ops, isCommutable = 1 in {
+  def V_PK_FMA_F32 : VOP3PInst<"v_pk_fma_f32", VOP3_Profile<VOP_V2F32_V2F32_V2F32_V2F32, VOP3_PACKED>, any_fma>;
+  def V_PK_MUL_F32 : VOP3PInst<"v_pk_mul_f32", VOP3_Profile<VOP_V2F32_V2F32_V2F32, VOP3_PACKED>, any_fmul>;
+  def V_PK_ADD_F32 : VOP3PInst<"v_pk_add_f32", VOP3_Profile<VOP_V2F32_V2F32_V2F32, VOP3_PACKED>, any_fadd>;
+  def V_PK_MOV_B32 : VOP3PInst<"v_pk_mov_b32", VOP3_Profile<VOP_V2I32_V2I32_V2I32, VOP3_PACKED>>;
+} // End SubtargetPredicate = HasPackedFP32Ops, isCommutable = 1
+
 def : MnemonicAlias<"v_accvgpr_read",  "v_accvgpr_read_b32">;
 def : MnemonicAlias<"v_accvgpr_write", "v_accvgpr_write_b32">;
 
@@ -435,7 +493,7 @@ multiclass VOP3P_Real_vi<bits<7> op> {
 
 multiclass VOP3P_Real_MAI<bits<7> op> {
   def _vi : VOP3P_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.VI>,
-            VOP3Pe_MAI <op, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl> {
+            VOP3Pe_MAI <op, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl, ?> {
     let AssemblerPredicate = HasMAIInsts;
     let DecoderNamespace = "GFX8";
     let Inst{14} = ?; // op_sel_hi(2)
@@ -444,9 +502,21 @@ multiclass VOP3P_Real_MAI<bits<7> op> {
   }
 }
 
-multiclass VOP3P_Real_MFMA<bits<7> op> {
+multiclass VOP3P_Real_MFMA_gfx90a<bits<7> op> {
+  let SubtargetPredicate = isGFX90AOnly,
+      AssemblerPredicate = isGFX90AOnly, DecoderNamespace = "GFX90A" in {
+  def _gfx90a_acd : VOP3P_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.GFX90A>,
+             VOP3Pe_MAI <op, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl, 1>;
+
+  def _gfx90a_vcd : VOP3P_Real<!cast<VOP3_Pseudo>(NAME # "_vgprcd" # "_e64"), SIEncodingFamily.GFX90A>,
+             VOP3Pe_MAI <op, !cast<VOP3_Pseudo>(NAME # "_vgprcd" # "_e64").Pfl, 0>;
+  } // End AssemblerPredicate = isGFX90AOnly, DecoderNamespace = "GFX90A"
+}
+
+multiclass VOP3P_Real_MFMA<bits<7> op> :
+  VOP3P_Real_MFMA_gfx90a <op> {
   def _vi : VOP3P_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.VI>,
-            VOP3Pe_MAI <op, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl> {
+            VOP3Pe_MAI <op, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl, ?> {
     let AssemblerPredicate = HasMAIInsts;
     let DecoderNamespace = "GFX8";
   }
@@ -536,6 +606,21 @@ defm V_MFMA_F32_16X16X8BF16 : VOP3P_Real_MFMA <0x6d>;
 
 } // End SubtargetPredicate = HasMAIInsts
 
+defm V_MFMA_F32_32X32X4BF16_1K  : VOP3P_Real_MFMA_gfx90a <0x63>;
+defm V_MFMA_F32_16X16X4BF16_1K  : VOP3P_Real_MFMA_gfx90a <0x64>;
+defm V_MFMA_F32_4X4X4BF16_1K    : VOP3P_Real_MFMA_gfx90a <0x65>;
+defm V_MFMA_F32_32X32X8BF16_1K  : VOP3P_Real_MFMA_gfx90a <0x66>;
+defm V_MFMA_F32_16X16X16BF16_1K : VOP3P_Real_MFMA_gfx90a <0x67>;
+defm V_MFMA_F64_16X16X4F64      : VOP3P_Real_MFMA_gfx90a <0x6e>;
+defm V_MFMA_F64_4X4X4F64        : VOP3P_Real_MFMA_gfx90a <0x6f>;
+
+let SubtargetPredicate = HasPackedFP32Ops in {
+  defm V_PK_FMA_F32 : VOP3P_Real_vi <0x30>;
+  defm V_PK_MUL_F32 : VOP3P_Real_vi <0x31>;
+  defm V_PK_ADD_F32 : VOP3P_Real_vi <0x32>;
+  defm V_PK_MOV_B32 : VOP3P_Real_vi <0x33>;
+} // End SubtargetPredicate = HasPackedFP32Ops
+
 //===----------------------------------------------------------------------===//
 // GFX10.
 //===----------------------------------------------------------------------===//

diff  --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td
index 282c1002d3c9..382b02144612 100644
--- a/llvm/lib/Target/AMDGPU/VOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td
@@ -333,7 +333,7 @@ class VOP3Pe <bits<7> op, VOPProfile P> : Enc64 {
   let Inst{63}    = !if(P.HasSrc2Mods, src2_modifiers{0}, 0); // neg (lo)
 }
 
-class VOP3Pe_MAI <bits<7> op, VOPProfile P> : Enc64 {
+class VOP3Pe_MAI <bits<7> op, VOPProfile P, bit acc_cd = 0> : Enc64 {
   bits<8> vdst;
   bits<10> src0;
   bits<10> src1;
@@ -341,14 +341,13 @@ class VOP3Pe_MAI <bits<7> op, VOPProfile P> : Enc64 {
   bits<3> blgp;
   bits<3> cbsz;
   bits<4> abid;
-  bits<1> clamp;
 
   let Inst{7-0} = vdst;
 
   let Inst{10-8}  = !if(P.HasSrc1, cbsz, 0);
   let Inst{14-11} = !if(P.HasSrc1, abid, 0);
 
-  let Inst{15} = !if(P.HasClamp, clamp{0}, 0);
+  let Inst{15} = acc_cd;
 
   let Inst{22-16} = op;
   let Inst{31-23} = 0x1a7; //encoding
@@ -628,8 +627,8 @@ class VOP_DPP_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> :
   string AsmOperands = P.AsmDPP;
 
   let AsmMatchConverter = !if(P.HasModifiers, "cvtDPP", "");
-  let SubtargetPredicate = HasDPP;
-  let AssemblerPredicate = HasDPP;
+  let SubtargetPredicate = !if(P.HasExt64BitDPP, Has64BitDPP, HasDPP);
+  let AssemblerPredicate = !if(P.HasExt64BitDPP, Has64BitDPP, HasDPP);
   let AsmVariantName = !if(P.HasExtDPP, AMDGPUAsmVariants.DPP,
                                         AMDGPUAsmVariants.Disable);
   let Constraints = !if(P.NumSrcArgs, P.TieRegDPP # " = $vdst", "");
@@ -683,8 +682,8 @@ class VOP_DPP <string OpName, VOPProfile P, bit IsDPP16,
   let Size = 8;
 
   let AsmMatchConverter = !if(P.HasModifiers, "cvtDPP", "");
-  let SubtargetPredicate = HasDPP;
-  let AssemblerPredicate = HasDPP;
+  let SubtargetPredicate = !if(P.HasExt64BitDPP, Has64BitDPP, HasDPP);
+  let AssemblerPredicate = !if(P.HasExt64BitDPP, Has64BitDPP, HasDPP);
   let AsmVariantName = !if(P.HasExtDPP, AMDGPUAsmVariants.DPP,
                                         AMDGPUAsmVariants.Disable);
   let Constraints = !if(P.NumSrcArgs, P.TieRegDPP # " = $vdst", "");

diff  --git a/llvm/test/Analysis/CostModel/AMDGPU/fadd.ll b/llvm/test/Analysis/CostModel/AMDGPU/fadd.ll
index c2959bbc62e0..8eb1a0769170 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/fadd.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/fadd.ll
@@ -2,6 +2,7 @@
 ; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=SLOWF64,SLOWF16,ALL %s
 ; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900  -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=FASTF16,SIZEALL,ALL %s
 ; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=SLOWF16,SIZEALL,ALL %s
+; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a  -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=GFX90A-FASTF64,FASTF16,PACKEDF32,ALL %s
 
 ; ALL-LABEL: 'fadd_f32'
 ; ALL: estimated cost of 1 for {{.*}} fadd float
@@ -13,7 +14,8 @@ define amdgpu_kernel void @fadd_f32(float addrspace(1)* %out, float addrspace(1)
 }
 
 ; ALL-LABEL: 'fadd_v2f32'
-; ALL: estimated cost of 2 for {{.*}} fadd <2 x float>
+; NOPACKEDF32: estimated cost of 2 for {{.*}} fadd <2 x float>
+; PACKEDF32: estimated cost of 1 for {{.*}} fadd <2 x float>
 define amdgpu_kernel void @fadd_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr, <2 x float> %b) #0 {
   %vec = load <2 x float>, <2 x float> addrspace(1)* %vaddr
   %add = fadd <2 x float> %vec, %b
@@ -22,7 +24,10 @@ define amdgpu_kernel void @fadd_v2f32(<2 x float> addrspace(1)* %out, <2 x float
 }
 
 ; ALL-LABEL: 'fadd_v3f32'
-; ALL: estimated cost of 3 for {{.*}} fadd <3 x float>
+; Allow for 4 when v3f32 is illegal and TargetLowering thinks it needs widening,
+; and 3 when it is legal.
+; NOPACKEDF32: estimated cost of {{[34]}} for {{.*}} fadd <3 x float>
+; PACKEDF32: estimated cost of 2 for {{.*}} fadd <3 x float>
 define amdgpu_kernel void @fadd_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %vaddr, <3 x float> %b) #0 {
   %vec = load <3 x float>, <3 x float> addrspace(1)* %vaddr
   %add = fadd <3 x float> %vec, %b
@@ -31,7 +36,10 @@ define amdgpu_kernel void @fadd_v3f32(<3 x float> addrspace(1)* %out, <3 x float
 }
 
 ; ALL-LABEL: 'fadd_v5f32'
-; ALL: estimated cost of 5 for {{.*}} fadd <5 x float>
+; Allow for 8 when v5f32 is illegal and TargetLowering thinks it needs widening,
+; and 5 when it is legal.
+; NOPACKEDF32: estimated cost of {{[58]}} for {{.*}} fadd <5 x float>
+; PACKEDF32: estimated cost of 3 for {{.*}} fadd <5 x float>
 define amdgpu_kernel void @fadd_v5f32(<5 x float> addrspace(1)* %out, <5 x float> addrspace(1)* %vaddr, <5 x float> %b) #0 {
   %vec = load <5 x float>, <5 x float> addrspace(1)* %vaddr
   %add = fadd <5 x float> %vec, %b
@@ -40,6 +48,7 @@ define amdgpu_kernel void @fadd_v5f32(<5 x float> addrspace(1)* %out, <5 x float
 }
 
 ; ALL-LABEL: 'fadd_f64'
+; GFX90A-FASTF64: estimated cost of 1 for {{.*}} fadd double
 ; FASTF64: estimated cost of 2 for {{.*}} fadd double
 ; SLOWF64: estimated cost of 4 for {{.*}} fadd double
 ; SIZEALL: estimated cost of 2 for {{.*}} fadd double
@@ -51,6 +60,7 @@ define amdgpu_kernel void @fadd_f64(double addrspace(1)* %out, double addrspace(
 }
 
 ; ALL-LABEL: 'fadd_v2f64'
+; GFX90A-FASTF64: estimated cost of 2 for {{.*}} fadd <2 x double>
 ; FASTF64: estimated cost of 4 for {{.*}} fadd <2 x double>
 ; SLOWF64: estimated cost of 8 for {{.*}} fadd <2 x double>
 ; SIZEALL: estimated cost of 4 for {{.*}} fadd <2 x double>
@@ -62,6 +72,7 @@ define amdgpu_kernel void @fadd_v2f64(<2 x double> addrspace(1)* %out, <2 x doub
 }
 
 ; ALL-LABEL: 'fadd_v3f64'
+; GFX90A-FASTF64: estimated cost of 3 for {{.*}} fadd <3 x double>
 ; FASTF64: estimated cost of 6 for {{.*}} fadd <3 x double>
 ; SLOWF64: estimated cost of 12 for {{.*}} fadd <3 x double>
 ; SIZEALL: estimated cost of 6 for {{.*}} fadd <3 x double>

diff  --git a/llvm/test/Analysis/CostModel/AMDGPU/fma.ll b/llvm/test/Analysis/CostModel/AMDGPU/fma.ll
index 41cbe0fac7f6..c90ca1412eff 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/fma.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/fma.ll
@@ -2,6 +2,7 @@
 ; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=SLOWF64,SLOWF32,SLOWF16,ALL %s
 ; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=ALL,SIZEALL,SIZEF16 %s
 ; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=ALL,SIZEALL,SIZENOF16 %s
+; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=GFX90A-FASTF64,FASTF16,PACKEDF32,ALL %s
 
 ; ALL-LABEL: 'fma_f32'
 ; SLOWF32: estimated cost of 4 for {{.*}} call float @llvm.fma.f32
@@ -16,7 +17,7 @@ define amdgpu_kernel void @fma_f32(float addrspace(1)* %out, float addrspace(1)*
 
 ; ALL-LABEL: 'fma_v2f32'
 ; SLOWF32: estimated cost of 8 for {{.*}} call <2 x float> @llvm.fma.v2f32
-; FASTF32: estimated cost of 4 for {{.*}} call <2 x float> @llvm.fma.v2f32
+; PACKEDF32: estimated cost of 2 for {{.*}} call <2 x float> @llvm.fma.v2f32
 ; SIZEALL: estimated cost of 4 for {{.*}} call <2 x float> @llvm.fma.v2f32
 define amdgpu_kernel void @fma_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr) #0 {
   %vec = load <2 x float>, <2 x float> addrspace(1)* %vaddr
@@ -27,7 +28,7 @@ define amdgpu_kernel void @fma_v2f32(<2 x float> addrspace(1)* %out, <2 x float>
 
 ; ALL-LABEL: 'fma_v3f32'
 ; SLOWF32: estimated cost of 12 for {{.*}} call <3 x float> @llvm.fma.v3f32
-; FASTF32: estimated cost of 6 for {{.*}} call <3 x float> @llvm.fma.v3f32
+; PACKEDF32: estimated cost of 4 for {{.*}} call <3 x float> @llvm.fma.v3f32
 ; SIZEALL: estimated cost of 6 for {{.*}} call <3 x float> @llvm.fma.v3f32
 define amdgpu_kernel void @fma_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %vaddr) #0 {
   %vec = load <3 x float>, <3 x float> addrspace(1)* %vaddr
@@ -38,7 +39,7 @@ define amdgpu_kernel void @fma_v3f32(<3 x float> addrspace(1)* %out, <3 x float>
 
 ; ALL-LABEL: 'fma_v5f32'
 ; SLOWF32: estimated cost of 20 for {{.*}} call <5 x float> @llvm.fma.v5f32
-; FASTF32: estimated cost of 10 for {{.*}} call <5 x float> @llvm.fma.v5f32
+; PACKEDF32: estimated cost of 6 for {{.*}} call <5 x float> @llvm.fma.v5f32
 ; SIZEALL: estimated cost of 10 for {{.*}} call <5 x float> @llvm.fma.v5f32
 define amdgpu_kernel void @fma_v5f32(<5 x float> addrspace(1)* %out, <5 x float> addrspace(1)* %vaddr) #0 {
   %vec = load <5 x float>, <5 x float> addrspace(1)* %vaddr
@@ -49,6 +50,7 @@ define amdgpu_kernel void @fma_v5f32(<5 x float> addrspace(1)* %out, <5 x float>
 
 ; ALL-LABEL: 'fma_f64'
 ; SLOWF64: estimated cost of 4 for {{.*}} call double @llvm.fma.f64
+; GFX90A-FASTF64: estimated cost of 1 for {{.*}} call double @llvm.fma.f64
 ; FASTF64: estimated cost of 2 for {{.*}} call double @llvm.fma.f64
 ; SIZEALL: estimated cost of 2 for {{.*}} call double @llvm.fma.f64
 define amdgpu_kernel void @fma_f64(double addrspace(1)* %out, double addrspace(1)* %vaddr) #0 {
@@ -60,6 +62,7 @@ define amdgpu_kernel void @fma_f64(double addrspace(1)* %out, double addrspace(1
 
 ; ALL-LABEL: 'fma_v2f64'
 ; SLOWF64: estimated cost of 8 for {{.*}} call <2 x double> @llvm.fma.v2f64
+; GFX90A-FASTF64: estimated cost of 2 for {{.*}} call <2 x double> @llvm.fma.v2f64
 ; FASTF64: estimated cost of 4 for {{.*}} call <2 x double> @llvm.fma.v2f64
 ; SIZEALL: estimated cost of 4 for {{.*}} call <2 x double> @llvm.fma.v2f64
 define amdgpu_kernel void @fma_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %vaddr) #0 {

diff  --git a/llvm/test/Analysis/CostModel/AMDGPU/fmul.ll b/llvm/test/Analysis/CostModel/AMDGPU/fmul.ll
index ba855b2665cc..929a51229e5c 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/fmul.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/fmul.ll
@@ -2,6 +2,7 @@
 ; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=SLOWF64,SLOWF16,ALL %s
 ; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=ALL,SIZEALL,FASTF16 %s
 ; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=ALL,SIZEALL,SLOWF16 %s
+; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=GFX90A-FASTF64,FASTF16,PACKEDF32,ALL %s
 
 ; ALL-LABEL: 'fmul_f32'
 ; ALL: estimated cost of 1 for {{.*}} fmul float
@@ -13,7 +14,8 @@ define amdgpu_kernel void @fmul_f32(float addrspace(1)* %out, float addrspace(1)
 }
 
 ; ALL-LABEL: 'fmul_v2f32'
-; ALL: estimated cost of 2 for {{.*}} fmul <2 x float>
+; NOPACKEDF32: estimated cost of 2 for {{.*}} fmul <2 x float>
+; PACKEDF32: estimated cost of 1 for {{.*}} fmul <2 x float>
 define amdgpu_kernel void @fmul_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr, <2 x float> %b) #0 {
   %vec = load <2 x float>, <2 x float> addrspace(1)* %vaddr
   %add = fmul <2 x float> %vec, %b
@@ -22,7 +24,10 @@ define amdgpu_kernel void @fmul_v2f32(<2 x float> addrspace(1)* %out, <2 x float
 }
 
 ; ALL-LABEL: 'fmul_v3f32'
-; ALL: estimated cost of 3 for {{.*}} fmul <3 x float>
+; Allow for 4 when v3f32 is illegal and TargetLowering thinks it needs widening,
+; and 3 when it is legal.
+; NOPACKEDF32: estimated cost of {{[34]}} for {{.*}} fmul <3 x float>
+; PACKEDF32: estimated cost of 2 for {{.*}} fmul <3 x float>
 define amdgpu_kernel void @fmul_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %vaddr, <3 x float> %b) #0 {
   %vec = load <3 x float>, <3 x float> addrspace(1)* %vaddr
   %add = fmul <3 x float> %vec, %b
@@ -31,7 +36,10 @@ define amdgpu_kernel void @fmul_v3f32(<3 x float> addrspace(1)* %out, <3 x float
 }
 
 ; ALL-LABEL: 'fmul_v5f32'
-; ALL: estimated cost of 5 for {{.*}} fmul <5 x float>
+; Allow for 8 when v5f32 is illegal and TargetLowering thinks it needs widening,
+; and 5 when it is legal.
+; NOPACKEDF32: estimated cost of {{[58]}} for {{.*}} fmul <5 x float>
+; PACKEDF32: estimated cost of 3 for {{.*}} fmul <5 x float>
 define amdgpu_kernel void @fmul_v5f32(<5 x float> addrspace(1)* %out, <5 x float> addrspace(1)* %vaddr, <5 x float> %b) #0 {
   %vec = load <5 x float>, <5 x float> addrspace(1)* %vaddr
   %add = fmul <5 x float> %vec, %b
@@ -40,6 +48,7 @@ define amdgpu_kernel void @fmul_v5f32(<5 x float> addrspace(1)* %out, <5 x float
 }
 
 ; ALL-LABEL: 'fmul_f64'
+; GFX90A-FASTF64: estimated cost of 1 for {{.*}} fmul double
 ; FASTF64: estimated cost of 2 for {{.*}} fmul double
 ; SLOWF64: estimated cost of 4 for {{.*}} fmul double
 ; SIZEALL: estimated cost of 2 for {{.*}} fmul double

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll
new file mode 100644
index 000000000000..683d37da2008
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll
@@ -0,0 +1,584 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel < %s -march=amdgcn -mcpu=gfx90a -verify-machineinstrs | FileCheck %s -check-prefix=GFX90A
+
+declare double @llvm.amdgcn.buffer.atomic.fadd.f64(double, <4 x i32>, i32, i32, i1)
+declare double @llvm.amdgcn.struct.buffer.atomic.fadd.f64(double, <4 x i32>, i32, i32, i32, i32 immarg)
+declare double @llvm.amdgcn.raw.buffer.atomic.fadd.f64(double, <4 x i32>, i32, i32, i32 immarg)
+declare double @llvm.amdgcn.struct.buffer.atomic.fmin.f64(double, <4 x i32>, i32, i32, i32, i32 immarg)
+declare double @llvm.amdgcn.raw.buffer.atomic.fmin.f64(double, <4 x i32>, i32, i32, i32 immarg)
+declare double @llvm.amdgcn.struct.buffer.atomic.fmax.f64(double, <4 x i32>, i32, i32, i32, i32 immarg)
+declare double @llvm.amdgcn.raw.buffer.atomic.fmax.f64(double, <4 x i32>, i32, i32, i32 immarg)
+declare double @llvm.amdgcn.global.atomic.fadd.f64.p1f64.f64(double addrspace(1)* %ptr, double %data)
+declare double @llvm.amdgcn.global.atomic.fmin.f64.p1f64.f64(double addrspace(1)* %ptr, double %data)
+declare double @llvm.amdgcn.global.atomic.fmax.f64.p1f64.f64(double addrspace(1)* %ptr, double %data)
+declare double @llvm.amdgcn.flat.atomic.fadd.f64.p0f64.f64(double* %ptr, double %data)
+declare double @llvm.amdgcn.flat.atomic.fmin.f64.p0f64.f64(double* %ptr, double %data)
+declare double @llvm.amdgcn.flat.atomic.fmax.f64.p0f64.f64(double* %ptr, double %data)
+declare double @llvm.amdgcn.ds.fadd.f64(double addrspace(3)* nocapture, double, i32, i32, i1)
+
+define amdgpu_kernel void @buffer_atomic_add_noret_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) {
+; GFX90A-LABEL: buffer_atomic_add_noret_f64:
+; GFX90A:       ; %bb.0: ; %main_body
+; GFX90A-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX90A-NEXT:    s_load_dword s8, s[0:1], 0x3c
+; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NEXT:    v_mov_b32_e32 v2, s8
+; GFX90A-NEXT:    buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 offen glc
+; GFX90A-NEXT:    s_endpgm
+main_body:
+  %ret = call double @llvm.amdgcn.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
+  ret void
+}
+
+define amdgpu_ps void @buffer_atomic_add_rtn_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) {
+; GFX90A-LABEL: buffer_atomic_add_rtn_f64:
+; GFX90A:       ; %bb.0: ; %main_body
+; GFX90A-NEXT:    buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 offen glc
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    flat_store_dwordx2 v[0:1], v[0:1]
+; GFX90A-NEXT:    s_endpgm
+main_body:
+  %ret = call double @llvm.amdgcn.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
+  store double %ret, double* undef
+  ret void
+}
+
+define amdgpu_kernel void @buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> inreg %rsrc, double %data, i32 %vindex, double addrspace(1)* %out) {
+; GFX90A-LABEL: buffer_atomic_add_rtn_f64_off4_slc:
+; GFX90A:       ; %bb.0: ; %main_body
+; GFX90A-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX90A-NEXT:    s_load_dword s10, s[0:1], 0x3c
+; GFX90A-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x44
+; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NEXT:    v_mov_b32_e32 v2, s10
+; GFX90A-NEXT:    buffer_atomic_add_f64 v[0:1], v2, s[4:7], 4 offen glc slc
+; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT:    s_endpgm
+main_body:
+  %ret = call double @llvm.amdgcn.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i1 1)
+  store double %ret, double addrspace(1)* %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @raw_buffer_atomic_add_noret_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) {
+; GFX90A-LABEL: raw_buffer_atomic_add_noret_f64:
+; GFX90A:       ; %bb.0: ; %main_body
+; GFX90A-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX90A-NEXT:    s_load_dword s8, s[0:1], 0x3c
+; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NEXT:    v_mov_b32_e32 v2, s8
+; GFX90A-NEXT:    buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 offen glc
+; GFX90A-NEXT:    s_endpgm
+main_body:
+  %ret = call double @llvm.amdgcn.raw.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0)
+  ret void
+}
+
+define amdgpu_ps void @raw_buffer_atomic_add_rtn_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) {
+; GFX90A-LABEL: raw_buffer_atomic_add_rtn_f64:
+; GFX90A:       ; %bb.0: ; %main_body
+; GFX90A-NEXT:    buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 offen glc
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    flat_store_dwordx2 v[0:1], v[0:1]
+; GFX90A-NEXT:    s_endpgm
+main_body:
+  %ret = call double @llvm.amdgcn.raw.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0)
+  store double %ret, double* undef
+  ret void
+}
+
+define amdgpu_kernel void @raw_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> inreg %rsrc, double %data, i32 %vindex, double addrspace(1)* %out) {
+; GFX90A-LABEL: raw_buffer_atomic_add_rtn_f64_off4_slc:
+; GFX90A:       ; %bb.0: ; %main_body
+; GFX90A-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX90A-NEXT:    s_load_dword s10, s[0:1], 0x3c
+; GFX90A-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x44
+; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NEXT:    v_mov_b32_e32 v2, s10
+; GFX90A-NEXT:    buffer_atomic_add_f64 v[0:1], v2, s[4:7], 4 offen glc slc
+; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT:    s_endpgm
+main_body:
+  %ret = call double @llvm.amdgcn.raw.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 2)
+  store double %ret, double addrspace(1)* %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @struct_buffer_atomic_add_noret_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) {
+; GFX90A-LABEL: struct_buffer_atomic_add_noret_f64:
+; GFX90A:       ; %bb.0: ; %main_body
+; GFX90A-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX90A-NEXT:    s_load_dword s8, s[0:1], 0x3c
+; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NEXT:    v_mov_b32_e32 v2, s8
+; GFX90A-NEXT:    buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 idxen glc
+; GFX90A-NEXT:    s_endpgm
+main_body:
+  %ret = call double @llvm.amdgcn.struct.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0)
+  ret void
+}
+
+define amdgpu_ps void @struct_buffer_atomic_add_rtn_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) {
+; GFX90A-LABEL: struct_buffer_atomic_add_rtn_f64:
+; GFX90A:       ; %bb.0: ; %main_body
+; GFX90A-NEXT:    buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen glc
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    flat_store_dwordx2 v[0:1], v[0:1]
+; GFX90A-NEXT:    s_endpgm
+main_body:
+  %ret = call double @llvm.amdgcn.struct.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0)
+  store double %ret, double* undef
+  ret void
+}
+
+define amdgpu_kernel void @struct_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> inreg %rsrc, double %data, i32 %vindex, double addrspace(1)* %out) {
+; GFX90A-LABEL: struct_buffer_atomic_add_rtn_f64_off4_slc:
+; GFX90A:       ; %bb.0: ; %main_body
+; GFX90A-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX90A-NEXT:    s_load_dword s10, s[0:1], 0x3c
+; GFX90A-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x44
+; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NEXT:    v_mov_b32_e32 v2, s10
+; GFX90A-NEXT:    buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 idxen offset:4 glc slc
+; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT:    s_endpgm
+main_body:
+  %ret = call double @llvm.amdgcn.struct.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 0, i32 2)
+  store double %ret, double addrspace(1)* %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @raw_buffer_atomic_min_noret_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) {
+; GFX90A-LABEL: raw_buffer_atomic_min_noret_f64:
+; GFX90A:       ; %bb.0: ; %main_body
+; GFX90A-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX90A-NEXT:    s_load_dword s8, s[0:1], 0x3c
+; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NEXT:    v_mov_b32_e32 v2, s8
+; GFX90A-NEXT:    buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 offen glc
+; GFX90A-NEXT:    s_endpgm
+main_body:
+  %ret = call double @llvm.amdgcn.raw.buffer.atomic.fmin.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0)
+  ret void
+}
+
+define amdgpu_ps void @raw_buffer_atomic_min_rtn_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) {
+; GFX90A-LABEL: raw_buffer_atomic_min_rtn_f64:
+; GFX90A:       ; %bb.0: ; %main_body
+; GFX90A-NEXT:    buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 offen glc
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    flat_store_dwordx2 v[0:1], v[0:1]
+; GFX90A-NEXT:    s_endpgm
+main_body:
+  %ret = call double @llvm.amdgcn.raw.buffer.atomic.fmin.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0)
+  store double %ret, double* undef
+  ret void
+}
+
+define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> inreg %rsrc, double %data, i32 %vindex, double addrspace(1)* %out) {
+; GFX90A-LABEL: raw_buffer_atomic_min_rtn_f64_off4_slc:
+; GFX90A:       ; %bb.0: ; %main_body
+; GFX90A-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX90A-NEXT:    s_load_dword s10, s[0:1], 0x3c
+; GFX90A-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x44
+; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NEXT:    v_mov_b32_e32 v2, s10
+; GFX90A-NEXT:    buffer_atomic_min_f64 v[0:1], v2, s[4:7], 4 offen glc slc
+; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT:    s_endpgm
+main_body:
+  %ret = call double @llvm.amdgcn.raw.buffer.atomic.fmin.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 2)
+  store double %ret, double addrspace(1)* %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @struct_buffer_atomic_min_noret_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) {
+; GFX90A-LABEL: struct_buffer_atomic_min_noret_f64:
+; GFX90A:       ; %bb.0: ; %main_body
+; GFX90A-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX90A-NEXT:    s_load_dword s8, s[0:1], 0x3c
+; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NEXT:    v_mov_b32_e32 v2, s8
+; GFX90A-NEXT:    buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 idxen glc
+; GFX90A-NEXT:    s_endpgm
+main_body:
+  %ret = call double @llvm.amdgcn.struct.buffer.atomic.fmin.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0)
+  ret void
+}
+
+define amdgpu_ps void @struct_buffer_atomic_min_rtn_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) {
+; GFX90A-LABEL: struct_buffer_atomic_min_rtn_f64:
+; GFX90A:       ; %bb.0: ; %main_body
+; GFX90A-NEXT:    buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 idxen glc
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    flat_store_dwordx2 v[0:1], v[0:1]
+; GFX90A-NEXT:    s_endpgm
+main_body:
+  %ret = call double @llvm.amdgcn.struct.buffer.atomic.fmin.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0)
+  store double %ret, double* undef
+  ret void
+}
+
+define amdgpu_kernel void @struct_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> inreg %rsrc, double %data, i32 %vindex, double addrspace(1)* %out) {
+; GFX90A-LABEL: struct_buffer_atomic_min_rtn_f64_off4_slc:
+; GFX90A:       ; %bb.0: ; %main_body
+; GFX90A-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX90A-NEXT:    s_load_dword s10, s[0:1], 0x3c
+; GFX90A-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x44
+; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NEXT:    v_mov_b32_e32 v2, s10
+; GFX90A-NEXT:    buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 idxen offset:4 glc slc
+; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT:    s_endpgm
+main_body:
+  %ret = call double @llvm.amdgcn.struct.buffer.atomic.fmin.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 0, i32 2)
+  store double %ret, double addrspace(1)* %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @raw_buffer_atomic_max_noret_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) {
+; GFX90A-LABEL: raw_buffer_atomic_max_noret_f64:
+; GFX90A:       ; %bb.0: ; %main_body
+; GFX90A-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX90A-NEXT:    s_load_dword s8, s[0:1], 0x3c
+; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NEXT:    v_mov_b32_e32 v2, s8
+; GFX90A-NEXT:    buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 offen glc
+; GFX90A-NEXT:    s_endpgm
+main_body:
+  %ret = call double @llvm.amdgcn.raw.buffer.atomic.fmax.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0)
+  ret void
+}
+
+define amdgpu_ps void @raw_buffer_atomic_max_rtn_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) {
+; GFX90A-LABEL: raw_buffer_atomic_max_rtn_f64:
+; GFX90A:       ; %bb.0: ; %main_body
+; GFX90A-NEXT:    buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 offen glc
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    flat_store_dwordx2 v[0:1], v[0:1]
+; GFX90A-NEXT:    s_endpgm
+main_body:
+  %ret = call double @llvm.amdgcn.raw.buffer.atomic.fmax.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0)
+  store double %ret, double* undef
+  ret void
+}
+
+define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> inreg %rsrc, double %data, i32 %vindex, double addrspace(1)* %out) {
+; GFX90A-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc:
+; GFX90A:       ; %bb.0: ; %main_body
+; GFX90A-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX90A-NEXT:    s_load_dword s10, s[0:1], 0x3c
+; GFX90A-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x44
+; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NEXT:    v_mov_b32_e32 v2, s10
+; GFX90A-NEXT:    buffer_atomic_max_f64 v[0:1], v2, s[4:7], 4 offen glc slc
+; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT:    s_endpgm
+main_body:
+  %ret = call double @llvm.amdgcn.raw.buffer.atomic.fmax.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 2)
+  store double %ret, double addrspace(1)* %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @struct_buffer_atomic_max_noret_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) {
+; GFX90A-LABEL: struct_buffer_atomic_max_noret_f64:
+; GFX90A:       ; %bb.0: ; %main_body
+; GFX90A-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX90A-NEXT:    s_load_dword s8, s[0:1], 0x3c
+; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NEXT:    v_mov_b32_e32 v2, s8
+; GFX90A-NEXT:    buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 idxen glc
+; GFX90A-NEXT:    s_endpgm
+main_body:
+  %ret = call double @llvm.amdgcn.struct.buffer.atomic.fmax.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0)
+  ret void
+}
+
+define amdgpu_ps void @struct_buffer_atomic_max_rtn_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) {
+; GFX90A-LABEL: struct_buffer_atomic_max_rtn_f64:
+; GFX90A:       ; %bb.0: ; %main_body
+; GFX90A-NEXT:    buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen glc
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    flat_store_dwordx2 v[0:1], v[0:1]
+; GFX90A-NEXT:    s_endpgm
+main_body:
+  %ret = call double @llvm.amdgcn.struct.buffer.atomic.fmax.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0)
+  store double %ret, double* undef
+  ret void
+}
+
+define amdgpu_kernel void @struct_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> inreg %rsrc, double %data, i32 %vindex, double addrspace(1)* %out) {
+; GFX90A-LABEL: struct_buffer_atomic_max_rtn_f64_off4_slc:
+; GFX90A:       ; %bb.0: ; %main_body
+; GFX90A-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX90A-NEXT:    s_load_dword s10, s[0:1], 0x3c
+; GFX90A-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x44
+; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NEXT:    v_mov_b32_e32 v2, s10
+; GFX90A-NEXT:    buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 idxen offset:4 glc slc
+; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT:    s_endpgm
+main_body:
+  %ret = call double @llvm.amdgcn.struct.buffer.atomic.fmax.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 0, i32 2)
+  store double %ret, double addrspace(1)* %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @global_atomic_fadd_f64_noret(double addrspace(1)* %ptr, double %data) {
+; GFX90A-LABEL: global_atomic_fadd_f64_noret:
+; GFX90A:       ; %bb.0: ; %main_body
+; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NEXT:    global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off glc
+; GFX90A-NEXT:    s_endpgm
+main_body:
+  %ret = call double @llvm.amdgcn.global.atomic.fadd.f64.p1f64.f64(double addrspace(1)* %ptr, double %data)
+  ret void
+}
+
+define amdgpu_kernel void @global_atomic_fmin_f64_noret(double addrspace(1)* %ptr, double %data) {
+; GFX90A-LABEL: global_atomic_fmin_f64_noret:
+; GFX90A:       ; %bb.0: ; %main_body
+; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NEXT:    global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off glc
+; GFX90A-NEXT:    s_endpgm
+main_body:
+  %ret = call double @llvm.amdgcn.global.atomic.fmin.f64.p1f64.f64(double addrspace(1)* %ptr, double %data)
+  ret void
+}
+
+define amdgpu_kernel void @global_atomic_fmax_f64_noret(double addrspace(1)* %ptr, double %data) {
+; GFX90A-LABEL: global_atomic_fmax_f64_noret:
+; GFX90A:       ; %bb.0: ; %main_body
+; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NEXT:    global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off glc
+; GFX90A-NEXT:    s_endpgm
+main_body:
+  %ret = call double @llvm.amdgcn.global.atomic.fmax.f64.p1f64.f64(double addrspace(1)* %ptr, double %data)
+  ret void
+}
+
+define double @global_atomic_fadd_f64_rtn(double addrspace(1)* %ptr, double %data) {
+; GFX90A-LABEL: global_atomic_fadd_f64_rtn:
+; GFX90A:       ; %bb.0: ; %main_body
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off glc
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+main_body:
+  %ret = call double @llvm.amdgcn.global.atomic.fadd.f64.p1f64.f64(double addrspace(1)* %ptr, double %data)
+  ret double %ret
+}
+
+define double @global_atomic_fmax_f64_rtn(double addrspace(1)* %ptr, double %data) {
+; GFX90A-LABEL: global_atomic_fmax_f64_rtn:
+; GFX90A:       ; %bb.0: ; %main_body
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off glc
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+main_body:
+  %ret = call double @llvm.amdgcn.global.atomic.fmax.f64.p1f64.f64(double addrspace(1)* %ptr, double %data)
+  ret double %ret
+}
+
+define double @global_atomic_fmin_f64_rtn(double addrspace(1)* %ptr, double %data) {
+; GFX90A-LABEL: global_atomic_fmin_f64_rtn:
+; GFX90A:       ; %bb.0: ; %main_body
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off glc
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+main_body:
+  %ret = call double @llvm.amdgcn.global.atomic.fmin.f64.p1f64.f64(double addrspace(1)* %ptr, double %data)
+  ret double %ret
+}
+
+define amdgpu_kernel void @flat_atomic_fadd_f64_noret(double* %ptr, double %data) {
+; GFX90A-LABEL: flat_atomic_fadd_f64_noret:
+; GFX90A:       ; %bb.0: ; %main_body
+; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NEXT:    flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] glc
+; GFX90A-NEXT:    s_endpgm
+main_body:
+  %ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p0f64.f64(double* %ptr, double %data)
+  ret void
+}
+
+define double @flat_atomic_fadd_f64_rtn(double* %ptr, double %data) {
+; GFX90A-LABEL: flat_atomic_fadd_f64_rtn:
+; GFX90A:       ; %bb.0: ; %main_body
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] glc
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+main_body:
+  %ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p0f64.f64(double* %ptr, double %data)
+  ret double %ret
+}
+
+define amdgpu_kernel void @flat_atomic_fmin_f64_noret(double* %ptr, double %data) {
+; GFX90A-LABEL: flat_atomic_fmin_f64_noret:
+; GFX90A:       ; %bb.0: ; %main_body
+; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NEXT:    flat_atomic_min_f64 v[0:1], v[0:1], v[2:3] glc
+; GFX90A-NEXT:    s_endpgm
+main_body:
+  %ret = call double @llvm.amdgcn.flat.atomic.fmin.f64.p0f64.f64(double* %ptr, double %data)
+  ret void
+}
+
+define double @flat_atomic_fmin_f64_rtn(double* %ptr, double %data) {
+; GFX90A-LABEL: flat_atomic_fmin_f64_rtn:
+; GFX90A:       ; %bb.0: ; %main_body
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    flat_atomic_min_f64 v[0:1], v[0:1], v[2:3] glc
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+main_body:
+  %ret = call double @llvm.amdgcn.flat.atomic.fmin.f64.p0f64.f64(double* %ptr, double %data)
+  ret double %ret
+}
+
+define amdgpu_kernel void @flat_atomic_fmax_f64_noret(double* %ptr, double %data) {
+; GFX90A-LABEL: flat_atomic_fmax_f64_noret:
+; GFX90A:       ; %bb.0: ; %main_body
+; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NEXT:    flat_atomic_max_f64 v[0:1], v[0:1], v[2:3] glc
+; GFX90A-NEXT:    s_endpgm
+main_body:
+  %ret = call double @llvm.amdgcn.flat.atomic.fmax.f64.p0f64.f64(double* %ptr, double %data)
+  ret void
+}
+
+define double @flat_atomic_fmax_f64_rtn(double* %ptr, double %data) {
+; GFX90A-LABEL: flat_atomic_fmax_f64_rtn:
+; GFX90A:       ; %bb.0: ; %main_body
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    flat_atomic_max_f64 v[0:1], v[0:1], v[2:3] glc
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+main_body:
+  %ret = call double @llvm.amdgcn.flat.atomic.fmax.f64.p0f64.f64(double* %ptr, double %data)
+  ret double %ret
+}
+
+define amdgpu_kernel void @local_atomic_fadd_f64_noret(double addrspace(3)* %ptr, double %data) {
+; GFX90A-LABEL: local_atomic_fadd_f64_noret:
+; GFX90A:       ; %bb.0: ; %main_body
+; GFX90A-NEXT:    s_load_dword s4, s[0:1], 0x24
+; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
+; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NEXT:    ds_add_rtn_f64 v[0:1], v2, v[0:1]
+; GFX90A-NEXT:    s_endpgm
+main_body:
+  %ret = call double @llvm.amdgcn.ds.fadd.f64(double addrspace(3)* %ptr, double %data, i32 0, i32 0, i1 0)
+  ret void
+}
+
+define double @local_atomic_fadd_f64_rtn(double addrspace(3)* %ptr, double %data) {
+; GFX90A-LABEL: local_atomic_fadd_f64_rtn:
+; GFX90A:       ; %bb.0: ; %main_body
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    v_mov_b32_e32 v4, v1
+; GFX90A-NEXT:    v_mov_b32_e32 v5, v2
+; GFX90A-NEXT:    ds_add_rtn_f64 v[0:1], v0, v[4:5]
+; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+main_body:
+  %ret = call double @llvm.amdgcn.ds.fadd.f64(double addrspace(3)* %ptr, double %data, i32 0, i32 0, i1 0)
+  ret double %ret
+}
+
+define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat(double addrspace(3)* %ptr) {
+; GFX90A-LABEL: local_atomic_fadd_f64_noret_pat:
+; GFX90A:       ; %bb.0: ; %main_body
+; GFX90A-NEXT:    s_load_dword s0, s[0:1], 0x24
+; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v1, 0x40100000
+; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:    v_mov_b32_e32 v2, s0
+; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:    ds_add_rtn_f64 v[0:1], v2, v[0:1]
+; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:    s_endpgm
+main_body:
+  %ret = atomicrmw fadd double addrspace(3)* %ptr, double 4.0 seq_cst
+  ret void
+}
+
+define double @local_atomic_fadd_f64_rtn_pat(double addrspace(3)* %ptr, double %data) {
+; GFX90A-LABEL: local_atomic_fadd_f64_rtn_pat:
+; GFX90A:       ; %bb.0: ; %main_body
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v3, 0x40100000
+; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:    ds_add_rtn_f64 v[0:1], v0, v[2:3]
+; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+main_body:
+  %ret = atomicrmw fadd double addrspace(3)* %ptr, double 4.0 seq_cst
+  ret double %ret
+}

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgpu-atomic-cmpxchg-flat.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgpu-atomic-cmpxchg-flat.mir
index e3c82ba6826a..c8a8116bb2d7 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgpu-atomic-cmpxchg-flat.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgpu-atomic-cmpxchg-flat.mir
@@ -19,7 +19,7 @@ body:             |
     ; GFX7: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX7: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr3
     ; GFX7: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1
-    ; GFX7: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[COPY]], [[REG_SEQUENCE]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4)
+    ; GFX7: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[COPY]], [[REG_SEQUENCE]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4)
     ; GFX7: $vgpr0 = COPY [[FLAT_ATOMIC_CMPSWAP_RTN]]
     ; GFX9-LABEL: name: amdgpu_atomic_cmpxchg_s32_flat
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2, $vgpr3
@@ -27,7 +27,7 @@ body:             |
     ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX9: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr3
     ; GFX9: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1
-    ; GFX9: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[COPY]], [[REG_SEQUENCE]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4)
+    ; GFX9: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[COPY]], [[REG_SEQUENCE]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4)
     ; GFX9: $vgpr0 = COPY [[FLAT_ATOMIC_CMPSWAP_RTN]]
     ; GFX10-LABEL: name: amdgpu_atomic_cmpxchg_s32_flat
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2, $vgpr3
@@ -35,7 +35,7 @@ body:             |
     ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX10: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr3
     ; GFX10: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1
-    ; GFX10: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[COPY]], [[REG_SEQUENCE]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4)
+    ; GFX10: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[COPY]], [[REG_SEQUENCE]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4)
     ; GFX10: $vgpr0 = COPY [[FLAT_ATOMIC_CMPSWAP_RTN]]
     %0:vgpr(p0) = COPY $vgpr0_vgpr1
     %1:vgpr(s32) = COPY $vgpr2
@@ -71,7 +71,7 @@ body:             |
     ; GFX7: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY3]], [[COPY4]], 0, implicit $exec
     ; GFX7: %12:vgpr_32, dead %14:sreg_64_xexec = V_ADDC_U32_e64 [[COPY5]], [[COPY6]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX7: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %12, %subreg.sub1
-    ; GFX7: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4)
+    ; GFX7: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4)
     ; GFX7: $vgpr0 = COPY [[FLAT_ATOMIC_CMPSWAP_RTN]]
     ; GFX9-LABEL: name: amdgpu_atomic_cmpxchg_s32_flat_gep4
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2, $vgpr3
@@ -79,7 +79,7 @@ body:             |
     ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX9: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr3
     ; GFX9: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1
-    ; GFX9: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[COPY]], [[REG_SEQUENCE]], 4, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4)
+    ; GFX9: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[COPY]], [[REG_SEQUENCE]], 4, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4)
     ; GFX9: $vgpr0 = COPY [[FLAT_ATOMIC_CMPSWAP_RTN]]
     ; GFX10-LABEL: name: amdgpu_atomic_cmpxchg_s32_flat_gep4
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2, $vgpr3
@@ -97,7 +97,7 @@ body:             |
     ; GFX10: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY3]], [[COPY4]], 0, implicit $exec
     ; GFX10: %12:vgpr_32, dead %14:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY5]], [[COPY6]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX10: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %12, %subreg.sub1
-    ; GFX10: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4)
+    ; GFX10: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4)
     ; GFX10: $vgpr0 = COPY [[FLAT_ATOMIC_CMPSWAP_RTN]]
     %0:vgpr(p0) = COPY $vgpr0_vgpr1
     %1:vgpr(s32) = COPY $vgpr2
@@ -125,7 +125,7 @@ body:             |
     ; GFX7: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
     ; GFX7: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr4_vgpr5
     ; GFX7: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0_sub1, [[COPY2]], %subreg.sub2_sub3
-    ; GFX7: [[FLAT_ATOMIC_CMPSWAP_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_CMPSWAP_X2_RTN [[COPY]], [[REG_SEQUENCE]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 8)
+    ; GFX7: [[FLAT_ATOMIC_CMPSWAP_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_CMPSWAP_X2_RTN [[COPY]], [[REG_SEQUENCE]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 8)
     ; GFX7: $vgpr0_vgpr1 = COPY [[FLAT_ATOMIC_CMPSWAP_X2_RTN]]
     ; GFX9-LABEL: name: amdgpu_atomic_cmpxchg_s64_flat
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $vgpr4_vgpr5
@@ -133,7 +133,7 @@ body:             |
     ; GFX9: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
     ; GFX9: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr4_vgpr5
     ; GFX9: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0_sub1, [[COPY2]], %subreg.sub2_sub3
-    ; GFX9: [[FLAT_ATOMIC_CMPSWAP_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_CMPSWAP_X2_RTN [[COPY]], [[REG_SEQUENCE]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 8)
+    ; GFX9: [[FLAT_ATOMIC_CMPSWAP_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_CMPSWAP_X2_RTN [[COPY]], [[REG_SEQUENCE]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 8)
     ; GFX9: $vgpr0_vgpr1 = COPY [[FLAT_ATOMIC_CMPSWAP_X2_RTN]]
     ; GFX10-LABEL: name: amdgpu_atomic_cmpxchg_s64_flat
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $vgpr4_vgpr5
@@ -141,7 +141,7 @@ body:             |
     ; GFX10: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
     ; GFX10: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr4_vgpr5
     ; GFX10: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0_sub1, [[COPY2]], %subreg.sub2_sub3
-    ; GFX10: [[FLAT_ATOMIC_CMPSWAP_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_CMPSWAP_X2_RTN [[COPY]], [[REG_SEQUENCE]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 8)
+    ; GFX10: [[FLAT_ATOMIC_CMPSWAP_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_CMPSWAP_X2_RTN [[COPY]], [[REG_SEQUENCE]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 8)
     ; GFX10: $vgpr0_vgpr1 = COPY [[FLAT_ATOMIC_CMPSWAP_X2_RTN]]
     %0:vgpr(p0) = COPY $vgpr0_vgpr1
     %1:vgpr(s64) = COPY $vgpr2_vgpr3
@@ -177,7 +177,7 @@ body:             |
     ; GFX7: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY3]], [[COPY4]], 0, implicit $exec
     ; GFX7: %12:vgpr_32, dead %14:sreg_64_xexec = V_ADDC_U32_e64 [[COPY5]], [[COPY6]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX7: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %12, %subreg.sub1
-    ; GFX7: [[FLAT_ATOMIC_CMPSWAP_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_CMPSWAP_X2_RTN [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 8)
+    ; GFX7: [[FLAT_ATOMIC_CMPSWAP_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_CMPSWAP_X2_RTN [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 8)
     ; GFX7: $vgpr0_vgpr1 = COPY [[FLAT_ATOMIC_CMPSWAP_X2_RTN]]
     ; GFX9-LABEL: name: amdgpu_atomic_cmpxchg_s64_flat_gep4
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $vgpr4_vgpr5
@@ -185,7 +185,7 @@ body:             |
     ; GFX9: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
     ; GFX9: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr4_vgpr5
     ; GFX9: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0_sub1, [[COPY2]], %subreg.sub2_sub3
-    ; GFX9: [[FLAT_ATOMIC_CMPSWAP_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_CMPSWAP_X2_RTN [[COPY]], [[REG_SEQUENCE]], 4, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 8)
+    ; GFX9: [[FLAT_ATOMIC_CMPSWAP_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_CMPSWAP_X2_RTN [[COPY]], [[REG_SEQUENCE]], 4, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 8)
     ; GFX9: $vgpr0_vgpr1 = COPY [[FLAT_ATOMIC_CMPSWAP_X2_RTN]]
     ; GFX10-LABEL: name: amdgpu_atomic_cmpxchg_s64_flat_gep4
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $vgpr4_vgpr5
@@ -203,7 +203,7 @@ body:             |
     ; GFX10: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY3]], [[COPY4]], 0, implicit $exec
     ; GFX10: %12:vgpr_32, dead %14:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY5]], [[COPY6]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX10: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %12, %subreg.sub1
-    ; GFX10: [[FLAT_ATOMIC_CMPSWAP_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_CMPSWAP_X2_RTN [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 8)
+    ; GFX10: [[FLAT_ATOMIC_CMPSWAP_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_CMPSWAP_X2_RTN [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 8)
     ; GFX10: $vgpr0_vgpr1 = COPY [[FLAT_ATOMIC_CMPSWAP_X2_RTN]]
     %0:vgpr(p0) = COPY $vgpr0_vgpr1
     %1:vgpr(s64) = COPY $vgpr2_vgpr3
@@ -241,7 +241,7 @@ body:             |
     ; GFX7: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY3]], [[COPY4]], 0, implicit $exec
     ; GFX7: %12:vgpr_32, dead %14:sreg_64_xexec = V_ADDC_U32_e64 [[COPY5]], [[COPY6]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX7: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %12, %subreg.sub1
-    ; GFX7: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4)
+    ; GFX7: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4)
     ; GFX7: $vgpr0 = COPY [[FLAT_ATOMIC_CMPSWAP_RTN]]
     ; GFX9-LABEL: name: amdgpu_atomic_cmpxchg_s32_flat_gepm4
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2, $vgpr3
@@ -259,7 +259,7 @@ body:             |
     ; GFX9: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY3]], [[COPY4]], 0, implicit $exec
     ; GFX9: %12:vgpr_32, dead %14:sreg_64_xexec = V_ADDC_U32_e64 [[COPY5]], [[COPY6]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX9: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %12, %subreg.sub1
-    ; GFX9: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4)
+    ; GFX9: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4)
     ; GFX9: $vgpr0 = COPY [[FLAT_ATOMIC_CMPSWAP_RTN]]
     ; GFX10-LABEL: name: amdgpu_atomic_cmpxchg_s32_flat_gepm4
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2, $vgpr3
@@ -277,7 +277,7 @@ body:             |
     ; GFX10: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY3]], [[COPY4]], 0, implicit $exec
     ; GFX10: %12:vgpr_32, dead %14:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY5]], [[COPY6]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX10: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %12, %subreg.sub1
-    ; GFX10: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4)
+    ; GFX10: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4)
     ; GFX10: $vgpr0 = COPY [[FLAT_ATOMIC_CMPSWAP_RTN]]
     %0:vgpr(p0) = COPY $vgpr0_vgpr1
     %1:vgpr(s32) = COPY $vgpr2
@@ -305,21 +305,21 @@ body:             |
     ; GFX7: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX7: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr3
     ; GFX7: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1
-    ; GFX7: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[COPY]], [[REG_SEQUENCE]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4)
+    ; GFX7: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[COPY]], [[REG_SEQUENCE]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4)
     ; GFX9-LABEL: name: amdgpu_atomic_cmpxchg_s32_flat_nortn
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2, $vgpr3
     ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX9: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr3
     ; GFX9: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1
-    ; GFX9: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[COPY]], [[REG_SEQUENCE]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4)
+    ; GFX9: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[COPY]], [[REG_SEQUENCE]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4)
     ; GFX10-LABEL: name: amdgpu_atomic_cmpxchg_s32_flat_nortn
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2, $vgpr3
     ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX10: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr3
     ; GFX10: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1
-    ; GFX10: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[COPY]], [[REG_SEQUENCE]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4)
+    ; GFX10: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[COPY]], [[REG_SEQUENCE]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4)
     %0:vgpr(p0) = COPY $vgpr0_vgpr1
     %1:vgpr(s32) = COPY $vgpr2
     %2:vgpr(s32) = COPY $vgpr3
@@ -343,21 +343,21 @@ body:             |
     ; GFX7: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
     ; GFX7: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr4_vgpr5
     ; GFX7: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0_sub1, [[COPY2]], %subreg.sub2_sub3
-    ; GFX7: [[FLAT_ATOMIC_CMPSWAP_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_CMPSWAP_X2_RTN [[COPY]], [[REG_SEQUENCE]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 8)
+    ; GFX7: [[FLAT_ATOMIC_CMPSWAP_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_CMPSWAP_X2_RTN [[COPY]], [[REG_SEQUENCE]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 8)
     ; GFX9-LABEL: name: amdgpu_atomic_cmpxchg_s64_flat_nortn
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $vgpr4_vgpr5
     ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
     ; GFX9: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr4_vgpr5
     ; GFX9: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0_sub1, [[COPY2]], %subreg.sub2_sub3
-    ; GFX9: [[FLAT_ATOMIC_CMPSWAP_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_CMPSWAP_X2_RTN [[COPY]], [[REG_SEQUENCE]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 8)
+    ; GFX9: [[FLAT_ATOMIC_CMPSWAP_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_CMPSWAP_X2_RTN [[COPY]], [[REG_SEQUENCE]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 8)
     ; GFX10-LABEL: name: amdgpu_atomic_cmpxchg_s64_flat_nortn
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $vgpr4_vgpr5
     ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
     ; GFX10: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr4_vgpr5
     ; GFX10: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0_sub1, [[COPY2]], %subreg.sub2_sub3
-    ; GFX10: [[FLAT_ATOMIC_CMPSWAP_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_CMPSWAP_X2_RTN [[COPY]], [[REG_SEQUENCE]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 8)
+    ; GFX10: [[FLAT_ATOMIC_CMPSWAP_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_CMPSWAP_X2_RTN [[COPY]], [[REG_SEQUENCE]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 8)
     %0:vgpr(p0) = COPY $vgpr0_vgpr1
     %1:vgpr(s64) = COPY $vgpr2_vgpr3
     %2:vgpr(s64) = COPY $vgpr4_vgpr5

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgpu-atomic-cmpxchg-global.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgpu-atomic-cmpxchg-global.mir
index 9a3b922b79a5..0b7f99980557 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgpu-atomic-cmpxchg-global.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgpu-atomic-cmpxchg-global.mir
@@ -49,7 +49,7 @@ body:             |
     ; GFX7-FLAT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX7-FLAT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr3
     ; GFX7-FLAT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1
-    ; GFX7-FLAT: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[COPY]], [[REG_SEQUENCE]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4, addrspace 1)
+    ; GFX7-FLAT: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[COPY]], [[REG_SEQUENCE]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4, addrspace 1)
     ; GFX7-FLAT: $vgpr0 = COPY [[FLAT_ATOMIC_CMPSWAP_RTN]]
     ; GFX8-LABEL: name: amdgpu_atomic_cmpxchg_s32_global
     ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2, $vgpr3
@@ -57,7 +57,7 @@ body:             |
     ; GFX8: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX8: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr3
     ; GFX8: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1
-    ; GFX8: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[COPY]], [[REG_SEQUENCE]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4, addrspace 1)
+    ; GFX8: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[COPY]], [[REG_SEQUENCE]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4, addrspace 1)
     ; GFX8: $vgpr0 = COPY [[FLAT_ATOMIC_CMPSWAP_RTN]]
     ; GFX9-LABEL: name: amdgpu_atomic_cmpxchg_s32_global
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2, $vgpr3
@@ -65,7 +65,7 @@ body:             |
     ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX9: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr3
     ; GFX9: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1
-    ; GFX9: [[GLOBAL_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_CMPSWAP_RTN [[COPY]], [[REG_SEQUENCE]], 0, 1, 0, implicit $exec :: (load store seq_cst 4, addrspace 1)
+    ; GFX9: [[GLOBAL_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_CMPSWAP_RTN [[COPY]], [[REG_SEQUENCE]], 0, 1, 0, 0, implicit $exec :: (load store seq_cst 4, addrspace 1)
     ; GFX9: $vgpr0 = COPY [[GLOBAL_ATOMIC_CMPSWAP_RTN]]
     ; GFX10-LABEL: name: amdgpu_atomic_cmpxchg_s32_global
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2, $vgpr3
@@ -73,7 +73,7 @@ body:             |
     ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX10: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr3
     ; GFX10: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1
-    ; GFX10: [[GLOBAL_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_CMPSWAP_RTN [[COPY]], [[REG_SEQUENCE]], 0, 1, 0, implicit $exec :: (load store seq_cst 4, addrspace 1)
+    ; GFX10: [[GLOBAL_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_CMPSWAP_RTN [[COPY]], [[REG_SEQUENCE]], 0, 1, 0, 0, implicit $exec :: (load store seq_cst 4, addrspace 1)
     ; GFX10: $vgpr0 = COPY [[GLOBAL_ATOMIC_CMPSWAP_RTN]]
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s32) = COPY $vgpr2
@@ -137,7 +137,7 @@ body:             |
     ; GFX7-FLAT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY3]], [[COPY4]], 0, implicit $exec
     ; GFX7-FLAT: %12:vgpr_32, dead %14:sreg_64_xexec = V_ADDC_U32_e64 [[COPY5]], [[COPY6]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX7-FLAT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %12, %subreg.sub1
-    ; GFX7-FLAT: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4, addrspace 1)
+    ; GFX7-FLAT: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4, addrspace 1)
     ; GFX7-FLAT: $vgpr0 = COPY [[FLAT_ATOMIC_CMPSWAP_RTN]]
     ; GFX8-LABEL: name: amdgpu_atomic_cmpxchg_s32_global_gep4
     ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2, $vgpr3
@@ -155,7 +155,7 @@ body:             |
     ; GFX8: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY3]], [[COPY4]], 0, implicit $exec
     ; GFX8: %12:vgpr_32, dead %14:sreg_64_xexec = V_ADDC_U32_e64 [[COPY5]], [[COPY6]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX8: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %12, %subreg.sub1
-    ; GFX8: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4, addrspace 1)
+    ; GFX8: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4, addrspace 1)
     ; GFX8: $vgpr0 = COPY [[FLAT_ATOMIC_CMPSWAP_RTN]]
     ; GFX9-LABEL: name: amdgpu_atomic_cmpxchg_s32_global_gep4
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2, $vgpr3
@@ -163,7 +163,7 @@ body:             |
     ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX9: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr3
     ; GFX9: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1
-    ; GFX9: [[GLOBAL_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_CMPSWAP_RTN [[COPY]], [[REG_SEQUENCE]], 4, 1, 0, implicit $exec :: (load store seq_cst 4, addrspace 1)
+    ; GFX9: [[GLOBAL_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_CMPSWAP_RTN [[COPY]], [[REG_SEQUENCE]], 4, 1, 0, 0, implicit $exec :: (load store seq_cst 4, addrspace 1)
     ; GFX9: $vgpr0 = COPY [[GLOBAL_ATOMIC_CMPSWAP_RTN]]
     ; GFX10-LABEL: name: amdgpu_atomic_cmpxchg_s32_global_gep4
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2, $vgpr3
@@ -171,7 +171,7 @@ body:             |
     ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX10: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr3
     ; GFX10: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1
-    ; GFX10: [[GLOBAL_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_CMPSWAP_RTN [[COPY]], [[REG_SEQUENCE]], 4, 1, 0, implicit $exec :: (load store seq_cst 4, addrspace 1)
+    ; GFX10: [[GLOBAL_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_CMPSWAP_RTN [[COPY]], [[REG_SEQUENCE]], 4, 1, 0, 0, implicit $exec :: (load store seq_cst 4, addrspace 1)
     ; GFX10: $vgpr0 = COPY [[GLOBAL_ATOMIC_CMPSWAP_RTN]]
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s32) = COPY $vgpr2
@@ -227,7 +227,7 @@ body:             |
     ; GFX7-FLAT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
     ; GFX7-FLAT: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr4_vgpr5
     ; GFX7-FLAT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0_sub1, [[COPY2]], %subreg.sub2_sub3
-    ; GFX7-FLAT: [[FLAT_ATOMIC_CMPSWAP_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_CMPSWAP_X2_RTN [[COPY]], [[REG_SEQUENCE]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 8, addrspace 1)
+    ; GFX7-FLAT: [[FLAT_ATOMIC_CMPSWAP_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_CMPSWAP_X2_RTN [[COPY]], [[REG_SEQUENCE]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 8, addrspace 1)
     ; GFX7-FLAT: $vgpr0_vgpr1 = COPY [[FLAT_ATOMIC_CMPSWAP_X2_RTN]]
     ; GFX8-LABEL: name: amdgpu_atomic_cmpxchg_s64_global
     ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $vgpr4_vgpr5
@@ -235,7 +235,7 @@ body:             |
     ; GFX8: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
     ; GFX8: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr4_vgpr5
     ; GFX8: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0_sub1, [[COPY2]], %subreg.sub2_sub3
-    ; GFX8: [[FLAT_ATOMIC_CMPSWAP_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_CMPSWAP_X2_RTN [[COPY]], [[REG_SEQUENCE]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 8, addrspace 1)
+    ; GFX8: [[FLAT_ATOMIC_CMPSWAP_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_CMPSWAP_X2_RTN [[COPY]], [[REG_SEQUENCE]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 8, addrspace 1)
     ; GFX8: $vgpr0_vgpr1 = COPY [[FLAT_ATOMIC_CMPSWAP_X2_RTN]]
     ; GFX9-LABEL: name: amdgpu_atomic_cmpxchg_s64_global
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $vgpr4_vgpr5
@@ -243,7 +243,7 @@ body:             |
     ; GFX9: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
     ; GFX9: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr4_vgpr5
     ; GFX9: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0_sub1, [[COPY2]], %subreg.sub2_sub3
-    ; GFX9: [[GLOBAL_ATOMIC_CMPSWAP_X2_RTN:%[0-9]+]]:vreg_64 = GLOBAL_ATOMIC_CMPSWAP_X2_RTN [[COPY]], [[REG_SEQUENCE]], 0, 1, 0, implicit $exec :: (load store seq_cst 8, addrspace 1)
+    ; GFX9: [[GLOBAL_ATOMIC_CMPSWAP_X2_RTN:%[0-9]+]]:vreg_64 = GLOBAL_ATOMIC_CMPSWAP_X2_RTN [[COPY]], [[REG_SEQUENCE]], 0, 1, 0, 0, implicit $exec :: (load store seq_cst 8, addrspace 1)
     ; GFX9: $vgpr0_vgpr1 = COPY [[GLOBAL_ATOMIC_CMPSWAP_X2_RTN]]
     ; GFX10-LABEL: name: amdgpu_atomic_cmpxchg_s64_global
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $vgpr4_vgpr5
@@ -251,7 +251,7 @@ body:             |
     ; GFX10: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
     ; GFX10: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr4_vgpr5
     ; GFX10: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0_sub1, [[COPY2]], %subreg.sub2_sub3
-    ; GFX10: [[GLOBAL_ATOMIC_CMPSWAP_X2_RTN:%[0-9]+]]:vreg_64 = GLOBAL_ATOMIC_CMPSWAP_X2_RTN [[COPY]], [[REG_SEQUENCE]], 0, 1, 0, implicit $exec :: (load store seq_cst 8, addrspace 1)
+    ; GFX10: [[GLOBAL_ATOMIC_CMPSWAP_X2_RTN:%[0-9]+]]:vreg_64 = GLOBAL_ATOMIC_CMPSWAP_X2_RTN [[COPY]], [[REG_SEQUENCE]], 0, 1, 0, 0, implicit $exec :: (load store seq_cst 8, addrspace 1)
     ; GFX10: $vgpr0_vgpr1 = COPY [[GLOBAL_ATOMIC_CMPSWAP_X2_RTN]]
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s64) = COPY $vgpr2_vgpr3
@@ -315,7 +315,7 @@ body:             |
     ; GFX7-FLAT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY3]], [[COPY4]], 0, implicit $exec
     ; GFX7-FLAT: %12:vgpr_32, dead %14:sreg_64_xexec = V_ADDC_U32_e64 [[COPY5]], [[COPY6]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX7-FLAT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %12, %subreg.sub1
-    ; GFX7-FLAT: [[FLAT_ATOMIC_CMPSWAP_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_CMPSWAP_X2_RTN [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 8, addrspace 1)
+    ; GFX7-FLAT: [[FLAT_ATOMIC_CMPSWAP_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_CMPSWAP_X2_RTN [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 8, addrspace 1)
     ; GFX7-FLAT: $vgpr0_vgpr1 = COPY [[FLAT_ATOMIC_CMPSWAP_X2_RTN]]
     ; GFX8-LABEL: name: amdgpu_atomic_cmpxchg_s64_global_gep4
     ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $vgpr4_vgpr5
@@ -333,7 +333,7 @@ body:             |
     ; GFX8: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY3]], [[COPY4]], 0, implicit $exec
     ; GFX8: %12:vgpr_32, dead %14:sreg_64_xexec = V_ADDC_U32_e64 [[COPY5]], [[COPY6]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX8: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %12, %subreg.sub1
-    ; GFX8: [[FLAT_ATOMIC_CMPSWAP_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_CMPSWAP_X2_RTN [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 8, addrspace 1)
+    ; GFX8: [[FLAT_ATOMIC_CMPSWAP_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_CMPSWAP_X2_RTN [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 8, addrspace 1)
     ; GFX8: $vgpr0_vgpr1 = COPY [[FLAT_ATOMIC_CMPSWAP_X2_RTN]]
     ; GFX9-LABEL: name: amdgpu_atomic_cmpxchg_s64_global_gep4
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $vgpr4_vgpr5
@@ -341,7 +341,7 @@ body:             |
     ; GFX9: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
     ; GFX9: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr4_vgpr5
     ; GFX9: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0_sub1, [[COPY2]], %subreg.sub2_sub3
-    ; GFX9: [[GLOBAL_ATOMIC_CMPSWAP_X2_RTN:%[0-9]+]]:vreg_64 = GLOBAL_ATOMIC_CMPSWAP_X2_RTN [[COPY]], [[REG_SEQUENCE]], 4, 1, 0, implicit $exec :: (load store seq_cst 8, addrspace 1)
+    ; GFX9: [[GLOBAL_ATOMIC_CMPSWAP_X2_RTN:%[0-9]+]]:vreg_64 = GLOBAL_ATOMIC_CMPSWAP_X2_RTN [[COPY]], [[REG_SEQUENCE]], 4, 1, 0, 0, implicit $exec :: (load store seq_cst 8, addrspace 1)
     ; GFX9: $vgpr0_vgpr1 = COPY [[GLOBAL_ATOMIC_CMPSWAP_X2_RTN]]
     ; GFX10-LABEL: name: amdgpu_atomic_cmpxchg_s64_global_gep4
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $vgpr4_vgpr5
@@ -349,7 +349,7 @@ body:             |
     ; GFX10: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
     ; GFX10: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr4_vgpr5
     ; GFX10: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0_sub1, [[COPY2]], %subreg.sub2_sub3
-    ; GFX10: [[GLOBAL_ATOMIC_CMPSWAP_X2_RTN:%[0-9]+]]:vreg_64 = GLOBAL_ATOMIC_CMPSWAP_X2_RTN [[COPY]], [[REG_SEQUENCE]], 4, 1, 0, implicit $exec :: (load store seq_cst 8, addrspace 1)
+    ; GFX10: [[GLOBAL_ATOMIC_CMPSWAP_X2_RTN:%[0-9]+]]:vreg_64 = GLOBAL_ATOMIC_CMPSWAP_X2_RTN [[COPY]], [[REG_SEQUENCE]], 4, 1, 0, 0, implicit $exec :: (load store seq_cst 8, addrspace 1)
     ; GFX10: $vgpr0_vgpr1 = COPY [[GLOBAL_ATOMIC_CMPSWAP_X2_RTN]]
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s64) = COPY $vgpr2_vgpr3
@@ -435,7 +435,7 @@ body:             |
     ; GFX7-FLAT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY3]], [[COPY4]], 0, implicit $exec
     ; GFX7-FLAT: %12:vgpr_32, dead %14:sreg_64_xexec = V_ADDC_U32_e64 [[COPY5]], [[COPY6]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX7-FLAT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %12, %subreg.sub1
-    ; GFX7-FLAT: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4, addrspace 1)
+    ; GFX7-FLAT: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4, addrspace 1)
     ; GFX7-FLAT: $vgpr0 = COPY [[FLAT_ATOMIC_CMPSWAP_RTN]]
     ; GFX8-LABEL: name: amdgpu_atomic_cmpxchg_s32_global_gepm4
     ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2, $vgpr3
@@ -453,7 +453,7 @@ body:             |
     ; GFX8: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY3]], [[COPY4]], 0, implicit $exec
     ; GFX8: %12:vgpr_32, dead %14:sreg_64_xexec = V_ADDC_U32_e64 [[COPY5]], [[COPY6]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX8: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %12, %subreg.sub1
-    ; GFX8: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4, addrspace 1)
+    ; GFX8: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4, addrspace 1)
     ; GFX8: $vgpr0 = COPY [[FLAT_ATOMIC_CMPSWAP_RTN]]
     ; GFX9-LABEL: name: amdgpu_atomic_cmpxchg_s32_global_gepm4
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2, $vgpr3
@@ -461,7 +461,7 @@ body:             |
     ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX9: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr3
     ; GFX9: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1
-    ; GFX9: [[GLOBAL_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_CMPSWAP_RTN [[COPY]], [[REG_SEQUENCE]], -4, 1, 0, implicit $exec :: (load store seq_cst 4, addrspace 1)
+    ; GFX9: [[GLOBAL_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_CMPSWAP_RTN [[COPY]], [[REG_SEQUENCE]], -4, 1, 0, 0, implicit $exec :: (load store seq_cst 4, addrspace 1)
     ; GFX9: $vgpr0 = COPY [[GLOBAL_ATOMIC_CMPSWAP_RTN]]
     ; GFX10-LABEL: name: amdgpu_atomic_cmpxchg_s32_global_gepm4
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2, $vgpr3
@@ -469,7 +469,7 @@ body:             |
     ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX10: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr3
     ; GFX10: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1
-    ; GFX10: [[GLOBAL_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_CMPSWAP_RTN [[COPY]], [[REG_SEQUENCE]], -4, 1, 0, implicit $exec :: (load store seq_cst 4, addrspace 1)
+    ; GFX10: [[GLOBAL_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_CMPSWAP_RTN [[COPY]], [[REG_SEQUENCE]], -4, 1, 0, 0, implicit $exec :: (load store seq_cst 4, addrspace 1)
     ; GFX10: $vgpr0 = COPY [[GLOBAL_ATOMIC_CMPSWAP_RTN]]
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s32) = COPY $vgpr2
@@ -523,28 +523,28 @@ body:             |
     ; GFX7-FLAT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX7-FLAT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr3
     ; GFX7-FLAT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1
-    ; GFX7-FLAT: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[COPY]], [[REG_SEQUENCE]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4, addrspace 1)
+    ; GFX7-FLAT: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[COPY]], [[REG_SEQUENCE]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4, addrspace 1)
     ; GFX8-LABEL: name: amdgpu_atomic_cmpxchg_s32_global_nortn
     ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2, $vgpr3
     ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX8: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX8: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr3
     ; GFX8: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1
-    ; GFX8: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[COPY]], [[REG_SEQUENCE]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4, addrspace 1)
+    ; GFX8: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[COPY]], [[REG_SEQUENCE]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4, addrspace 1)
     ; GFX9-LABEL: name: amdgpu_atomic_cmpxchg_s32_global_nortn
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2, $vgpr3
     ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX9: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr3
     ; GFX9: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1
-    ; GFX9: [[GLOBAL_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_CMPSWAP_RTN [[COPY]], [[REG_SEQUENCE]], 0, 1, 0, implicit $exec :: (load store seq_cst 4, addrspace 1)
+    ; GFX9: [[GLOBAL_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_CMPSWAP_RTN [[COPY]], [[REG_SEQUENCE]], 0, 1, 0, 0, implicit $exec :: (load store seq_cst 4, addrspace 1)
     ; GFX10-LABEL: name: amdgpu_atomic_cmpxchg_s32_global_nortn
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2, $vgpr3
     ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX10: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr3
     ; GFX10: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1
-    ; GFX10: [[GLOBAL_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_CMPSWAP_RTN [[COPY]], [[REG_SEQUENCE]], 0, 1, 0, implicit $exec :: (load store seq_cst 4, addrspace 1)
+    ; GFX10: [[GLOBAL_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_CMPSWAP_RTN [[COPY]], [[REG_SEQUENCE]], 0, 1, 0, 0, implicit $exec :: (load store seq_cst 4, addrspace 1)
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s32) = COPY $vgpr2
     %2:vgpr(s32) = COPY $vgpr3
@@ -594,28 +594,28 @@ body:             |
     ; GFX7-FLAT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
     ; GFX7-FLAT: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr4_vgpr5
     ; GFX7-FLAT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0_sub1, [[COPY2]], %subreg.sub2_sub3
-    ; GFX7-FLAT: [[FLAT_ATOMIC_CMPSWAP_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_CMPSWAP_X2_RTN [[COPY]], [[REG_SEQUENCE]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 8, addrspace 1)
+    ; GFX7-FLAT: [[FLAT_ATOMIC_CMPSWAP_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_CMPSWAP_X2_RTN [[COPY]], [[REG_SEQUENCE]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 8, addrspace 1)
     ; GFX8-LABEL: name: amdgpu_atomic_cmpxchg_s64_global_nortn
     ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $vgpr4_vgpr5
     ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX8: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
     ; GFX8: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr4_vgpr5
     ; GFX8: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0_sub1, [[COPY2]], %subreg.sub2_sub3
-    ; GFX8: [[FLAT_ATOMIC_CMPSWAP_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_CMPSWAP_X2_RTN [[COPY]], [[REG_SEQUENCE]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 8, addrspace 1)
+    ; GFX8: [[FLAT_ATOMIC_CMPSWAP_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_CMPSWAP_X2_RTN [[COPY]], [[REG_SEQUENCE]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 8, addrspace 1)
     ; GFX9-LABEL: name: amdgpu_atomic_cmpxchg_s64_global_nortn
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $vgpr4_vgpr5
     ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
     ; GFX9: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr4_vgpr5
     ; GFX9: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0_sub1, [[COPY2]], %subreg.sub2_sub3
-    ; GFX9: [[GLOBAL_ATOMIC_CMPSWAP_X2_RTN:%[0-9]+]]:vreg_64 = GLOBAL_ATOMIC_CMPSWAP_X2_RTN [[COPY]], [[REG_SEQUENCE]], 0, 1, 0, implicit $exec :: (load store seq_cst 8, addrspace 1)
+    ; GFX9: [[GLOBAL_ATOMIC_CMPSWAP_X2_RTN:%[0-9]+]]:vreg_64 = GLOBAL_ATOMIC_CMPSWAP_X2_RTN [[COPY]], [[REG_SEQUENCE]], 0, 1, 0, 0, implicit $exec :: (load store seq_cst 8, addrspace 1)
     ; GFX10-LABEL: name: amdgpu_atomic_cmpxchg_s64_global_nortn
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $vgpr4_vgpr5
     ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
     ; GFX10: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr4_vgpr5
     ; GFX10: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0_sub1, [[COPY2]], %subreg.sub2_sub3
-    ; GFX10: [[GLOBAL_ATOMIC_CMPSWAP_X2_RTN:%[0-9]+]]:vreg_64 = GLOBAL_ATOMIC_CMPSWAP_X2_RTN [[COPY]], [[REG_SEQUENCE]], 0, 1, 0, implicit $exec :: (load store seq_cst 8, addrspace 1)
+    ; GFX10: [[GLOBAL_ATOMIC_CMPSWAP_X2_RTN:%[0-9]+]]:vreg_64 = GLOBAL_ATOMIC_CMPSWAP_X2_RTN [[COPY]], [[REG_SEQUENCE]], 0, 1, 0, 0, implicit $exec :: (load store seq_cst 8, addrspace 1)
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s64) = COPY $vgpr2_vgpr3
     %2:vgpr(s64) = COPY $vgpr4_vgpr5
@@ -666,7 +666,7 @@ body:             |
     ; GFX7-FLAT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr3
     ; GFX7-FLAT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1
     ; GFX7-FLAT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[COPY]]
-    ; GFX7-FLAT: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[COPY3]], [[REG_SEQUENCE]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4, addrspace 1)
+    ; GFX7-FLAT: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[COPY3]], [[REG_SEQUENCE]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4, addrspace 1)
     ; GFX7-FLAT: $vgpr0 = COPY [[FLAT_ATOMIC_CMPSWAP_RTN]]
     ; GFX8-LABEL: name: amdgpu_atomic_cmpxchg_s32_global_sgpr_ptr
     ; GFX8: liveins: $sgpr0_sgpr1, $vgpr2, $vgpr3
@@ -675,7 +675,7 @@ body:             |
     ; GFX8: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr3
     ; GFX8: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1
     ; GFX8: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[COPY]]
-    ; GFX8: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[COPY3]], [[REG_SEQUENCE]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4, addrspace 1)
+    ; GFX8: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[COPY3]], [[REG_SEQUENCE]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4, addrspace 1)
     ; GFX8: $vgpr0 = COPY [[FLAT_ATOMIC_CMPSWAP_RTN]]
     ; GFX9-LABEL: name: amdgpu_atomic_cmpxchg_s32_global_sgpr_ptr
     ; GFX9: liveins: $sgpr0_sgpr1, $vgpr2, $vgpr3
@@ -684,7 +684,7 @@ body:             |
     ; GFX9: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr3
     ; GFX9: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1
     ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GFX9: [[GLOBAL_ATOMIC_CMPSWAP_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_CMPSWAP_SADDR_RTN [[V_MOV_B32_e32_]], [[REG_SEQUENCE]], [[COPY]], 0, 1, 0, implicit $exec :: (load store seq_cst 4, addrspace 1)
+    ; GFX9: [[GLOBAL_ATOMIC_CMPSWAP_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_CMPSWAP_SADDR_RTN [[V_MOV_B32_e32_]], [[REG_SEQUENCE]], [[COPY]], 0, 1, 0, 0, implicit $exec :: (load store seq_cst 4, addrspace 1)
     ; GFX9: $vgpr0 = COPY [[GLOBAL_ATOMIC_CMPSWAP_SADDR_RTN]]
     ; GFX10-LABEL: name: amdgpu_atomic_cmpxchg_s32_global_sgpr_ptr
     ; GFX10: liveins: $sgpr0_sgpr1, $vgpr2, $vgpr3
@@ -693,7 +693,7 @@ body:             |
     ; GFX10: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr3
     ; GFX10: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1
     ; GFX10: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GFX10: [[GLOBAL_ATOMIC_CMPSWAP_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_CMPSWAP_SADDR_RTN [[V_MOV_B32_e32_]], [[REG_SEQUENCE]], [[COPY]], 0, 1, 0, implicit $exec :: (load store seq_cst 4, addrspace 1)
+    ; GFX10: [[GLOBAL_ATOMIC_CMPSWAP_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_CMPSWAP_SADDR_RTN [[V_MOV_B32_e32_]], [[REG_SEQUENCE]], [[COPY]], 0, 1, 0, 0, implicit $exec :: (load store seq_cst 4, addrspace 1)
     ; GFX10: $vgpr0 = COPY [[GLOBAL_ATOMIC_CMPSWAP_SADDR_RTN]]
     %0:sgpr(p1) = COPY $sgpr0_sgpr1
     %1:vgpr(s32) = COPY $vgpr2
@@ -756,7 +756,7 @@ body:             |
     ; GFX7-FLAT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
     ; GFX7-FLAT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1
     ; GFX7-FLAT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]]
-    ; GFX7-FLAT: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[COPY7]], [[REG_SEQUENCE2]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4, addrspace 1)
+    ; GFX7-FLAT: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[COPY7]], [[REG_SEQUENCE2]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4, addrspace 1)
     ; GFX7-FLAT: $vgpr0 = COPY [[FLAT_ATOMIC_CMPSWAP_RTN]]
     ; GFX8-LABEL: name: amdgpu_atomic_cmpxchg_s32_global_sgpr_ptr_offset_4095
     ; GFX8: liveins: $sgpr0_sgpr1, $vgpr2, $vgpr3
@@ -775,7 +775,7 @@ body:             |
     ; GFX8: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
     ; GFX8: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1
     ; GFX8: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]]
-    ; GFX8: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[COPY7]], [[REG_SEQUENCE2]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4, addrspace 1)
+    ; GFX8: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[COPY7]], [[REG_SEQUENCE2]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4, addrspace 1)
     ; GFX8: $vgpr0 = COPY [[FLAT_ATOMIC_CMPSWAP_RTN]]
     ; GFX9-LABEL: name: amdgpu_atomic_cmpxchg_s32_global_sgpr_ptr_offset_4095
     ; GFX9: liveins: $sgpr0_sgpr1, $vgpr2, $vgpr3
@@ -784,7 +784,7 @@ body:             |
     ; GFX9: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr3
     ; GFX9: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1
     ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GFX9: [[GLOBAL_ATOMIC_CMPSWAP_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_CMPSWAP_SADDR_RTN [[V_MOV_B32_e32_]], [[REG_SEQUENCE]], [[COPY]], 4095, 1, 0, implicit $exec :: (load store seq_cst 4, addrspace 1)
+    ; GFX9: [[GLOBAL_ATOMIC_CMPSWAP_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_CMPSWAP_SADDR_RTN [[V_MOV_B32_e32_]], [[REG_SEQUENCE]], [[COPY]], 4095, 1, 0, 0, implicit $exec :: (load store seq_cst 4, addrspace 1)
     ; GFX9: $vgpr0 = COPY [[GLOBAL_ATOMIC_CMPSWAP_SADDR_RTN]]
     ; GFX10-LABEL: name: amdgpu_atomic_cmpxchg_s32_global_sgpr_ptr_offset_4095
     ; GFX10: liveins: $sgpr0_sgpr1, $vgpr2, $vgpr3
@@ -793,7 +793,7 @@ body:             |
     ; GFX10: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr3
     ; GFX10: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1
     ; GFX10: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2048, implicit $exec
-    ; GFX10: [[GLOBAL_ATOMIC_CMPSWAP_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_CMPSWAP_SADDR_RTN [[V_MOV_B32_e32_]], [[REG_SEQUENCE]], [[COPY]], 2047, 1, 0, implicit $exec :: (load store seq_cst 4, addrspace 1)
+    ; GFX10: [[GLOBAL_ATOMIC_CMPSWAP_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_CMPSWAP_SADDR_RTN [[V_MOV_B32_e32_]], [[REG_SEQUENCE]], [[COPY]], 2047, 1, 0, 0, implicit $exec :: (load store seq_cst 4, addrspace 1)
     ; GFX10: $vgpr0 = COPY [[GLOBAL_ATOMIC_CMPSWAP_SADDR_RTN]]
     %0:sgpr(p1) = COPY $sgpr0_sgpr1
     %1:vgpr(s32) = COPY $vgpr2

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-atomicrmw-add-flat.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-atomicrmw-add-flat.mir
index 6d568a57df28..1362980492a7 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-atomicrmw-add-flat.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-atomicrmw-add-flat.mir
@@ -17,19 +17,19 @@ body:             |
     ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX7: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4)
+    ; GFX7: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4)
     ; GFX7: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_RTN]]
     ; GFX9-LABEL: name: flat_atomicrmw_add_s32
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX9: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4)
+    ; GFX9: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4)
     ; GFX9: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_RTN]]
     ; GFX10-LABEL: name: flat_atomicrmw_add_s32
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX10: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4)
+    ; GFX10: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4)
     ; GFX10: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_RTN]]
     %0:vgpr(p0) = COPY $vgpr0_vgpr1
     %1:vgpr(s32) = COPY $vgpr2
@@ -51,17 +51,17 @@ body:             |
     ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX7: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4)
+    ; GFX7: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4)
     ; GFX9-LABEL: name: flat_atomicrmw_add_s32_nortn
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX9: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4)
+    ; GFX9: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4)
     ; GFX10-LABEL: name: flat_atomicrmw_add_s32_nortn
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX10: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4)
+    ; GFX10: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4)
     %0:vgpr(p0) = COPY $vgpr0_vgpr1
     %1:vgpr(s32) = COPY $vgpr2
     %2:vgpr(s32) = G_ATOMICRMW_ADD %0, %1 :: (load store seq_cst 4, addrspace 0)
@@ -91,13 +91,13 @@ body:             |
     ; GFX7: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
     ; GFX7: %10:vgpr_32, dead %12:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %10, %subreg.sub1
-    ; GFX7: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4)
+    ; GFX7: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4)
     ; GFX7: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_RTN]]
     ; GFX9-LABEL: name: flat_atomicrmw_add_s32_offset2047
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX9: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 2047, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4)
+    ; GFX9: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 2047, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4)
     ; GFX9: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_RTN]]
     ; GFX10-LABEL: name: flat_atomicrmw_add_s32_offset2047
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2
@@ -113,7 +113,7 @@ body:             |
     ; GFX10: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
     ; GFX10: %10:vgpr_32, dead %12:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX10: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %10, %subreg.sub1
-    ; GFX10: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4)
+    ; GFX10: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4)
     ; GFX10: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_RTN]]
     %0:vgpr(p0) = COPY $vgpr0_vgpr1
     %1:vgpr(s32) = COPY $vgpr2
@@ -147,12 +147,12 @@ body:             |
     ; GFX7: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
     ; GFX7: %10:vgpr_32, dead %12:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %10, %subreg.sub1
-    ; GFX7: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4)
+    ; GFX7: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4)
     ; GFX9-LABEL: name: flat_atomicrmw_add_s32_offset2047_nortn
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX9: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 2047, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4)
+    ; GFX9: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 2047, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4)
     ; GFX10-LABEL: name: flat_atomicrmw_add_s32_offset2047_nortn
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
@@ -167,7 +167,7 @@ body:             |
     ; GFX10: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
     ; GFX10: %10:vgpr_32, dead %12:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX10: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %10, %subreg.sub1
-    ; GFX10: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4)
+    ; GFX10: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4)
     %0:vgpr(p0) = COPY $vgpr0_vgpr1
     %1:vgpr(s32) = COPY $vgpr2
     %2:vgpr(s64) = G_CONSTANT i64 2047
@@ -199,13 +199,13 @@ body:             |
     ; GFX7: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
     ; GFX7: %10:vgpr_32, dead %12:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %10, %subreg.sub1
-    ; GFX7: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4)
+    ; GFX7: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4)
     ; GFX7: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_RTN]]
     ; GFX9-LABEL: name: flat_atomicrmw_add_s32_offset2048
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX9: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 2048, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4)
+    ; GFX9: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 2048, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4)
     ; GFX9: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_RTN]]
     ; GFX10-LABEL: name: flat_atomicrmw_add_s32_offset2048
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2
@@ -221,7 +221,7 @@ body:             |
     ; GFX10: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
     ; GFX10: %10:vgpr_32, dead %12:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX10: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %10, %subreg.sub1
-    ; GFX10: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4)
+    ; GFX10: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4)
     ; GFX10: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_RTN]]
     %0:vgpr(p0) = COPY $vgpr0_vgpr1
     %1:vgpr(s32) = COPY $vgpr2
@@ -255,12 +255,12 @@ body:             |
     ; GFX7: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
     ; GFX7: %10:vgpr_32, dead %12:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %10, %subreg.sub1
-    ; GFX7: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4)
+    ; GFX7: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4)
     ; GFX9-LABEL: name: flat_atomicrmw_add_s32_offset2048_nortn
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX9: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 2048, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4)
+    ; GFX9: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 2048, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4)
     ; GFX10-LABEL: name: flat_atomicrmw_add_s32_offset2048_nortn
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
@@ -275,7 +275,7 @@ body:             |
     ; GFX10: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
     ; GFX10: %10:vgpr_32, dead %12:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX10: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %10, %subreg.sub1
-    ; GFX10: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4)
+    ; GFX10: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4)
     %0:vgpr(p0) = COPY $vgpr0_vgpr1
     %1:vgpr(s32) = COPY $vgpr2
     %2:vgpr(s64) = G_CONSTANT i64 2048
@@ -307,13 +307,13 @@ body:             |
     ; GFX7: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
     ; GFX7: %10:vgpr_32, dead %12:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %10, %subreg.sub1
-    ; GFX7: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4)
+    ; GFX7: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4)
     ; GFX7: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_RTN]]
     ; GFX9-LABEL: name: flat_atomicrmw_add_s32_offset4095
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX9: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 4095, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4)
+    ; GFX9: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 4095, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4)
     ; GFX9: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_RTN]]
     ; GFX10-LABEL: name: flat_atomicrmw_add_s32_offset4095
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2
@@ -329,7 +329,7 @@ body:             |
     ; GFX10: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
     ; GFX10: %10:vgpr_32, dead %12:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX10: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %10, %subreg.sub1
-    ; GFX10: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4)
+    ; GFX10: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4)
     ; GFX10: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_RTN]]
     %0:vgpr(p0) = COPY $vgpr0_vgpr1
     %1:vgpr(s32) = COPY $vgpr2
@@ -363,12 +363,12 @@ body:             |
     ; GFX7: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
     ; GFX7: %10:vgpr_32, dead %12:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %10, %subreg.sub1
-    ; GFX7: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4)
+    ; GFX7: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4)
     ; GFX9-LABEL: name: flat_atomicrmw_add_s32_offset4095_nortn
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX9: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 4095, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4)
+    ; GFX9: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 4095, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4)
     ; GFX10-LABEL: name: flat_atomicrmw_add_s32_offset4095_nortn
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
@@ -383,7 +383,7 @@ body:             |
     ; GFX10: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
     ; GFX10: %10:vgpr_32, dead %12:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX10: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %10, %subreg.sub1
-    ; GFX10: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4)
+    ; GFX10: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4)
     %0:vgpr(p0) = COPY $vgpr0_vgpr1
     %1:vgpr(s32) = COPY $vgpr2
     %2:vgpr(s64) = G_CONSTANT i64 4095
@@ -415,7 +415,7 @@ body:             |
     ; GFX7: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
     ; GFX7: %10:vgpr_32, dead %12:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %10, %subreg.sub1
-    ; GFX7: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4)
+    ; GFX7: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4)
     ; GFX7: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_RTN]]
     ; GFX9-LABEL: name: flat_atomicrmw_add_s32_offset4097
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2
@@ -431,7 +431,7 @@ body:             |
     ; GFX9: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
     ; GFX9: %10:vgpr_32, dead %12:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX9: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %10, %subreg.sub1
-    ; GFX9: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4)
+    ; GFX9: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4)
     ; GFX9: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_RTN]]
     ; GFX10-LABEL: name: flat_atomicrmw_add_s32_offset4097
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2
@@ -447,7 +447,7 @@ body:             |
     ; GFX10: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
     ; GFX10: %10:vgpr_32, dead %12:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX10: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %10, %subreg.sub1
-    ; GFX10: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4)
+    ; GFX10: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4)
     ; GFX10: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_RTN]]
     %0:vgpr(p0) = COPY $vgpr0_vgpr1
     %1:vgpr(s32) = COPY $vgpr2
@@ -481,7 +481,7 @@ body:             |
     ; GFX7: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
     ; GFX7: %10:vgpr_32, dead %12:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %10, %subreg.sub1
-    ; GFX7: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4)
+    ; GFX7: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4)
     ; GFX9-LABEL: name: flat_atomicrmw_add_s32_offset4097_nortn
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
@@ -496,7 +496,7 @@ body:             |
     ; GFX9: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
     ; GFX9: %10:vgpr_32, dead %12:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX9: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %10, %subreg.sub1
-    ; GFX9: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4)
+    ; GFX9: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4)
     ; GFX10-LABEL: name: flat_atomicrmw_add_s32_offset4097_nortn
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
@@ -511,7 +511,7 @@ body:             |
     ; GFX10: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
     ; GFX10: %10:vgpr_32, dead %12:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX10: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %10, %subreg.sub1
-    ; GFX10: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4)
+    ; GFX10: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4)
     %0:vgpr(p0) = COPY $vgpr0_vgpr1
     %1:vgpr(s32) = COPY $vgpr2
     %2:vgpr(s64) = G_CONSTANT i64 4097
@@ -533,19 +533,19 @@ body:             |
     ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
-    ; GFX7: [[FLAT_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_ADD_X2_RTN [[COPY]], [[COPY1]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 8)
+    ; GFX7: [[FLAT_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_ADD_X2_RTN [[COPY]], [[COPY1]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 8)
     ; GFX7: $vgpr0_vgpr1 = COPY [[FLAT_ATOMIC_ADD_X2_RTN]]
     ; GFX9-LABEL: name: flat_atomicrmw_add_s64
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
-    ; GFX9: [[FLAT_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_ADD_X2_RTN [[COPY]], [[COPY1]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 8)
+    ; GFX9: [[FLAT_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_ADD_X2_RTN [[COPY]], [[COPY1]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 8)
     ; GFX9: $vgpr0_vgpr1 = COPY [[FLAT_ATOMIC_ADD_X2_RTN]]
     ; GFX10-LABEL: name: flat_atomicrmw_add_s64
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
-    ; GFX10: [[FLAT_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_ADD_X2_RTN [[COPY]], [[COPY1]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 8)
+    ; GFX10: [[FLAT_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_ADD_X2_RTN [[COPY]], [[COPY1]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 8)
     ; GFX10: $vgpr0_vgpr1 = COPY [[FLAT_ATOMIC_ADD_X2_RTN]]
     %0:vgpr(p0) = COPY $vgpr0_vgpr1
     %1:vgpr(s64) = COPY $vgpr2_vgpr3
@@ -567,17 +567,17 @@ body:             |
     ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
-    ; GFX7: [[FLAT_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_ADD_X2_RTN [[COPY]], [[COPY1]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 8)
+    ; GFX7: [[FLAT_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_ADD_X2_RTN [[COPY]], [[COPY1]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 8)
     ; GFX9-LABEL: name: flat_atomicrmw_add_s64_nortn
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
-    ; GFX9: [[FLAT_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_ADD_X2_RTN [[COPY]], [[COPY1]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 8)
+    ; GFX9: [[FLAT_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_ADD_X2_RTN [[COPY]], [[COPY1]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 8)
     ; GFX10-LABEL: name: flat_atomicrmw_add_s64_nortn
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
-    ; GFX10: [[FLAT_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_ADD_X2_RTN [[COPY]], [[COPY1]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 8)
+    ; GFX10: [[FLAT_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_ADD_X2_RTN [[COPY]], [[COPY1]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 8)
     %0:vgpr(p0) = COPY $vgpr0_vgpr1
     %1:vgpr(s64) = COPY $vgpr2_vgpr3
     %2:vgpr(s64) = G_ATOMICRMW_ADD %0, %1 :: (load store seq_cst 8, addrspace 0)
@@ -607,13 +607,13 @@ body:             |
     ; GFX7: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
     ; GFX7: %10:vgpr_32, dead %12:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %10, %subreg.sub1
-    ; GFX7: [[FLAT_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_ADD_X2_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 8)
+    ; GFX7: [[FLAT_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_ADD_X2_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 8)
     ; GFX7: $vgpr0_vgpr1 = COPY [[FLAT_ATOMIC_ADD_X2_RTN]]
     ; GFX9-LABEL: name: flat_atomicrmw_add_s64_offset4095
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
-    ; GFX9: [[FLAT_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_ADD_X2_RTN [[COPY]], [[COPY1]], 4095, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 8)
+    ; GFX9: [[FLAT_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_ADD_X2_RTN [[COPY]], [[COPY1]], 4095, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 8)
     ; GFX9: $vgpr0_vgpr1 = COPY [[FLAT_ATOMIC_ADD_X2_RTN]]
     ; GFX10-LABEL: name: flat_atomicrmw_add_s64_offset4095
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
@@ -629,7 +629,7 @@ body:             |
     ; GFX10: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
     ; GFX10: %10:vgpr_32, dead %12:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX10: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %10, %subreg.sub1
-    ; GFX10: [[FLAT_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_ADD_X2_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 8)
+    ; GFX10: [[FLAT_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_ADD_X2_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 8)
     ; GFX10: $vgpr0_vgpr1 = COPY [[FLAT_ATOMIC_ADD_X2_RTN]]
     %0:vgpr(p0) = COPY $vgpr0_vgpr1
     %1:vgpr(s64) = COPY $vgpr2_vgpr3
@@ -663,12 +663,12 @@ body:             |
     ; GFX7: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
     ; GFX7: %10:vgpr_32, dead %12:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %10, %subreg.sub1
-    ; GFX7: [[FLAT_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_ADD_X2_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 8)
+    ; GFX7: [[FLAT_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_ADD_X2_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 8)
     ; GFX9-LABEL: name: flat_atomicrmw_add_s64_offset4095_nortn
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
-    ; GFX9: [[FLAT_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_ADD_X2_RTN [[COPY]], [[COPY1]], 4095, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 8)
+    ; GFX9: [[FLAT_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_ADD_X2_RTN [[COPY]], [[COPY1]], 4095, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 8)
     ; GFX10-LABEL: name: flat_atomicrmw_add_s64_offset4095_nortn
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
@@ -683,7 +683,7 @@ body:             |
     ; GFX10: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
     ; GFX10: %10:vgpr_32, dead %12:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX10: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %10, %subreg.sub1
-    ; GFX10: [[FLAT_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_ADD_X2_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 8)
+    ; GFX10: [[FLAT_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_ADD_X2_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 8)
     %0:vgpr(p0) = COPY $vgpr0_vgpr1
     %1:vgpr(s64) = COPY $vgpr2_vgpr3
     %2:vgpr(s64) = G_CONSTANT i64 4095

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-atomicrmw-add-global.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-atomicrmw-add-global.mir
index d6f65471d7db..97d2dbe584df 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-atomicrmw-add-global.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-atomicrmw-add-global.mir
@@ -18,7 +18,7 @@ body:             |
     ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX7: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4, addrspace 1)
+    ; GFX7: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4, addrspace 1)
     ; GFX7: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_RTN]]
     ; GFX6-LABEL: name: global_atomicrmw_add_s32
     ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2
@@ -35,13 +35,13 @@ body:             |
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX9: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 0, 1, 0, implicit $exec :: (load store seq_cst 4, addrspace 1)
+    ; GFX9: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 0, 1, 0, 0, implicit $exec :: (load store seq_cst 4, addrspace 1)
     ; GFX9: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_RTN]]
     ; GFX10-LABEL: name: global_atomicrmw_add_s32
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX10: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 0, 1, 0, implicit $exec :: (load store seq_cst 4, addrspace 1)
+    ; GFX10: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 0, 1, 0, 0, implicit $exec :: (load store seq_cst 4, addrspace 1)
     ; GFX10: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_RTN]]
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s32) = COPY $vgpr2
@@ -63,7 +63,7 @@ body:             |
     ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX7: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4, addrspace 1)
+    ; GFX7: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4, addrspace 1)
     ; GFX6-LABEL: name: global_atomicrmw_add_s32_nortn
     ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX6: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
@@ -78,12 +78,12 @@ body:             |
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX9: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 0, 1, 0, implicit $exec :: (load store seq_cst 4, addrspace 1)
+    ; GFX9: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 0, 1, 0, 0, implicit $exec :: (load store seq_cst 4, addrspace 1)
     ; GFX10-LABEL: name: global_atomicrmw_add_s32_nortn
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX10: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 0, 1, 0, implicit $exec :: (load store seq_cst 4, addrspace 1)
+    ; GFX10: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 0, 1, 0, 0, implicit $exec :: (load store seq_cst 4, addrspace 1)
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s32) = COPY $vgpr2
     %2:vgpr(s32) = G_ATOMICRMW_ADD %0, %1 :: (load store seq_cst 4, addrspace 1)
@@ -113,7 +113,7 @@ body:             |
     ; GFX7: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
     ; GFX7: %10:vgpr_32, dead %12:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %10, %subreg.sub1
-    ; GFX7: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4, addrspace 1)
+    ; GFX7: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4, addrspace 1)
     ; GFX7: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_RTN]]
     ; GFX6-LABEL: name: global_atomicrmw_add_s32_offset2047
     ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2
@@ -130,13 +130,13 @@ body:             |
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX9: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 2047, 1, 0, implicit $exec :: (load store seq_cst 4, addrspace 1)
+    ; GFX9: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 2047, 1, 0, 0, implicit $exec :: (load store seq_cst 4, addrspace 1)
     ; GFX9: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_RTN]]
     ; GFX10-LABEL: name: global_atomicrmw_add_s32_offset2047
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX10: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 2047, 1, 0, implicit $exec :: (load store seq_cst 4, addrspace 1)
+    ; GFX10: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 2047, 1, 0, 0, implicit $exec :: (load store seq_cst 4, addrspace 1)
     ; GFX10: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_RTN]]
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s32) = COPY $vgpr2
@@ -170,7 +170,7 @@ body:             |
     ; GFX7: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
     ; GFX7: %10:vgpr_32, dead %12:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %10, %subreg.sub1
-    ; GFX7: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4, addrspace 1)
+    ; GFX7: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4, addrspace 1)
     ; GFX6-LABEL: name: global_atomicrmw_add_s32_offset2047_nortn
     ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX6: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
@@ -185,12 +185,12 @@ body:             |
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX9: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 2047, 1, 0, implicit $exec :: (load store seq_cst 4, addrspace 1)
+    ; GFX9: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 2047, 1, 0, 0, implicit $exec :: (load store seq_cst 4, addrspace 1)
     ; GFX10-LABEL: name: global_atomicrmw_add_s32_offset2047_nortn
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX10: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 2047, 1, 0, implicit $exec :: (load store seq_cst 4, addrspace 1)
+    ; GFX10: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 2047, 1, 0, 0, implicit $exec :: (load store seq_cst 4, addrspace 1)
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s32) = COPY $vgpr2
     %2:vgpr(s64) = G_CONSTANT i64 2047
@@ -222,7 +222,7 @@ body:             |
     ; GFX7: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
     ; GFX7: %10:vgpr_32, dead %12:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %10, %subreg.sub1
-    ; GFX7: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4, addrspace 1)
+    ; GFX7: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4, addrspace 1)
     ; GFX7: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_RTN]]
     ; GFX6-LABEL: name: global_atomicrmw_add_s32_offset2048
     ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2
@@ -239,7 +239,7 @@ body:             |
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX9: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 2048, 1, 0, implicit $exec :: (load store seq_cst 4, addrspace 1)
+    ; GFX9: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 2048, 1, 0, 0, implicit $exec :: (load store seq_cst 4, addrspace 1)
     ; GFX9: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_RTN]]
     ; GFX10-LABEL: name: global_atomicrmw_add_s32_offset2048
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2
@@ -255,7 +255,7 @@ body:             |
     ; GFX10: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
     ; GFX10: %10:vgpr_32, dead %12:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX10: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %10, %subreg.sub1
-    ; GFX10: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, implicit $exec :: (load store seq_cst 4, addrspace 1)
+    ; GFX10: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, 0, implicit $exec :: (load store seq_cst 4, addrspace 1)
     ; GFX10: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_RTN]]
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s32) = COPY $vgpr2
@@ -289,7 +289,7 @@ body:             |
     ; GFX7: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
     ; GFX7: %10:vgpr_32, dead %12:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %10, %subreg.sub1
-    ; GFX7: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4, addrspace 1)
+    ; GFX7: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4, addrspace 1)
     ; GFX6-LABEL: name: global_atomicrmw_add_s32_offset2048_nortn
     ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX6: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
@@ -304,7 +304,7 @@ body:             |
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX9: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 2048, 1, 0, implicit $exec :: (load store seq_cst 4, addrspace 1)
+    ; GFX9: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 2048, 1, 0, 0, implicit $exec :: (load store seq_cst 4, addrspace 1)
     ; GFX10-LABEL: name: global_atomicrmw_add_s32_offset2048_nortn
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
@@ -319,7 +319,7 @@ body:             |
     ; GFX10: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
     ; GFX10: %10:vgpr_32, dead %12:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX10: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %10, %subreg.sub1
-    ; GFX10: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, implicit $exec :: (load store seq_cst 4, addrspace 1)
+    ; GFX10: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, 0, implicit $exec :: (load store seq_cst 4, addrspace 1)
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s32) = COPY $vgpr2
     %2:vgpr(s64) = G_CONSTANT i64 2048
@@ -351,7 +351,7 @@ body:             |
     ; GFX7: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
     ; GFX7: %10:vgpr_32, dead %12:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %10, %subreg.sub1
-    ; GFX7: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4, addrspace 1)
+    ; GFX7: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4, addrspace 1)
     ; GFX7: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_RTN]]
     ; GFX6-LABEL: name: global_atomicrmw_add_s32_offset4095
     ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2
@@ -368,7 +368,7 @@ body:             |
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX9: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 4095, 1, 0, implicit $exec :: (load store seq_cst 4, addrspace 1)
+    ; GFX9: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 4095, 1, 0, 0, implicit $exec :: (load store seq_cst 4, addrspace 1)
     ; GFX9: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_RTN]]
     ; GFX10-LABEL: name: global_atomicrmw_add_s32_offset4095
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2
@@ -384,7 +384,7 @@ body:             |
     ; GFX10: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
     ; GFX10: %10:vgpr_32, dead %12:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX10: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %10, %subreg.sub1
-    ; GFX10: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, implicit $exec :: (load store seq_cst 4, addrspace 1)
+    ; GFX10: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, 0, implicit $exec :: (load store seq_cst 4, addrspace 1)
     ; GFX10: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_RTN]]
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s32) = COPY $vgpr2
@@ -418,7 +418,7 @@ body:             |
     ; GFX7: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
     ; GFX7: %10:vgpr_32, dead %12:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %10, %subreg.sub1
-    ; GFX7: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4, addrspace 1)
+    ; GFX7: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4, addrspace 1)
     ; GFX6-LABEL: name: global_atomicrmw_add_s32_offset4095_nortn
     ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX6: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
@@ -433,7 +433,7 @@ body:             |
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX9: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 4095, 1, 0, implicit $exec :: (load store seq_cst 4, addrspace 1)
+    ; GFX9: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 4095, 1, 0, 0, implicit $exec :: (load store seq_cst 4, addrspace 1)
     ; GFX10-LABEL: name: global_atomicrmw_add_s32_offset4095_nortn
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
@@ -448,7 +448,7 @@ body:             |
     ; GFX10: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
     ; GFX10: %10:vgpr_32, dead %12:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX10: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %10, %subreg.sub1
-    ; GFX10: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, implicit $exec :: (load store seq_cst 4, addrspace 1)
+    ; GFX10: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, 0, implicit $exec :: (load store seq_cst 4, addrspace 1)
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s32) = COPY $vgpr2
     %2:vgpr(s64) = G_CONSTANT i64 4095
@@ -480,7 +480,7 @@ body:             |
     ; GFX7: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
     ; GFX7: %10:vgpr_32, dead %12:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %10, %subreg.sub1
-    ; GFX7: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4, addrspace 1)
+    ; GFX7: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4, addrspace 1)
     ; GFX7: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_RTN]]
     ; GFX6-LABEL: name: global_atomicrmw_add_s32_offset4097
     ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2
@@ -508,7 +508,7 @@ body:             |
     ; GFX9: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
     ; GFX9: %10:vgpr_32, dead %12:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX9: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %10, %subreg.sub1
-    ; GFX9: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, implicit $exec :: (load store seq_cst 4, addrspace 1)
+    ; GFX9: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, 0, implicit $exec :: (load store seq_cst 4, addrspace 1)
     ; GFX9: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_RTN]]
     ; GFX10-LABEL: name: global_atomicrmw_add_s32_offset4097
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2
@@ -524,7 +524,7 @@ body:             |
     ; GFX10: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
     ; GFX10: %10:vgpr_32, dead %12:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX10: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %10, %subreg.sub1
-    ; GFX10: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, implicit $exec :: (load store seq_cst 4, addrspace 1)
+    ; GFX10: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, 0, implicit $exec :: (load store seq_cst 4, addrspace 1)
     ; GFX10: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_RTN]]
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s32) = COPY $vgpr2
@@ -558,7 +558,7 @@ body:             |
     ; GFX7: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
     ; GFX7: %10:vgpr_32, dead %12:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %10, %subreg.sub1
-    ; GFX7: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4, addrspace 1)
+    ; GFX7: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4, addrspace 1)
     ; GFX6-LABEL: name: global_atomicrmw_add_s32_offset4097_nortn
     ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX6: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
@@ -584,7 +584,7 @@ body:             |
     ; GFX9: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
     ; GFX9: %10:vgpr_32, dead %12:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX9: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %10, %subreg.sub1
-    ; GFX9: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, implicit $exec :: (load store seq_cst 4, addrspace 1)
+    ; GFX9: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, 0, implicit $exec :: (load store seq_cst 4, addrspace 1)
     ; GFX10-LABEL: name: global_atomicrmw_add_s32_offset4097_nortn
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
@@ -599,7 +599,7 @@ body:             |
     ; GFX10: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
     ; GFX10: %10:vgpr_32, dead %12:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX10: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %10, %subreg.sub1
-    ; GFX10: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, implicit $exec :: (load store seq_cst 4, addrspace 1)
+    ; GFX10: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, 0, implicit $exec :: (load store seq_cst 4, addrspace 1)
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s32) = COPY $vgpr2
     %2:vgpr(s64) = G_CONSTANT i64 4097
@@ -621,7 +621,7 @@ body:             |
     ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
-    ; GFX7: [[FLAT_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_ADD_X2_RTN [[COPY]], [[COPY1]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 8, addrspace 1)
+    ; GFX7: [[FLAT_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_ADD_X2_RTN [[COPY]], [[COPY1]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 8, addrspace 1)
     ; GFX7: $vgpr0_vgpr1 = COPY [[FLAT_ATOMIC_ADD_X2_RTN]]
     ; GFX6-LABEL: name: global_atomicrmw_add_s64
     ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
@@ -638,13 +638,13 @@ body:             |
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
-    ; GFX9: [[GLOBAL_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = GLOBAL_ATOMIC_ADD_X2_RTN [[COPY]], [[COPY1]], 0, 1, 0, implicit $exec :: (load store seq_cst 8, addrspace 1)
+    ; GFX9: [[GLOBAL_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = GLOBAL_ATOMIC_ADD_X2_RTN [[COPY]], [[COPY1]], 0, 1, 0, 0, implicit $exec :: (load store seq_cst 8, addrspace 1)
     ; GFX9: $vgpr0_vgpr1 = COPY [[GLOBAL_ATOMIC_ADD_X2_RTN]]
     ; GFX10-LABEL: name: global_atomicrmw_add_s64
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
-    ; GFX10: [[GLOBAL_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = GLOBAL_ATOMIC_ADD_X2_RTN [[COPY]], [[COPY1]], 0, 1, 0, implicit $exec :: (load store seq_cst 8, addrspace 1)
+    ; GFX10: [[GLOBAL_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = GLOBAL_ATOMIC_ADD_X2_RTN [[COPY]], [[COPY1]], 0, 1, 0, 0, implicit $exec :: (load store seq_cst 8, addrspace 1)
     ; GFX10: $vgpr0_vgpr1 = COPY [[GLOBAL_ATOMIC_ADD_X2_RTN]]
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s64) = COPY $vgpr2_vgpr3
@@ -666,7 +666,7 @@ body:             |
     ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
-    ; GFX7: [[FLAT_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_ADD_X2_RTN [[COPY]], [[COPY1]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 8, addrspace 1)
+    ; GFX7: [[FLAT_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_ADD_X2_RTN [[COPY]], [[COPY1]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 8, addrspace 1)
     ; GFX6-LABEL: name: global_atomicrmw_add_s64_nortn
     ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX6: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
@@ -681,12 +681,12 @@ body:             |
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
-    ; GFX9: [[GLOBAL_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = GLOBAL_ATOMIC_ADD_X2_RTN [[COPY]], [[COPY1]], 0, 1, 0, implicit $exec :: (load store seq_cst 8, addrspace 1)
+    ; GFX9: [[GLOBAL_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = GLOBAL_ATOMIC_ADD_X2_RTN [[COPY]], [[COPY1]], 0, 1, 0, 0, implicit $exec :: (load store seq_cst 8, addrspace 1)
     ; GFX10-LABEL: name: global_atomicrmw_add_s64_nortn
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
-    ; GFX10: [[GLOBAL_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = GLOBAL_ATOMIC_ADD_X2_RTN [[COPY]], [[COPY1]], 0, 1, 0, implicit $exec :: (load store seq_cst 8, addrspace 1)
+    ; GFX10: [[GLOBAL_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = GLOBAL_ATOMIC_ADD_X2_RTN [[COPY]], [[COPY1]], 0, 1, 0, 0, implicit $exec :: (load store seq_cst 8, addrspace 1)
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s64) = COPY $vgpr2_vgpr3
     %2:vgpr(s64) = G_ATOMICRMW_ADD %0, %1 :: (load store seq_cst 8, addrspace 1)
@@ -716,7 +716,7 @@ body:             |
     ; GFX7: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
     ; GFX7: %10:vgpr_32, dead %12:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %10, %subreg.sub1
-    ; GFX7: [[FLAT_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_ADD_X2_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 8, addrspace 1)
+    ; GFX7: [[FLAT_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_ADD_X2_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 8, addrspace 1)
     ; GFX7: $vgpr0_vgpr1 = COPY [[FLAT_ATOMIC_ADD_X2_RTN]]
     ; GFX6-LABEL: name: global_atomicrmw_add_s64_offset4095
     ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
@@ -733,7 +733,7 @@ body:             |
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
-    ; GFX9: [[GLOBAL_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = GLOBAL_ATOMIC_ADD_X2_RTN [[COPY]], [[COPY1]], 4095, 1, 0, implicit $exec :: (load store seq_cst 8, addrspace 1)
+    ; GFX9: [[GLOBAL_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = GLOBAL_ATOMIC_ADD_X2_RTN [[COPY]], [[COPY1]], 4095, 1, 0, 0, implicit $exec :: (load store seq_cst 8, addrspace 1)
     ; GFX9: $vgpr0_vgpr1 = COPY [[GLOBAL_ATOMIC_ADD_X2_RTN]]
     ; GFX10-LABEL: name: global_atomicrmw_add_s64_offset4095
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
@@ -749,7 +749,7 @@ body:             |
     ; GFX10: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
     ; GFX10: %10:vgpr_32, dead %12:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX10: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %10, %subreg.sub1
-    ; GFX10: [[GLOBAL_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = GLOBAL_ATOMIC_ADD_X2_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, implicit $exec :: (load store seq_cst 8, addrspace 1)
+    ; GFX10: [[GLOBAL_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = GLOBAL_ATOMIC_ADD_X2_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, 0, implicit $exec :: (load store seq_cst 8, addrspace 1)
     ; GFX10: $vgpr0_vgpr1 = COPY [[GLOBAL_ATOMIC_ADD_X2_RTN]]
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s64) = COPY $vgpr2_vgpr3
@@ -783,7 +783,7 @@ body:             |
     ; GFX7: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
     ; GFX7: %10:vgpr_32, dead %12:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %10, %subreg.sub1
-    ; GFX7: [[FLAT_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_ADD_X2_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 8, addrspace 1)
+    ; GFX7: [[FLAT_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_ADD_X2_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 8, addrspace 1)
     ; GFX6-LABEL: name: global_atomicrmw_add_s64_offset4095_nortn
     ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX6: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
@@ -798,7 +798,7 @@ body:             |
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
-    ; GFX9: [[GLOBAL_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = GLOBAL_ATOMIC_ADD_X2_RTN [[COPY]], [[COPY1]], 4095, 1, 0, implicit $exec :: (load store seq_cst 8, addrspace 1)
+    ; GFX9: [[GLOBAL_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = GLOBAL_ATOMIC_ADD_X2_RTN [[COPY]], [[COPY1]], 4095, 1, 0, 0, implicit $exec :: (load store seq_cst 8, addrspace 1)
     ; GFX10-LABEL: name: global_atomicrmw_add_s64_offset4095_nortn
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
@@ -813,7 +813,7 @@ body:             |
     ; GFX10: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
     ; GFX10: %10:vgpr_32, dead %12:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX10: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %10, %subreg.sub1
-    ; GFX10: [[GLOBAL_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = GLOBAL_ATOMIC_ADD_X2_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, implicit $exec :: (load store seq_cst 8, addrspace 1)
+    ; GFX10: [[GLOBAL_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = GLOBAL_ATOMIC_ADD_X2_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, 0, implicit $exec :: (load store seq_cst 8, addrspace 1)
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s64) = COPY $vgpr2_vgpr3
     %2:vgpr(s64) = G_CONSTANT i64 4095

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-copy.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-copy.mir
index d025784307df..32a5fe23ebbb 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-copy.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-copy.mir
@@ -17,12 +17,12 @@ body: |
     ; WAVE64: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr2_sgpr3
     ; WAVE64: [[COPY1:%[0-9]+]]:vreg_64 = COPY [[COPY]]
     ; WAVE64: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-    ; WAVE64: FLAT_STORE_DWORD [[COPY1]], [[DEF]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
+    ; WAVE64: FLAT_STORE_DWORD [[COPY1]], [[DEF]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
     ; WAVE32-LABEL: name: copy
     ; WAVE32: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr2_sgpr3
     ; WAVE32: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
     ; WAVE32: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; WAVE32: GLOBAL_STORE_DWORD_SADDR [[V_MOV_B32_e32_]], [[DEF]], [[COPY]], 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
+    ; WAVE32: GLOBAL_STORE_DWORD_SADDR [[V_MOV_B32_e32_]], [[DEF]], [[COPY]], 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
     %0:sgpr(p1) = COPY $sgpr2_sgpr3
     %1:vgpr(p1) = COPY %0
     %2:vgpr(s32) = G_IMPLICIT_DEF
@@ -46,7 +46,7 @@ body: |
     ; WAVE64: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 1, [[COPY3]], implicit-def $scc
     ; WAVE64: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_NE_U32_e64 0, [[S_AND_B32_]], implicit $exec
     ; WAVE64: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[COPY2]], 0, [[COPY1]], [[V_CMP_NE_U32_e64_]], implicit $exec
-    ; WAVE64: FLAT_STORE_DWORD [[COPY]], [[V_CNDMASK_B32_e64_]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
+    ; WAVE64: FLAT_STORE_DWORD [[COPY]], [[V_CNDMASK_B32_e64_]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
     ; WAVE32-LABEL: name: copy_vcc_bank_sgpr_bank
     ; WAVE32: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; WAVE32: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
@@ -55,7 +55,7 @@ body: |
     ; WAVE32: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 1, [[COPY3]], implicit-def $scc
     ; WAVE32: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_NE_U32_e64 0, [[S_AND_B32_]], implicit $exec
     ; WAVE32: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[COPY2]], 0, [[COPY1]], [[V_CMP_NE_U32_e64_]], implicit $exec
-    ; WAVE32: GLOBAL_STORE_DWORD [[COPY]], [[V_CNDMASK_B32_e64_]], 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
+    ; WAVE32: GLOBAL_STORE_DWORD [[COPY]], [[V_CNDMASK_B32_e64_]], 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s32) = COPY $vgpr2
     %2:vgpr(s32) = COPY $vgpr3
@@ -85,7 +85,7 @@ body: |
     ; WAVE64: [[S_AND_B32_1:%[0-9]+]]:sreg_32 = S_AND_B32 1, [[COPY3]], implicit-def $scc
     ; WAVE64: [[V_CMP_NE_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_NE_U32_e64 0, [[S_AND_B32_1]], implicit $exec
     ; WAVE64: [[V_CNDMASK_B32_e64_1:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[V_CNDMASK_B32_e64_]], 0, [[COPY1]], [[V_CMP_NE_U32_e64_1]], implicit $exec
-    ; WAVE64: FLAT_STORE_DWORD [[COPY]], [[V_CNDMASK_B32_e64_1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
+    ; WAVE64: FLAT_STORE_DWORD [[COPY]], [[V_CNDMASK_B32_e64_1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
     ; WAVE32-LABEL: name: copy_vcc_bank_sgpr_bank_2_uses
     ; WAVE32: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; WAVE32: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
@@ -96,7 +96,7 @@ body: |
     ; WAVE32: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 1, [[COPY3]], implicit-def $scc
     ; WAVE32: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_NE_U32_e64 0, [[S_AND_B32_]], implicit $exec
     ; WAVE32: [[V_CNDMASK_B32_e64_1:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[V_CNDMASK_B32_e64_]], 0, [[COPY1]], [[V_CMP_NE_U32_e64_]], implicit $exec
-    ; WAVE32: GLOBAL_STORE_DWORD [[COPY]], [[V_CNDMASK_B32_e64_1]], 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
+    ; WAVE32: GLOBAL_STORE_DWORD [[COPY]], [[V_CNDMASK_B32_e64_1]], 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s32) = COPY $vgpr2
     %2:vgpr(s32) = COPY $vgpr3
@@ -124,14 +124,14 @@ body: |
     ; WAVE64: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr3
     ; WAVE64: [[COPY3:%[0-9]+]]:sreg_64_xexec = COPY $scc
     ; WAVE64: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[COPY2]], 0, [[COPY1]], [[COPY3]], implicit $exec
-    ; WAVE64: FLAT_STORE_DWORD [[COPY]], [[V_CNDMASK_B32_e64_]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
+    ; WAVE64: FLAT_STORE_DWORD [[COPY]], [[V_CNDMASK_B32_e64_]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
     ; WAVE32-LABEL: name: copy_vcc_bank_scc_physreg
     ; WAVE32: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; WAVE32: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; WAVE32: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr3
     ; WAVE32: [[COPY3:%[0-9]+]]:sreg_32_xm0_xexec = COPY $scc
     ; WAVE32: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[COPY2]], 0, [[COPY1]], [[COPY3]], implicit $exec
-    ; WAVE32: GLOBAL_STORE_DWORD [[COPY]], [[V_CNDMASK_B32_e64_]], 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
+    ; WAVE32: GLOBAL_STORE_DWORD [[COPY]], [[V_CNDMASK_B32_e64_]], 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s32) = COPY $vgpr2
     %2:vgpr(s32) = COPY $vgpr3

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fmaxnum-ieee.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fmaxnum-ieee.mir
index eb15c14fac8b..ce4a2003551d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fmaxnum-ieee.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fmaxnum-ieee.mir
@@ -24,9 +24,9 @@ body: |
     ; GFX7: %7:vgpr_32 = nofpexcept V_MAX_F32_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
     ; GFX7: %8:vgpr_32 = nofpexcept V_MAX_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
     ; GFX7: %9:vgpr_32 = nofpexcept V_MAX_F32_e64 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec
-    ; GFX7: FLAT_STORE_DWORD [[COPY3]], %7, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
-    ; GFX7: FLAT_STORE_DWORD [[COPY3]], %8, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
-    ; GFX7: FLAT_STORE_DWORD [[COPY3]], %9, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
+    ; GFX7: FLAT_STORE_DWORD [[COPY3]], %7, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
+    ; GFX7: FLAT_STORE_DWORD [[COPY3]], %8, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
+    ; GFX7: FLAT_STORE_DWORD [[COPY3]], %9, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
     ; GFX7: %10:vreg_64 = nofpexcept V_MAX_F64_e64 0, [[COPY4]], 0, [[COPY5]], 0, 0, implicit $mode, implicit $exec
     ; GFX7: %11:vreg_64 = nofpexcept V_MAX_F64_e64 0, [[COPY5]], 0, [[COPY4]], 0, 0, implicit $mode, implicit $exec
     ; GFX7: %12:vreg_64 = nofpexcept V_MAX_F64_e64 0, [[COPY5]], 0, [[COPY6]], 0, 0, implicit $mode, implicit $exec
@@ -92,9 +92,9 @@ body: |
     ; GFX7: %7:vgpr_32 = nofpexcept V_MAX_F32_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
     ; GFX7: %8:vgpr_32 = nofpexcept V_MAX_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
     ; GFX7: %9:vgpr_32 = nofpexcept V_MAX_F32_e64 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec
-    ; GFX7: FLAT_STORE_DWORD [[COPY3]], %7, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
-    ; GFX7: FLAT_STORE_DWORD [[COPY3]], %8, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
-    ; GFX7: FLAT_STORE_DWORD [[COPY3]], %9, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
+    ; GFX7: FLAT_STORE_DWORD [[COPY3]], %7, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
+    ; GFX7: FLAT_STORE_DWORD [[COPY3]], %8, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
+    ; GFX7: FLAT_STORE_DWORD [[COPY3]], %9, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
     ; GFX7: %10:vreg_64 = nofpexcept V_MAX_F64_e64 0, [[COPY4]], 0, [[COPY5]], 0, 0, implicit $mode, implicit $exec
     ; GFX7: %11:vreg_64 = nofpexcept V_MAX_F64_e64 0, [[COPY5]], 0, [[COPY4]], 0, 0, implicit $mode, implicit $exec
     ; GFX7: %12:vreg_64 = nofpexcept V_MAX_F64_e64 0, [[COPY5]], 0, [[COPY6]], 0, 0, implicit $mode, implicit $exec

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fmaxnum.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fmaxnum.mir
index 443430137001..a41774bc6c39 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fmaxnum.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fmaxnum.mir
@@ -25,9 +25,9 @@ body: |
     ; GFX7: %7:vgpr_32 = nofpexcept V_MAX_F32_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
     ; GFX7: %8:vgpr_32 = nofpexcept V_MAX_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
     ; GFX7: %9:vgpr_32 = nofpexcept V_MAX_F32_e64 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec
-    ; GFX7: FLAT_STORE_DWORD [[COPY3]], %7, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
-    ; GFX7: FLAT_STORE_DWORD [[COPY3]], %8, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
-    ; GFX7: FLAT_STORE_DWORD [[COPY3]], %9, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
+    ; GFX7: FLAT_STORE_DWORD [[COPY3]], %7, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
+    ; GFX7: FLAT_STORE_DWORD [[COPY3]], %8, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
+    ; GFX7: FLAT_STORE_DWORD [[COPY3]], %9, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
     ; GFX7: %10:vreg_64 = nofpexcept V_MAX_F64_e64 0, [[COPY4]], 0, [[COPY5]], 0, 0, implicit $mode, implicit $exec
     ; GFX7: %11:vreg_64 = nofpexcept V_MAX_F64_e64 0, [[COPY5]], 0, [[COPY4]], 0, 0, implicit $mode, implicit $exec
     ; GFX7: %12:vreg_64 = nofpexcept V_MAX_F64_e64 0, [[COPY5]], 0, [[COPY6]], 0, 0, implicit $mode, implicit $exec
@@ -91,9 +91,9 @@ body: |
     ; GFX7: %7:vgpr_32 = nofpexcept V_MAX_F32_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
     ; GFX7: %8:vgpr_32 = nofpexcept V_MAX_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
     ; GFX7: %9:vgpr_32 = nofpexcept V_MAX_F32_e64 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec
-    ; GFX7: FLAT_STORE_DWORD [[COPY3]], %7, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
-    ; GFX7: FLAT_STORE_DWORD [[COPY3]], %8, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
-    ; GFX7: FLAT_STORE_DWORD [[COPY3]], %9, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
+    ; GFX7: FLAT_STORE_DWORD [[COPY3]], %7, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
+    ; GFX7: FLAT_STORE_DWORD [[COPY3]], %8, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
+    ; GFX7: FLAT_STORE_DWORD [[COPY3]], %9, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
     ; GFX7: %10:vreg_64 = nofpexcept V_MAX_F64_e64 0, [[COPY4]], 0, [[COPY5]], 0, 0, implicit $mode, implicit $exec
     ; GFX7: %11:vreg_64 = nofpexcept V_MAX_F64_e64 0, [[COPY5]], 0, [[COPY4]], 0, 0, implicit $mode, implicit $exec
     ; GFX7: %12:vreg_64 = nofpexcept V_MAX_F64_e64 0, [[COPY5]], 0, [[COPY6]], 0, 0, implicit $mode, implicit $exec

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fminnum-ieee.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fminnum-ieee.mir
index 4261b349bd13..8f65349d9056 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fminnum-ieee.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fminnum-ieee.mir
@@ -24,9 +24,9 @@ body: |
     ; GFX7: %7:vgpr_32 = nofpexcept V_MIN_F32_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
     ; GFX7: %8:vgpr_32 = nofpexcept V_MIN_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
     ; GFX7: %9:vgpr_32 = nofpexcept V_MIN_F32_e64 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec
-    ; GFX7: FLAT_STORE_DWORD [[COPY3]], %7, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
-    ; GFX7: FLAT_STORE_DWORD [[COPY3]], %8, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
-    ; GFX7: FLAT_STORE_DWORD [[COPY3]], %9, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
+    ; GFX7: FLAT_STORE_DWORD [[COPY3]], %7, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
+    ; GFX7: FLAT_STORE_DWORD [[COPY3]], %8, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
+    ; GFX7: FLAT_STORE_DWORD [[COPY3]], %9, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
     ; GFX7: %10:vreg_64 = nofpexcept V_MIN_F64_e64 0, [[COPY4]], 0, [[COPY5]], 0, 0, implicit $mode, implicit $exec
     ; GFX7: %11:vreg_64 = nofpexcept V_MIN_F64_e64 0, [[COPY5]], 0, [[COPY4]], 0, 0, implicit $mode, implicit $exec
     ; GFX7: %12:vreg_64 = nofpexcept V_MIN_F64_e64 0, [[COPY5]], 0, [[COPY6]], 0, 0, implicit $mode, implicit $exec
@@ -92,9 +92,9 @@ body: |
     ; GFX7: %7:vgpr_32 = nofpexcept V_MIN_F32_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
     ; GFX7: %8:vgpr_32 = nofpexcept V_MIN_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
     ; GFX7: %9:vgpr_32 = nofpexcept V_MIN_F32_e64 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec
-    ; GFX7: FLAT_STORE_DWORD [[COPY3]], %7, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
-    ; GFX7: FLAT_STORE_DWORD [[COPY3]], %8, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
-    ; GFX7: FLAT_STORE_DWORD [[COPY3]], %9, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
+    ; GFX7: FLAT_STORE_DWORD [[COPY3]], %7, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
+    ; GFX7: FLAT_STORE_DWORD [[COPY3]], %8, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
+    ; GFX7: FLAT_STORE_DWORD [[COPY3]], %9, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
     ; GFX7: %10:vreg_64 = nofpexcept V_MIN_F64_e64 0, [[COPY4]], 0, [[COPY5]], 0, 0, implicit $mode, implicit $exec
     ; GFX7: %11:vreg_64 = nofpexcept V_MIN_F64_e64 0, [[COPY5]], 0, [[COPY4]], 0, 0, implicit $mode, implicit $exec
     ; GFX7: %12:vreg_64 = nofpexcept V_MIN_F64_e64 0, [[COPY5]], 0, [[COPY6]], 0, 0, implicit $mode, implicit $exec

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fminnum.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fminnum.mir
index 682120d871ff..a1381052ecc3 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fminnum.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fminnum.mir
@@ -25,9 +25,9 @@ body: |
     ; GFX7: %7:vgpr_32 = nofpexcept V_MIN_F32_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
     ; GFX7: %8:vgpr_32 = nofpexcept V_MIN_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
     ; GFX7: %9:vgpr_32 = nofpexcept V_MIN_F32_e64 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec
-    ; GFX7: FLAT_STORE_DWORD [[COPY3]], %7, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
-    ; GFX7: FLAT_STORE_DWORD [[COPY3]], %8, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
-    ; GFX7: FLAT_STORE_DWORD [[COPY3]], %9, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
+    ; GFX7: FLAT_STORE_DWORD [[COPY3]], %7, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
+    ; GFX7: FLAT_STORE_DWORD [[COPY3]], %8, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
+    ; GFX7: FLAT_STORE_DWORD [[COPY3]], %9, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
     ; GFX7: %10:vreg_64 = nofpexcept V_MIN_F64_e64 0, [[COPY4]], 0, [[COPY5]], 0, 0, implicit $mode, implicit $exec
     ; GFX7: %11:vreg_64 = nofpexcept V_MIN_F64_e64 0, [[COPY5]], 0, [[COPY4]], 0, 0, implicit $mode, implicit $exec
     ; GFX7: %12:vreg_64 = nofpexcept V_MIN_F64_e64 0, [[COPY5]], 0, [[COPY6]], 0, 0, implicit $mode, implicit $exec
@@ -91,9 +91,9 @@ body: |
     ; GFX7: %7:vgpr_32 = nofpexcept V_MIN_F32_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
     ; GFX7: %8:vgpr_32 = nofpexcept V_MIN_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
     ; GFX7: %9:vgpr_32 = nofpexcept V_MIN_F32_e64 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec
-    ; GFX7: FLAT_STORE_DWORD [[COPY3]], %7, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
-    ; GFX7: FLAT_STORE_DWORD [[COPY3]], %8, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
-    ; GFX7: FLAT_STORE_DWORD [[COPY3]], %9, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
+    ; GFX7: FLAT_STORE_DWORD [[COPY3]], %7, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
+    ; GFX7: FLAT_STORE_DWORD [[COPY3]], %8, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
+    ; GFX7: FLAT_STORE_DWORD [[COPY3]], %9, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
     ; GFX7: %10:vreg_64 = nofpexcept V_MIN_F64_e64 0, [[COPY4]], 0, [[COPY5]], 0, 0, implicit $mode, implicit $exec
     ; GFX7: %11:vreg_64 = nofpexcept V_MIN_F64_e64 0, [[COPY5]], 0, [[COPY4]], 0, 0, implicit $mode, implicit $exec
     ; GFX7: %12:vreg_64 = nofpexcept V_MIN_F64_e64 0, [[COPY5]], 0, [[COPY6]], 0, 0, implicit $mode, implicit $exec

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fmul.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fmul.mir
index d520bb3faca5..9942493d12a5 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fmul.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fmul.mir
@@ -18,9 +18,9 @@ body: |
     ; GCN: %4:vgpr_32 = nofpexcept V_MUL_F32_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
     ; GCN: %5:vgpr_32 = nofpexcept V_MUL_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
     ; GCN: %6:vgpr_32 = nofpexcept V_MUL_F32_e64 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec
-    ; GCN: FLAT_STORE_DWORD [[COPY3]], %4, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
-    ; GCN: FLAT_STORE_DWORD [[COPY3]], %5, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
-    ; GCN: FLAT_STORE_DWORD [[COPY3]], %6, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
+    ; GCN: FLAT_STORE_DWORD [[COPY3]], %4, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
+    ; GCN: FLAT_STORE_DWORD [[COPY3]], %5, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
+    ; GCN: FLAT_STORE_DWORD [[COPY3]], %6, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
     %0:sgpr(s32) = COPY $sgpr0
     %1:vgpr(s32) = COPY $vgpr0
     %2:vgpr(s32) = COPY $vgpr1
@@ -133,16 +133,16 @@ body: |
     ; GCN: %13:vgpr_32 = nofpexcept V_MUL_F32_e64 0, [[COPY]], 3, [[COPY]], 0, 0, implicit $mode, implicit $exec
     ; GCN: %14:vgpr_32 = nofpexcept V_MUL_F32_e64 3, [[COPY]], 3, [[COPY]], 0, 0, implicit $mode, implicit $exec
     ; GCN: %15:vgpr_32 = nofpexcept V_MUL_F32_e64 3, [[COPY]], 1, [[COPY]], 0, 0, implicit $mode, implicit $exec
-    ; GCN: FLAT_STORE_DWORD [[COPY1]], %6, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
-    ; GCN: FLAT_STORE_DWORD [[COPY1]], %7, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
-    ; GCN: FLAT_STORE_DWORD [[COPY1]], %8, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
-    ; GCN: FLAT_STORE_DWORD [[COPY1]], %9, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
-    ; GCN: FLAT_STORE_DWORD [[COPY1]], %10, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
-    ; GCN: FLAT_STORE_DWORD [[COPY1]], %11, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
-    ; GCN: FLAT_STORE_DWORD [[COPY1]], %12, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
-    ; GCN: FLAT_STORE_DWORD [[COPY1]], %13, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
-    ; GCN: FLAT_STORE_DWORD [[COPY1]], %14, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
-    ; GCN: FLAT_STORE_DWORD [[COPY1]], %15, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
+    ; GCN: FLAT_STORE_DWORD [[COPY1]], %6, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
+    ; GCN: FLAT_STORE_DWORD [[COPY1]], %7, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
+    ; GCN: FLAT_STORE_DWORD [[COPY1]], %8, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
+    ; GCN: FLAT_STORE_DWORD [[COPY1]], %9, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
+    ; GCN: FLAT_STORE_DWORD [[COPY1]], %10, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
+    ; GCN: FLAT_STORE_DWORD [[COPY1]], %11, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
+    ; GCN: FLAT_STORE_DWORD [[COPY1]], %12, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
+    ; GCN: FLAT_STORE_DWORD [[COPY1]], %13, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
+    ; GCN: FLAT_STORE_DWORD [[COPY1]], %14, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
+    ; GCN: FLAT_STORE_DWORD [[COPY1]], %15, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(s32) = COPY $vgpr1
     %2:vgpr(p1) = COPY $vgpr2_vgpr3

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fptoui.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fptoui.mir
index a91c1cf331fe..74d1f9d96d73 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fptoui.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fptoui.mir
@@ -18,16 +18,16 @@ body: |
     ; GCN: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr3_vgpr4
     ; GCN: %3:vgpr_32 = nofpexcept V_CVT_U32_F32_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
     ; GCN: %4:vgpr_32 = nofpexcept V_CVT_U32_F32_e64 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
-    ; GCN: FLAT_STORE_DWORD [[COPY2]], %3, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
-    ; GCN: FLAT_STORE_DWORD [[COPY2]], %4, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
+    ; GCN: FLAT_STORE_DWORD [[COPY2]], %3, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
+    ; GCN: FLAT_STORE_DWORD [[COPY2]], %4, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
     ; VI-LABEL: name: fptoui
     ; VI: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
     ; VI: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; VI: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr3_vgpr4
     ; VI: %3:vgpr_32 = nofpexcept V_CVT_U32_F32_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
     ; VI: %4:vgpr_32 = nofpexcept V_CVT_U32_F32_e64 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
-    ; VI: FLAT_STORE_DWORD [[COPY2]], %3, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
-    ; VI: FLAT_STORE_DWORD [[COPY2]], %4, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
+    ; VI: FLAT_STORE_DWORD [[COPY2]], %3, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
+    ; VI: FLAT_STORE_DWORD [[COPY2]], %4, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
     %0:sgpr(s32) = COPY $sgpr0
 
     %1:vgpr(s32) = COPY $vgpr0

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fract.f64.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fract.f64.mir
index 164ae8e086de..adba00b6cae3 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fract.f64.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fract.f64.mir
@@ -26,7 +26,7 @@ body: |
     ; CHECK: %12:vreg_64 = nofpexcept V_ADD_F64_e64 0, [[COPY3]], 1, [[COPY4]], 0, 0, implicit $mode, implicit $exec
     ; CHECK: %15:vreg_64 = nofpexcept V_FRACT_F64_e64 0, %12, 0, 0, implicit $mode, implicit $exec
     ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; CHECK: GLOBAL_STORE_DWORDX2_SADDR [[V_MOV_B32_e32_]], %15, [[COPY1]], 0, 0, 0, 0, implicit $exec :: (store 8, addrspace 1)
+    ; CHECK: GLOBAL_STORE_DWORDX2_SADDR [[V_MOV_B32_e32_]], %15, [[COPY1]], 0, 0, 0, 0, 0, implicit $exec :: (store 8, addrspace 1)
     ; CHECK: S_ENDPGM 0
     %2:sgpr(p4) = COPY $sgpr0_sgpr1
     %7:sgpr(s64) = G_CONSTANT i64 36
@@ -75,7 +75,7 @@ body: |
     ; CHECK: %13:vreg_64 = nofpexcept V_ADD_F64_e64 0, [[COPY3]], 3, [[COPY4]], 0, 0, implicit $mode, implicit $exec
     ; CHECK: %16:vreg_64 = nofpexcept V_FRACT_F64_e64 0, %13, 0, 0, implicit $mode, implicit $exec
     ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; CHECK: GLOBAL_STORE_DWORDX2_SADDR [[V_MOV_B32_e32_]], %16, [[COPY1]], 0, 0, 0, 0, implicit $exec :: (store 8, addrspace 1)
+    ; CHECK: GLOBAL_STORE_DWORDX2_SADDR [[V_MOV_B32_e32_]], %16, [[COPY1]], 0, 0, 0, 0, 0, implicit $exec :: (store 8, addrspace 1)
     ; CHECK: S_ENDPGM 0
     %2:sgpr(p4) = COPY $sgpr0_sgpr1
     %7:sgpr(s64) = G_CONSTANT i64 36

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-implicit-def.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-implicit-def.mir
index 4699652fa4bb..623b5c341886 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-implicit-def.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-implicit-def.mir
@@ -99,7 +99,7 @@ body: |
     ; GCN-LABEL: name: implicit_def_p1_vgpr
     ; GCN: [[DEF:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
     ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4, implicit $exec
-    ; GCN: FLAT_STORE_DWORD [[DEF]], [[V_MOV_B32_e32_]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
+    ; GCN: FLAT_STORE_DWORD [[DEF]], [[V_MOV_B32_e32_]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
     %0:vgpr(p1) = G_IMPLICIT_DEF
     %1:vgpr(s32) = G_CONSTANT i32 4
     G_STORE %1, %0 :: (store 4, addrspace 1)
@@ -117,7 +117,7 @@ body: |
     ; GCN: [[DEF:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
     ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4, implicit $exec
     ; GCN: $m0 = S_MOV_B32 -1
-    ; GCN: FLAT_STORE_DWORD [[DEF]], [[V_MOV_B32_e32_]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
+    ; GCN: FLAT_STORE_DWORD [[DEF]], [[V_MOV_B32_e32_]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
     %0:vgpr(p3) = G_IMPLICIT_DEF
     %1:vgpr(s32) = G_CONSTANT i32 4
     G_STORE %1, %0 :: (store 4, addrspace 1)
@@ -134,7 +134,7 @@ body: |
     ; GCN-LABEL: name: implicit_def_p4_vgpr
     ; GCN: [[DEF:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
     ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4, implicit $exec
-    ; GCN: FLAT_STORE_DWORD [[DEF]], [[V_MOV_B32_e32_]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
+    ; GCN: FLAT_STORE_DWORD [[DEF]], [[V_MOV_B32_e32_]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
     %0:vgpr(p4) = G_IMPLICIT_DEF
     %1:vgpr(s32) = G_CONSTANT i32 4
     G_STORE %1, %0 :: (store 4, addrspace 1)

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-atomic-flat.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-atomic-flat.mir
index d31d8ac361ec..d156a99766b9 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-atomic-flat.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-atomic-flat.mir
@@ -16,12 +16,12 @@ body: |
     ; GFX7-LABEL: name: load_atomic_flat_s32_seq_cst
     ; GFX7: liveins: $vgpr0_vgpr1
     ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX7: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst 4)
+    ; GFX7: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst 4)
     ; GFX7: $vgpr0 = COPY [[FLAT_LOAD_DWORD]]
     ; GFX9-LABEL: name: load_atomic_flat_s32_seq_cst
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX9: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst 4)
+    ; GFX9: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst 4)
     ; GFX9: $vgpr0 = COPY [[FLAT_LOAD_DWORD]]
     %0:vgpr(p0) = COPY $vgpr0_vgpr1
     %1:vgpr(s32) = G_LOAD %0 :: (load seq_cst 4, align 4, addrspace 0)
@@ -97,12 +97,12 @@ body: |
     ; GFX7-LABEL: name: load_atomic_flat_s64_seq_cst
     ; GFX7: liveins: $vgpr0_vgpr1
     ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX7: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst 8)
+    ; GFX7: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst 8)
     ; GFX7: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]]
     ; GFX9-LABEL: name: load_atomic_flat_s64_seq_cst
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX9: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst 8)
+    ; GFX9: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst 8)
     ; GFX9: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]]
     %0:vgpr(p0) = COPY $vgpr0_vgpr1
     %1:vgpr(s64) = G_LOAD %0 :: (load seq_cst 8, align 8, addrspace 0)
@@ -242,7 +242,7 @@ body: |
     ; GFX7: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX7: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1
-    ; GFX7: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst 4)
+    ; GFX7: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst 4)
     ; GFX7: $vgpr0 = COPY [[FLAT_LOAD_DWORD]]
     ; GFX9-LABEL: name: load_atomic_flat_s32_seq_cst_gep_m2048
     ; GFX9: liveins: $vgpr0_vgpr1
@@ -257,7 +257,7 @@ body: |
     ; GFX9: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX9: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX9: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1
-    ; GFX9: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst 4)
+    ; GFX9: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst 4)
     ; GFX9: $vgpr0 = COPY [[FLAT_LOAD_DWORD]]
     %0:vgpr(p0) = COPY $vgpr0_vgpr1
     %1:vgpr(s64) = G_CONSTANT i64 -2048
@@ -291,12 +291,12 @@ body: |
     ; GFX7: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX7: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1
-    ; GFX7: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst 4)
+    ; GFX7: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst 4)
     ; GFX7: $vgpr0 = COPY [[FLAT_LOAD_DWORD]]
     ; GFX9-LABEL: name: load_atomic_flat_s32_seq_cst_gep_4095
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX9: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 4095, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst 4)
+    ; GFX9: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 4095, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst 4)
     ; GFX9: $vgpr0 = COPY [[FLAT_LOAD_DWORD]]
     %0:vgpr(p0) = COPY $vgpr0_vgpr1
     %1:vgpr(s64) = G_CONSTANT i64 4095

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-atomic-global.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-atomic-global.mir
index f1d6947b9a6a..f6ab8088ee65 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-atomic-global.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-atomic-global.mir
@@ -23,7 +23,7 @@ body: |
     ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
     ; GFX6: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
     ; GFX6: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
-    ; GFX6: [[BUFFER_LOAD_DWORD_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load seq_cst 4, addrspace 1)
+    ; GFX6: [[BUFFER_LOAD_DWORD_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load seq_cst 4, addrspace 1)
     ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_ADDR64_]]
     ; GFX7-LABEL: name: load_atomic_global_s32_seq_cst
     ; GFX7: liveins: $vgpr0_vgpr1
@@ -33,17 +33,17 @@ body: |
     ; GFX7: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
     ; GFX7: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
     ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
-    ; GFX7: [[BUFFER_LOAD_DWORD_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load seq_cst 4, addrspace 1)
+    ; GFX7: [[BUFFER_LOAD_DWORD_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load seq_cst 4, addrspace 1)
     ; GFX7: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_ADDR64_]]
     ; GFX7-FLAT-LABEL: name: load_atomic_global_s32_seq_cst
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1
     ; GFX7-FLAT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX7-FLAT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst 4, addrspace 1)
+    ; GFX7-FLAT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst 4, addrspace 1)
     ; GFX7-FLAT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]]
     ; GFX9-LABEL: name: load_atomic_global_s32_seq_cst
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX9: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 0, 0, 0, 0, implicit $exec :: (load seq_cst 4, addrspace 1)
+    ; GFX9: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 0, 0, 0, 0, 0, implicit $exec :: (load seq_cst 4, addrspace 1)
     ; GFX9: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]]
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s32) = G_LOAD %0 :: (load seq_cst 4, align 4, addrspace 1)
@@ -144,7 +144,7 @@ body: |
     ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
     ; GFX6: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
     ; GFX6: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
-    ; GFX6: [[BUFFER_LOAD_DWORDX2_ADDR64_:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load seq_cst 8, addrspace 1)
+    ; GFX6: [[BUFFER_LOAD_DWORDX2_ADDR64_:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load seq_cst 8, addrspace 1)
     ; GFX6: $vgpr0_vgpr1 = COPY [[BUFFER_LOAD_DWORDX2_ADDR64_]]
     ; GFX7-LABEL: name: load_atomic_global_s64_seq_cst
     ; GFX7: liveins: $vgpr0_vgpr1
@@ -154,17 +154,17 @@ body: |
     ; GFX7: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
     ; GFX7: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
     ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
-    ; GFX7: [[BUFFER_LOAD_DWORDX2_ADDR64_:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load seq_cst 8, addrspace 1)
+    ; GFX7: [[BUFFER_LOAD_DWORDX2_ADDR64_:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load seq_cst 8, addrspace 1)
     ; GFX7: $vgpr0_vgpr1 = COPY [[BUFFER_LOAD_DWORDX2_ADDR64_]]
     ; GFX7-FLAT-LABEL: name: load_atomic_global_s64_seq_cst
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1
     ; GFX7-FLAT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX7-FLAT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst 8, addrspace 1)
+    ; GFX7-FLAT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst 8, addrspace 1)
     ; GFX7-FLAT: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]]
     ; GFX9-LABEL: name: load_atomic_global_s64_seq_cst
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX9: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, implicit $exec :: (load seq_cst 8, addrspace 1)
+    ; GFX9: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, 0, implicit $exec :: (load seq_cst 8, addrspace 1)
     ; GFX9: $vgpr0_vgpr1 = COPY [[GLOBAL_LOAD_DWORDX2_]]
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s64) = G_LOAD %0 :: (load seq_cst 8, align 8, addrspace 1)
@@ -349,7 +349,7 @@ body: |
     ; GFX6: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
     ; GFX6: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
     ; GFX6: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE2]], %subreg.sub2_sub3
-    ; GFX6: [[BUFFER_LOAD_DWORD_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_ADDR64 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load seq_cst 4, addrspace 1)
+    ; GFX6: [[BUFFER_LOAD_DWORD_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_ADDR64 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load seq_cst 4, addrspace 1)
     ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_ADDR64_]]
     ; GFX7-LABEL: name: load_atomic_global_s32_seq_cst_gep_m2048
     ; GFX7: liveins: $vgpr0_vgpr1
@@ -369,7 +369,7 @@ body: |
     ; GFX7: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
     ; GFX7: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
     ; GFX7: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE2]], %subreg.sub2_sub3
-    ; GFX7: [[BUFFER_LOAD_DWORD_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_ADDR64 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load seq_cst 4, addrspace 1)
+    ; GFX7: [[BUFFER_LOAD_DWORD_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_ADDR64 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load seq_cst 4, addrspace 1)
     ; GFX7: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_ADDR64_]]
     ; GFX7-FLAT-LABEL: name: load_atomic_global_s32_seq_cst_gep_m2048
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1
@@ -384,12 +384,12 @@ body: |
     ; GFX7-FLAT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX7-FLAT: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX7-FLAT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1
-    ; GFX7-FLAT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst 4, addrspace 1)
+    ; GFX7-FLAT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst 4, addrspace 1)
     ; GFX7-FLAT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]]
     ; GFX9-LABEL: name: load_atomic_global_s32_seq_cst_gep_m2048
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX9: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], -2048, 0, 0, 0, implicit $exec :: (load seq_cst 4, addrspace 1)
+    ; GFX9: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], -2048, 0, 0, 0, 0, implicit $exec :: (load seq_cst 4, addrspace 1)
     ; GFX9: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]]
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s64) = G_CONSTANT i64 -2048
@@ -418,7 +418,7 @@ body: |
     ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
     ; GFX6: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
     ; GFX6: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
-    ; GFX6: [[BUFFER_LOAD_DWORD_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 4095, 0, 0, 0, 0, 0, implicit $exec :: (load seq_cst 4, addrspace 1)
+    ; GFX6: [[BUFFER_LOAD_DWORD_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 4095, 0, 0, 0, 0, 0, 0, implicit $exec :: (load seq_cst 4, addrspace 1)
     ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_ADDR64_]]
     ; GFX7-LABEL: name: load_atomic_global_s32_seq_cst_gep_4095
     ; GFX7: liveins: $vgpr0_vgpr1
@@ -428,7 +428,7 @@ body: |
     ; GFX7: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
     ; GFX7: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
     ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
-    ; GFX7: [[BUFFER_LOAD_DWORD_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 4095, 0, 0, 0, 0, 0, implicit $exec :: (load seq_cst 4, addrspace 1)
+    ; GFX7: [[BUFFER_LOAD_DWORD_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 4095, 0, 0, 0, 0, 0, 0, implicit $exec :: (load seq_cst 4, addrspace 1)
     ; GFX7: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_ADDR64_]]
     ; GFX7-FLAT-LABEL: name: load_atomic_global_s32_seq_cst_gep_4095
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1
@@ -443,12 +443,12 @@ body: |
     ; GFX7-FLAT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX7-FLAT: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX7-FLAT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1
-    ; GFX7-FLAT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst 4, addrspace 1)
+    ; GFX7-FLAT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst 4, addrspace 1)
     ; GFX7-FLAT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]]
     ; GFX9-LABEL: name: load_atomic_global_s32_seq_cst_gep_4095
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX9: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 4095, 0, 0, 0, implicit $exec :: (load seq_cst 4, addrspace 1)
+    ; GFX9: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 4095, 0, 0, 0, 0, implicit $exec :: (load seq_cst 4, addrspace 1)
     ; GFX9: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]]
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s64) = G_CONSTANT i64 4095
@@ -487,7 +487,7 @@ body: |
     ; GFX6: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
     ; GFX6: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
     ; GFX6: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE2]], %subreg.sub2_sub3
-    ; GFX6: [[BUFFER_LOAD_DWORDX2_ADDR64_:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_ADDR64 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load seq_cst 8, addrspace 1)
+    ; GFX6: [[BUFFER_LOAD_DWORDX2_ADDR64_:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_ADDR64 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load seq_cst 8, addrspace 1)
     ; GFX6: $vgpr0_vgpr1 = COPY [[BUFFER_LOAD_DWORDX2_ADDR64_]]
     ; GFX7-LABEL: name: load_atomic_global_s64_seq_cst_gep_m2048
     ; GFX7: liveins: $vgpr0_vgpr1
@@ -507,7 +507,7 @@ body: |
     ; GFX7: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
     ; GFX7: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
     ; GFX7: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE2]], %subreg.sub2_sub3
-    ; GFX7: [[BUFFER_LOAD_DWORDX2_ADDR64_:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_ADDR64 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load seq_cst 8, addrspace 1)
+    ; GFX7: [[BUFFER_LOAD_DWORDX2_ADDR64_:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_ADDR64 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load seq_cst 8, addrspace 1)
     ; GFX7: $vgpr0_vgpr1 = COPY [[BUFFER_LOAD_DWORDX2_ADDR64_]]
     ; GFX7-FLAT-LABEL: name: load_atomic_global_s64_seq_cst_gep_m2048
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1
@@ -522,12 +522,12 @@ body: |
     ; GFX7-FLAT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX7-FLAT: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX7-FLAT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1
-    ; GFX7-FLAT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst 8, addrspace 1)
+    ; GFX7-FLAT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst 8, addrspace 1)
     ; GFX7-FLAT: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]]
     ; GFX9-LABEL: name: load_atomic_global_s64_seq_cst_gep_m2048
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX9: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[COPY]], -2048, 0, 0, 0, implicit $exec :: (load seq_cst 8, addrspace 1)
+    ; GFX9: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[COPY]], -2048, 0, 0, 0, 0, implicit $exec :: (load seq_cst 8, addrspace 1)
     ; GFX9: $vgpr0_vgpr1 = COPY [[GLOBAL_LOAD_DWORDX2_]]
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s64) = G_CONSTANT i64 -2048

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-flat.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-flat.mir
index 5e2c0066ec06..0d23dfeda9ab 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-flat.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-flat.mir
@@ -19,22 +19,22 @@ body: |
     ; GFX7-LABEL: name: load_flat_s32_from_4
     ; GFX7: liveins: $vgpr0_vgpr1
     ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX7: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4)
+    ; GFX7: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4)
     ; GFX7: $vgpr0 = COPY [[FLAT_LOAD_DWORD]]
     ; GFX8-LABEL: name: load_flat_s32_from_4
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX8: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4)
+    ; GFX8: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4)
     ; GFX8: $vgpr0 = COPY [[FLAT_LOAD_DWORD]]
     ; GFX9-LABEL: name: load_flat_s32_from_4
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX9: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4)
+    ; GFX9: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4)
     ; GFX9: $vgpr0 = COPY [[FLAT_LOAD_DWORD]]
     ; GFX10-LABEL: name: load_flat_s32_from_4
     ; GFX10: liveins: $vgpr0_vgpr1
     ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX10: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4)
+    ; GFX10: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4)
     ; GFX10: $vgpr0 = COPY [[FLAT_LOAD_DWORD]]
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s32) = G_LOAD %0 :: (load 4, align 4, addrspace 0)
@@ -56,22 +56,22 @@ body: |
     ; GFX7-LABEL: name: load_flat_s32_from_2
     ; GFX7: liveins: $vgpr0_vgpr1
     ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX7: [[FLAT_LOAD_USHORT:%[0-9]+]]:vgpr_32 = FLAT_LOAD_USHORT [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 2)
+    ; GFX7: [[FLAT_LOAD_USHORT:%[0-9]+]]:vgpr_32 = FLAT_LOAD_USHORT [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 2)
     ; GFX7: $vgpr0 = COPY [[FLAT_LOAD_USHORT]]
     ; GFX8-LABEL: name: load_flat_s32_from_2
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX8: [[FLAT_LOAD_USHORT:%[0-9]+]]:vgpr_32 = FLAT_LOAD_USHORT [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 2)
+    ; GFX8: [[FLAT_LOAD_USHORT:%[0-9]+]]:vgpr_32 = FLAT_LOAD_USHORT [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 2)
     ; GFX8: $vgpr0 = COPY [[FLAT_LOAD_USHORT]]
     ; GFX9-LABEL: name: load_flat_s32_from_2
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX9: [[FLAT_LOAD_USHORT:%[0-9]+]]:vgpr_32 = FLAT_LOAD_USHORT [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 2)
+    ; GFX9: [[FLAT_LOAD_USHORT:%[0-9]+]]:vgpr_32 = FLAT_LOAD_USHORT [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 2)
     ; GFX9: $vgpr0 = COPY [[FLAT_LOAD_USHORT]]
     ; GFX10-LABEL: name: load_flat_s32_from_2
     ; GFX10: liveins: $vgpr0_vgpr1
     ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX10: [[FLAT_LOAD_USHORT:%[0-9]+]]:vgpr_32 = FLAT_LOAD_USHORT [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 2)
+    ; GFX10: [[FLAT_LOAD_USHORT:%[0-9]+]]:vgpr_32 = FLAT_LOAD_USHORT [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 2)
     ; GFX10: $vgpr0 = COPY [[FLAT_LOAD_USHORT]]
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s32) = G_LOAD %0 :: (load 2, align 2, addrspace 0)
@@ -93,22 +93,22 @@ body: |
     ; GFX7-LABEL: name: load_flat_s32_from_1
     ; GFX7: liveins: $vgpr0_vgpr1
     ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX7: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1)
+    ; GFX7: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1)
     ; GFX7: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
     ; GFX8-LABEL: name: load_flat_s32_from_1
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX8: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1)
+    ; GFX8: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1)
     ; GFX8: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
     ; GFX9-LABEL: name: load_flat_s32_from_1
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX9: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1)
+    ; GFX9: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1)
     ; GFX9: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
     ; GFX10-LABEL: name: load_flat_s32_from_1
     ; GFX10: liveins: $vgpr0_vgpr1
     ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX10: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1)
+    ; GFX10: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1)
     ; GFX10: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s32) = G_LOAD %0 :: (load 1, align 1, addrspace 0)
@@ -129,19 +129,19 @@ body: |
 
     ; GFX7-LABEL: name: load_flat_v2s32
     ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX7: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 8)
+    ; GFX7: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 8)
     ; GFX7: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]]
     ; GFX8-LABEL: name: load_flat_v2s32
     ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX8: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 8)
+    ; GFX8: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 8)
     ; GFX8: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]]
     ; GFX9-LABEL: name: load_flat_v2s32
     ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX9: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 8)
+    ; GFX9: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 8)
     ; GFX9: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]]
     ; GFX10-LABEL: name: load_flat_v2s32
     ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX10: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 8)
+    ; GFX10: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 8)
     ; GFX10: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]]
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(<2 x s32>) = G_LOAD %0 :: (load 8, align 8, addrspace 0)
@@ -163,22 +163,22 @@ body: |
     ; GFX7-LABEL: name: load_flat_v3s32
     ; GFX7: liveins: $vgpr0_vgpr1
     ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX7: [[FLAT_LOAD_DWORDX3_:%[0-9]+]]:vreg_96 = FLAT_LOAD_DWORDX3 [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 12, align 4)
+    ; GFX7: [[FLAT_LOAD_DWORDX3_:%[0-9]+]]:vreg_96 = FLAT_LOAD_DWORDX3 [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 12, align 4)
     ; GFX7: $vgpr0_vgpr1_vgpr2 = COPY [[FLAT_LOAD_DWORDX3_]]
     ; GFX8-LABEL: name: load_flat_v3s32
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX8: [[FLAT_LOAD_DWORDX3_:%[0-9]+]]:vreg_96 = FLAT_LOAD_DWORDX3 [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 12, align 4)
+    ; GFX8: [[FLAT_LOAD_DWORDX3_:%[0-9]+]]:vreg_96 = FLAT_LOAD_DWORDX3 [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 12, align 4)
     ; GFX8: $vgpr0_vgpr1_vgpr2 = COPY [[FLAT_LOAD_DWORDX3_]]
     ; GFX9-LABEL: name: load_flat_v3s32
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX9: [[FLAT_LOAD_DWORDX3_:%[0-9]+]]:vreg_96 = FLAT_LOAD_DWORDX3 [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 12, align 4)
+    ; GFX9: [[FLAT_LOAD_DWORDX3_:%[0-9]+]]:vreg_96 = FLAT_LOAD_DWORDX3 [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 12, align 4)
     ; GFX9: $vgpr0_vgpr1_vgpr2 = COPY [[FLAT_LOAD_DWORDX3_]]
     ; GFX10-LABEL: name: load_flat_v3s32
     ; GFX10: liveins: $vgpr0_vgpr1
     ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX10: [[FLAT_LOAD_DWORDX3_:%[0-9]+]]:vreg_96 = FLAT_LOAD_DWORDX3 [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 12, align 4)
+    ; GFX10: [[FLAT_LOAD_DWORDX3_:%[0-9]+]]:vreg_96 = FLAT_LOAD_DWORDX3 [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 12, align 4)
     ; GFX10: $vgpr0_vgpr1_vgpr2 = COPY [[FLAT_LOAD_DWORDX3_]]
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(<3 x  s32>) = G_LOAD %0 :: (load 12, align 4, addrspace 0)
@@ -200,22 +200,22 @@ body: |
     ; GFX7-LABEL: name: load_flat_v4s32
     ; GFX7: liveins: $vgpr0_vgpr1
     ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX7: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = FLAT_LOAD_DWORDX4 [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 16, align 4)
+    ; GFX7: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = FLAT_LOAD_DWORDX4 [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 16, align 4)
     ; GFX7: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[FLAT_LOAD_DWORDX4_]]
     ; GFX8-LABEL: name: load_flat_v4s32
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX8: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = FLAT_LOAD_DWORDX4 [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 16, align 4)
+    ; GFX8: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = FLAT_LOAD_DWORDX4 [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 16, align 4)
     ; GFX8: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[FLAT_LOAD_DWORDX4_]]
     ; GFX9-LABEL: name: load_flat_v4s32
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX9: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = FLAT_LOAD_DWORDX4 [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 16, align 4)
+    ; GFX9: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = FLAT_LOAD_DWORDX4 [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 16, align 4)
     ; GFX9: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[FLAT_LOAD_DWORDX4_]]
     ; GFX10-LABEL: name: load_flat_v4s32
     ; GFX10: liveins: $vgpr0_vgpr1
     ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX10: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = FLAT_LOAD_DWORDX4 [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 16, align 4)
+    ; GFX10: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = FLAT_LOAD_DWORDX4 [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 16, align 4)
     ; GFX10: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[FLAT_LOAD_DWORDX4_]]
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(<4 x  s32>) = G_LOAD %0 :: (load 16, align 4, addrspace 0)
@@ -237,22 +237,22 @@ body: |
     ; GFX7-LABEL: name: load_flat_s64
     ; GFX7: liveins: $vgpr0_vgpr1
     ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX7: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 8)
+    ; GFX7: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 8)
     ; GFX7: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]]
     ; GFX8-LABEL: name: load_flat_s64
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX8: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 8)
+    ; GFX8: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 8)
     ; GFX8: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]]
     ; GFX9-LABEL: name: load_flat_s64
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX9: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 8)
+    ; GFX9: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 8)
     ; GFX9: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]]
     ; GFX10-LABEL: name: load_flat_s64
     ; GFX10: liveins: $vgpr0_vgpr1
     ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX10: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 8)
+    ; GFX10: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 8)
     ; GFX10: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]]
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s64) = G_LOAD %0 :: (load 8, align 8, addrspace 0)
@@ -274,22 +274,22 @@ body: |
     ; GFX7-LABEL: name: load_flat_v2s64
     ; GFX7: liveins: $vgpr0_vgpr1
     ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX7: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = FLAT_LOAD_DWORDX4 [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 16, align 4)
+    ; GFX7: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = FLAT_LOAD_DWORDX4 [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 16, align 4)
     ; GFX7: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[FLAT_LOAD_DWORDX4_]]
     ; GFX8-LABEL: name: load_flat_v2s64
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX8: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = FLAT_LOAD_DWORDX4 [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 16, align 4)
+    ; GFX8: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = FLAT_LOAD_DWORDX4 [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 16, align 4)
     ; GFX8: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[FLAT_LOAD_DWORDX4_]]
     ; GFX9-LABEL: name: load_flat_v2s64
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX9: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = FLAT_LOAD_DWORDX4 [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 16, align 4)
+    ; GFX9: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = FLAT_LOAD_DWORDX4 [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 16, align 4)
     ; GFX9: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[FLAT_LOAD_DWORDX4_]]
     ; GFX10-LABEL: name: load_flat_v2s64
     ; GFX10: liveins: $vgpr0_vgpr1
     ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX10: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = FLAT_LOAD_DWORDX4 [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 16, align 4)
+    ; GFX10: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = FLAT_LOAD_DWORDX4 [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 16, align 4)
     ; GFX10: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[FLAT_LOAD_DWORDX4_]]
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(<2 x s64>) = G_LOAD %0 :: (load 16, align 4, addrspace 0)
@@ -422,22 +422,22 @@ body: |
     ; GFX7-LABEL: name: load_flat_p3_from_4
     ; GFX7: liveins: $vgpr0_vgpr1
     ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX7: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4)
+    ; GFX7: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4)
     ; GFX7: $vgpr0 = COPY [[FLAT_LOAD_DWORD]]
     ; GFX8-LABEL: name: load_flat_p3_from_4
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX8: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4)
+    ; GFX8: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4)
     ; GFX8: $vgpr0 = COPY [[FLAT_LOAD_DWORD]]
     ; GFX9-LABEL: name: load_flat_p3_from_4
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX9: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4)
+    ; GFX9: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4)
     ; GFX9: $vgpr0 = COPY [[FLAT_LOAD_DWORD]]
     ; GFX10-LABEL: name: load_flat_p3_from_4
     ; GFX10: liveins: $vgpr0_vgpr1
     ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX10: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4)
+    ; GFX10: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4)
     ; GFX10: $vgpr0 = COPY [[FLAT_LOAD_DWORD]]
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(p3) = G_LOAD %0 :: (load 4, align 4, addrspace 0)
@@ -459,22 +459,22 @@ body: |
     ; GFX7-LABEL: name: load_flat_p1_from_8
     ; GFX7: liveins: $vgpr0_vgpr1
     ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX7: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 8)
+    ; GFX7: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 8)
     ; GFX7: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]]
     ; GFX8-LABEL: name: load_flat_p1_from_8
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX8: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 8)
+    ; GFX8: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 8)
     ; GFX8: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]]
     ; GFX9-LABEL: name: load_flat_p1_from_8
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX9: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 8)
+    ; GFX9: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 8)
     ; GFX9: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]]
     ; GFX10-LABEL: name: load_flat_p1_from_8
     ; GFX10: liveins: $vgpr0_vgpr1
     ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX10: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 8)
+    ; GFX10: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 8)
     ; GFX10: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]]
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(p1) = G_LOAD %0 :: (load 8, align 8, addrspace 0)
@@ -566,22 +566,22 @@ body: |
     ; GFX7-LABEL: name: load_flat_v2s16
     ; GFX7: liveins: $vgpr0_vgpr1
     ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX7: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4)
+    ; GFX7: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4)
     ; GFX7: $vgpr0 = COPY [[FLAT_LOAD_DWORD]]
     ; GFX8-LABEL: name: load_flat_v2s16
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX8: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4)
+    ; GFX8: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4)
     ; GFX8: $vgpr0 = COPY [[FLAT_LOAD_DWORD]]
     ; GFX9-LABEL: name: load_flat_v2s16
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX9: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4)
+    ; GFX9: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4)
     ; GFX9: $vgpr0 = COPY [[FLAT_LOAD_DWORD]]
     ; GFX10-LABEL: name: load_flat_v2s16
     ; GFX10: liveins: $vgpr0_vgpr1
     ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX10: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4)
+    ; GFX10: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4)
     ; GFX10: $vgpr0 = COPY [[FLAT_LOAD_DWORD]]
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(<2 x s16>) = G_LOAD %0 :: (load 4, align 4, addrspace 0)
@@ -603,22 +603,22 @@ body: |
     ; GFX7-LABEL: name: load_flat_v4s16
     ; GFX7: liveins: $vgpr0_vgpr1
     ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX7: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 8)
+    ; GFX7: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 8)
     ; GFX7: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]]
     ; GFX8-LABEL: name: load_flat_v4s16
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX8: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 8)
+    ; GFX8: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 8)
     ; GFX8: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]]
     ; GFX9-LABEL: name: load_flat_v4s16
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX9: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 8)
+    ; GFX9: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 8)
     ; GFX9: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]]
     ; GFX10-LABEL: name: load_flat_v4s16
     ; GFX10: liveins: $vgpr0_vgpr1
     ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX10: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 8)
+    ; GFX10: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 8)
     ; GFX10: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]]
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(<4 x s16>) = G_LOAD %0 :: (load 8, align 8, addrspace 0)
@@ -728,7 +728,7 @@ body: |
     ; GFX7: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX7: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1
-    ; GFX7: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1)
+    ; GFX7: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1)
     ; GFX7: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
     ; GFX8-LABEL: name: load_flat_s32_from_1_gep_2047
     ; GFX8: liveins: $vgpr0_vgpr1
@@ -743,12 +743,12 @@ body: |
     ; GFX8: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX8: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX8: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1
-    ; GFX8: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1)
+    ; GFX8: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1)
     ; GFX8: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
     ; GFX9-LABEL: name: load_flat_s32_from_1_gep_2047
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX9: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[COPY]], 2047, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1)
+    ; GFX9: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[COPY]], 2047, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1)
     ; GFX9: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
     ; GFX10-LABEL: name: load_flat_s32_from_1_gep_2047
     ; GFX10: liveins: $vgpr0_vgpr1
@@ -763,7 +763,7 @@ body: |
     ; GFX10: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX10: %9:vgpr_32, dead %11:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX10: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1
-    ; GFX10: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1)
+    ; GFX10: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1)
     ; GFX10: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s64) = G_CONSTANT i64 2047
@@ -797,7 +797,7 @@ body: |
     ; GFX7: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX7: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1
-    ; GFX7: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1)
+    ; GFX7: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1)
     ; GFX7: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
     ; GFX8-LABEL: name: load_flat_s32_from_1_gep_2048
     ; GFX8: liveins: $vgpr0_vgpr1
@@ -812,12 +812,12 @@ body: |
     ; GFX8: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX8: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX8: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1
-    ; GFX8: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1)
+    ; GFX8: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1)
     ; GFX8: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
     ; GFX9-LABEL: name: load_flat_s32_from_1_gep_2048
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX9: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[COPY]], 2048, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1)
+    ; GFX9: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[COPY]], 2048, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1)
     ; GFX9: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
     ; GFX10-LABEL: name: load_flat_s32_from_1_gep_2048
     ; GFX10: liveins: $vgpr0_vgpr1
@@ -832,7 +832,7 @@ body: |
     ; GFX10: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX10: %9:vgpr_32, dead %11:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX10: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1
-    ; GFX10: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1)
+    ; GFX10: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1)
     ; GFX10: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s64) = G_CONSTANT i64 2048
@@ -866,7 +866,7 @@ body: |
     ; GFX7: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX7: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1
-    ; GFX7: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1)
+    ; GFX7: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1)
     ; GFX7: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
     ; GFX8-LABEL: name: load_flat_s32_from_1_gep_m2047
     ; GFX8: liveins: $vgpr0_vgpr1
@@ -881,7 +881,7 @@ body: |
     ; GFX8: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX8: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX8: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1
-    ; GFX8: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1)
+    ; GFX8: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1)
     ; GFX8: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
     ; GFX9-LABEL: name: load_flat_s32_from_1_gep_m2047
     ; GFX9: liveins: $vgpr0_vgpr1
@@ -896,7 +896,7 @@ body: |
     ; GFX9: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX9: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX9: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1
-    ; GFX9: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1)
+    ; GFX9: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1)
     ; GFX9: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
     ; GFX10-LABEL: name: load_flat_s32_from_1_gep_m2047
     ; GFX10: liveins: $vgpr0_vgpr1
@@ -911,7 +911,7 @@ body: |
     ; GFX10: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX10: %9:vgpr_32, dead %11:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX10: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1
-    ; GFX10: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1)
+    ; GFX10: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1)
     ; GFX10: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s64) = G_CONSTANT i64 -2047
@@ -945,7 +945,7 @@ body: |
     ; GFX7: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX7: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1
-    ; GFX7: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1)
+    ; GFX7: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1)
     ; GFX7: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
     ; GFX8-LABEL: name: load_flat_s32_from_1_gep_m2048
     ; GFX8: liveins: $vgpr0_vgpr1
@@ -960,7 +960,7 @@ body: |
     ; GFX8: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX8: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX8: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1
-    ; GFX8: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1)
+    ; GFX8: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1)
     ; GFX8: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
     ; GFX9-LABEL: name: load_flat_s32_from_1_gep_m2048
     ; GFX9: liveins: $vgpr0_vgpr1
@@ -975,7 +975,7 @@ body: |
     ; GFX9: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX9: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX9: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1
-    ; GFX9: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1)
+    ; GFX9: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1)
     ; GFX9: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
     ; GFX10-LABEL: name: load_flat_s32_from_1_gep_m2048
     ; GFX10: liveins: $vgpr0_vgpr1
@@ -990,7 +990,7 @@ body: |
     ; GFX10: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX10: %9:vgpr_32, dead %11:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX10: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1
-    ; GFX10: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1)
+    ; GFX10: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1)
     ; GFX10: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s64) = G_CONSTANT i64 -2048
@@ -1024,7 +1024,7 @@ body: |
     ; GFX7: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX7: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1
-    ; GFX7: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1)
+    ; GFX7: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1)
     ; GFX7: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
     ; GFX8-LABEL: name: load_flat_s32_from_1_gep_4095
     ; GFX8: liveins: $vgpr0_vgpr1
@@ -1039,12 +1039,12 @@ body: |
     ; GFX8: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX8: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX8: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1
-    ; GFX8: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1)
+    ; GFX8: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1)
     ; GFX8: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
     ; GFX9-LABEL: name: load_flat_s32_from_1_gep_4095
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX9: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[COPY]], 4095, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1)
+    ; GFX9: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[COPY]], 4095, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1)
     ; GFX9: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
     ; GFX10-LABEL: name: load_flat_s32_from_1_gep_4095
     ; GFX10: liveins: $vgpr0_vgpr1
@@ -1059,7 +1059,7 @@ body: |
     ; GFX10: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX10: %9:vgpr_32, dead %11:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX10: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1
-    ; GFX10: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1)
+    ; GFX10: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1)
     ; GFX10: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s64) = G_CONSTANT i64 4095
@@ -1093,7 +1093,7 @@ body: |
     ; GFX7: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX7: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1
-    ; GFX7: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1)
+    ; GFX7: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1)
     ; GFX7: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
     ; GFX8-LABEL: name: load_flat_s32_from_1_gep_4096
     ; GFX8: liveins: $vgpr0_vgpr1
@@ -1108,7 +1108,7 @@ body: |
     ; GFX8: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX8: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX8: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1
-    ; GFX8: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1)
+    ; GFX8: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1)
     ; GFX8: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
     ; GFX9-LABEL: name: load_flat_s32_from_1_gep_4096
     ; GFX9: liveins: $vgpr0_vgpr1
@@ -1123,7 +1123,7 @@ body: |
     ; GFX9: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX9: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX9: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1
-    ; GFX9: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1)
+    ; GFX9: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1)
     ; GFX9: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
     ; GFX10-LABEL: name: load_flat_s32_from_1_gep_4096
     ; GFX10: liveins: $vgpr0_vgpr1
@@ -1138,7 +1138,7 @@ body: |
     ; GFX10: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX10: %9:vgpr_32, dead %11:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX10: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1
-    ; GFX10: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1)
+    ; GFX10: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1)
     ; GFX10: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s64) = G_CONSTANT i64 4096
@@ -1172,7 +1172,7 @@ body: |
     ; GFX7: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX7: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1
-    ; GFX7: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1)
+    ; GFX7: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1)
     ; GFX7: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
     ; GFX8-LABEL: name: load_flat_s32_from_1_gep_m4095
     ; GFX8: liveins: $vgpr0_vgpr1
@@ -1187,7 +1187,7 @@ body: |
     ; GFX8: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX8: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX8: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1
-    ; GFX8: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1)
+    ; GFX8: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1)
     ; GFX8: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
     ; GFX9-LABEL: name: load_flat_s32_from_1_gep_m4095
     ; GFX9: liveins: $vgpr0_vgpr1
@@ -1202,7 +1202,7 @@ body: |
     ; GFX9: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX9: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX9: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1
-    ; GFX9: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1)
+    ; GFX9: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1)
     ; GFX9: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
     ; GFX10-LABEL: name: load_flat_s32_from_1_gep_m4095
     ; GFX10: liveins: $vgpr0_vgpr1
@@ -1217,7 +1217,7 @@ body: |
     ; GFX10: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX10: %9:vgpr_32, dead %11:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX10: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1
-    ; GFX10: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1)
+    ; GFX10: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1)
     ; GFX10: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s64) = G_CONSTANT i64 -4095
@@ -1251,7 +1251,7 @@ body: |
     ; GFX7: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX7: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1
-    ; GFX7: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1)
+    ; GFX7: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1)
     ; GFX7: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
     ; GFX8-LABEL: name: load_flat_s32_from_1_gep_m4096
     ; GFX8: liveins: $vgpr0_vgpr1
@@ -1266,7 +1266,7 @@ body: |
     ; GFX8: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX8: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX8: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1
-    ; GFX8: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1)
+    ; GFX8: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1)
     ; GFX8: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
     ; GFX9-LABEL: name: load_flat_s32_from_1_gep_m4096
     ; GFX9: liveins: $vgpr0_vgpr1
@@ -1281,7 +1281,7 @@ body: |
     ; GFX9: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX9: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX9: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1
-    ; GFX9: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1)
+    ; GFX9: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1)
     ; GFX9: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
     ; GFX10-LABEL: name: load_flat_s32_from_1_gep_m4096
     ; GFX10: liveins: $vgpr0_vgpr1
@@ -1296,7 +1296,7 @@ body: |
     ; GFX10: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX10: %9:vgpr_32, dead %11:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX10: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1
-    ; GFX10: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1)
+    ; GFX10: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1)
     ; GFX10: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s64) = G_CONSTANT i64 -4096
@@ -1330,7 +1330,7 @@ body: |
     ; GFX7: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX7: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1
-    ; GFX7: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1)
+    ; GFX7: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1)
     ; GFX7: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
     ; GFX8-LABEL: name: load_flat_s32_from_1_gep_8191
     ; GFX8: liveins: $vgpr0_vgpr1
@@ -1345,7 +1345,7 @@ body: |
     ; GFX8: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX8: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX8: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1
-    ; GFX8: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1)
+    ; GFX8: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1)
     ; GFX8: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
     ; GFX9-LABEL: name: load_flat_s32_from_1_gep_8191
     ; GFX9: liveins: $vgpr0_vgpr1
@@ -1360,7 +1360,7 @@ body: |
     ; GFX9: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX9: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX9: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1
-    ; GFX9: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1)
+    ; GFX9: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1)
     ; GFX9: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
     ; GFX10-LABEL: name: load_flat_s32_from_1_gep_8191
     ; GFX10: liveins: $vgpr0_vgpr1
@@ -1375,7 +1375,7 @@ body: |
     ; GFX10: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX10: %9:vgpr_32, dead %11:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX10: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1
-    ; GFX10: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1)
+    ; GFX10: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1)
     ; GFX10: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s64) = G_CONSTANT i64 8191
@@ -1409,7 +1409,7 @@ body: |
     ; GFX7: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX7: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1
-    ; GFX7: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1)
+    ; GFX7: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1)
     ; GFX7: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
     ; GFX8-LABEL: name: load_flat_s32_from_1_gep_8192
     ; GFX8: liveins: $vgpr0_vgpr1
@@ -1424,7 +1424,7 @@ body: |
     ; GFX8: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX8: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX8: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1
-    ; GFX8: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1)
+    ; GFX8: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1)
     ; GFX8: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
     ; GFX9-LABEL: name: load_flat_s32_from_1_gep_8192
     ; GFX9: liveins: $vgpr0_vgpr1
@@ -1439,7 +1439,7 @@ body: |
     ; GFX9: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX9: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX9: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1
-    ; GFX9: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1)
+    ; GFX9: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1)
     ; GFX9: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
     ; GFX10-LABEL: name: load_flat_s32_from_1_gep_8192
     ; GFX10: liveins: $vgpr0_vgpr1
@@ -1454,7 +1454,7 @@ body: |
     ; GFX10: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX10: %9:vgpr_32, dead %11:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX10: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1
-    ; GFX10: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1)
+    ; GFX10: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1)
     ; GFX10: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s64) = G_CONSTANT i64 8192
@@ -1488,7 +1488,7 @@ body: |
     ; GFX7: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX7: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1
-    ; GFX7: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1)
+    ; GFX7: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1)
     ; GFX7: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
     ; GFX8-LABEL: name: load_flat_s32_from_1_gep_m8191
     ; GFX8: liveins: $vgpr0_vgpr1
@@ -1503,7 +1503,7 @@ body: |
     ; GFX8: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX8: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX8: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1
-    ; GFX8: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1)
+    ; GFX8: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1)
     ; GFX8: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
     ; GFX9-LABEL: name: load_flat_s32_from_1_gep_m8191
     ; GFX9: liveins: $vgpr0_vgpr1
@@ -1518,7 +1518,7 @@ body: |
     ; GFX9: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX9: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX9: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1
-    ; GFX9: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1)
+    ; GFX9: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1)
     ; GFX9: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
     ; GFX10-LABEL: name: load_flat_s32_from_1_gep_m8191
     ; GFX10: liveins: $vgpr0_vgpr1
@@ -1533,7 +1533,7 @@ body: |
     ; GFX10: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX10: %9:vgpr_32, dead %11:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX10: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1
-    ; GFX10: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1)
+    ; GFX10: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1)
     ; GFX10: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s64) = G_CONSTANT i64 -8191
@@ -1567,7 +1567,7 @@ body: |
     ; GFX7: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX7: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1
-    ; GFX7: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1)
+    ; GFX7: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1)
     ; GFX7: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
     ; GFX8-LABEL: name: load_flat_s32_from_1_gep_m8192
     ; GFX8: liveins: $vgpr0_vgpr1
@@ -1582,7 +1582,7 @@ body: |
     ; GFX8: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX8: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX8: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1
-    ; GFX8: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1)
+    ; GFX8: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1)
     ; GFX8: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
     ; GFX9-LABEL: name: load_flat_s32_from_1_gep_m8192
     ; GFX9: liveins: $vgpr0_vgpr1
@@ -1597,7 +1597,7 @@ body: |
     ; GFX9: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX9: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX9: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1
-    ; GFX9: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1)
+    ; GFX9: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1)
     ; GFX9: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
     ; GFX10-LABEL: name: load_flat_s32_from_1_gep_m8192
     ; GFX10: liveins: $vgpr0_vgpr1
@@ -1612,7 +1612,7 @@ body: |
     ; GFX10: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX10: %9:vgpr_32, dead %11:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX10: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1
-    ; GFX10: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1)
+    ; GFX10: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1)
     ; GFX10: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s64) = G_CONSTANT i64 -8192

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-global-saddr.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-global-saddr.mir
index ce3272e76d30..2c96d673d072 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-global-saddr.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-global-saddr.mir
@@ -16,13 +16,13 @@ body: |
     ; GFX9: liveins: $sgpr0_sgpr1
     ; GFX9: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
     ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GFX9: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[COPY]], [[V_MOV_B32_e32_]], 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1)
+    ; GFX9: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[COPY]], [[V_MOV_B32_e32_]], 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1)
     ; GFX9: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD_SADDR]]
     ; GFX10-LABEL: name: load_global_s32_from_sgpr
     ; GFX10: liveins: $sgpr0_sgpr1
     ; GFX10: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
     ; GFX10: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GFX10: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[COPY]], [[V_MOV_B32_e32_]], 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1)
+    ; GFX10: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[COPY]], [[V_MOV_B32_e32_]], 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1)
     ; GFX10: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD_SADDR]]
     %0:sgpr(p1) = COPY $sgpr0_sgpr1
     %1:vgpr(p1) = COPY %0
@@ -47,13 +47,13 @@ body: |
     ; GFX9: liveins: $sgpr0_sgpr1, $vgpr0
     ; GFX9: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
     ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX9: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1)
+    ; GFX9: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1)
     ; GFX9: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD_SADDR]]
     ; GFX10-LABEL: name: load_global_s32_from_sgpr_zext_vgpr
     ; GFX10: liveins: $sgpr0_sgpr1, $vgpr0
     ; GFX10: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
     ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX10: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1)
+    ; GFX10: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1)
     ; GFX10: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD_SADDR]]
     %0:sgpr(p1) = COPY $sgpr0_sgpr1
     %1:vgpr(s32) = COPY $vgpr0
@@ -81,13 +81,13 @@ body: |
     ; GFX9: liveins: $sgpr0_sgpr1, $vgpr0
     ; GFX9: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
     ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX9: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1)
+    ; GFX9: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1)
     ; GFX9: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD_SADDR]]
     ; GFX10-LABEL: name: load_global_s32_from_sgpr_merge_zext_vgpr
     ; GFX10: liveins: $sgpr0_sgpr1, $vgpr0
     ; GFX10: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
     ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX10: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1)
+    ; GFX10: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1)
     ; GFX10: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD_SADDR]]
     %0:sgpr(p1) = COPY $sgpr0_sgpr1
     %1:vgpr(s32) = COPY $vgpr0
@@ -125,7 +125,7 @@ body: |
     ; GFX9: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY3]], [[COPY4]], 0, implicit $exec
     ; GFX9: %12:vgpr_32, dead %14:sreg_64_xexec = V_ADDC_U32_e64 [[COPY5]], [[COPY6]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX9: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %12, %subreg.sub1
-    ; GFX9: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1)
+    ; GFX9: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1)
     ; GFX9: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]]
     ; GFX10-LABEL: name: load_global_s32_from_sgpr_merge_not_0_vgpr
     ; GFX10: liveins: $sgpr0_sgpr1, $vgpr0
@@ -141,7 +141,7 @@ body: |
     ; GFX10: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY3]], [[COPY4]], 0, implicit $exec
     ; GFX10: %12:vgpr_32, dead %14:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY5]], [[COPY6]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX10: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %12, %subreg.sub1
-    ; GFX10: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1)
+    ; GFX10: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1)
     ; GFX10: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]]
     %0:sgpr(p1) = COPY $sgpr0_sgpr1
     %1:vgpr(s32) = COPY $vgpr0
@@ -169,7 +169,7 @@ body: |
     ; GFX9: liveins: $sgpr0_sgpr1, $vgpr0
     ; GFX9: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
     ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX9: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[COPY]], [[COPY1]], 4095, 0, 0, 0, implicit $exec :: (load 4, addrspace 1)
+    ; GFX9: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[COPY]], [[COPY1]], 4095, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1)
     ; GFX9: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD_SADDR]]
     ; GFX10-LABEL: name: load_global_s32_from_sgpr_zext_vgpr_offset4095
     ; GFX10: liveins: $sgpr0_sgpr1, $vgpr0
@@ -195,7 +195,7 @@ body: |
     ; GFX10: [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY7]], [[COPY8]], 0, implicit $exec
     ; GFX10: %14:vgpr_32, dead %16:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY9]], [[COPY10]], killed [[V_ADD_CO_U32_e64_3]], 0, implicit $exec
     ; GFX10: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_2]], %subreg.sub0, %14, %subreg.sub1
-    ; GFX10: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[REG_SEQUENCE2]], 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1)
+    ; GFX10: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[REG_SEQUENCE2]], 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1)
     ; GFX10: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]]
     %0:sgpr(p1) = COPY $sgpr0_sgpr1
     %1:vgpr(s32) = COPY $vgpr0
@@ -225,7 +225,7 @@ body: |
     ; GFX9: liveins: $sgpr0_sgpr1, $vgpr0
     ; GFX9: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
     ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX9: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[COPY]], [[COPY1]], -4096, 0, 0, 0, implicit $exec :: (load 4, addrspace 1)
+    ; GFX9: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[COPY]], [[COPY1]], -4096, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1)
     ; GFX9: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD_SADDR]]
     ; GFX10-LABEL: name: load_global_s32_from_sgpr_zext_vgpr_offset_neg4096
     ; GFX10: liveins: $sgpr0_sgpr1, $vgpr0
@@ -251,7 +251,7 @@ body: |
     ; GFX10: [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY7]], [[COPY8]], 0, implicit $exec
     ; GFX10: %14:vgpr_32, dead %16:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY9]], [[COPY10]], killed [[V_ADD_CO_U32_e64_3]], 0, implicit $exec
     ; GFX10: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_2]], %subreg.sub0, %14, %subreg.sub1
-    ; GFX10: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[REG_SEQUENCE2]], 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1)
+    ; GFX10: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[REG_SEQUENCE2]], 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1)
     ; GFX10: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]]
     %0:sgpr(p1) = COPY $sgpr0_sgpr1
     %1:vgpr(s32) = COPY $vgpr0
@@ -279,13 +279,13 @@ body: |
     ; GFX9: liveins: $sgpr0_sgpr1
     ; GFX9: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
     ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec
-    ; GFX9: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[COPY]], [[V_MOV_B32_e32_]], 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1)
+    ; GFX9: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[COPY]], [[V_MOV_B32_e32_]], 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1)
     ; GFX9: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD_SADDR]]
     ; GFX10-LABEL: name: load_global_s32_from_sgpr_base_offset_4096
     ; GFX10: liveins: $sgpr0_sgpr1
     ; GFX10: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
     ; GFX10: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec
-    ; GFX10: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[COPY]], [[V_MOV_B32_e32_]], 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1)
+    ; GFX10: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[COPY]], [[V_MOV_B32_e32_]], 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1)
     ; GFX10: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD_SADDR]]
     %0:sgpr(p1) = COPY $sgpr0_sgpr1
     %1:sgpr(s64) = G_CONSTANT i64 4096
@@ -310,13 +310,13 @@ body: |
     ; GFX9: liveins: $sgpr0_sgpr1
     ; GFX9: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
     ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec
-    ; GFX9: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[COPY]], [[V_MOV_B32_e32_]], 1, 0, 0, 0, implicit $exec :: (load 4, addrspace 1)
+    ; GFX9: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[COPY]], [[V_MOV_B32_e32_]], 1, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1)
     ; GFX9: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD_SADDR]]
     ; GFX10-LABEL: name: load_global_s32_from_sgpr_base_offset_4097
     ; GFX10: liveins: $sgpr0_sgpr1
     ; GFX10: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
     ; GFX10: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec
-    ; GFX10: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[COPY]], [[V_MOV_B32_e32_]], 1, 0, 0, 0, implicit $exec :: (load 4, addrspace 1)
+    ; GFX10: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[COPY]], [[V_MOV_B32_e32_]], 1, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1)
     ; GFX10: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD_SADDR]]
     %0:sgpr(p1) = COPY $sgpr0_sgpr1
     %1:sgpr(s64) = G_CONSTANT i64 4097
@@ -351,7 +351,7 @@ body: |
     ; GFX9: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY3]], [[COPY4]], implicit-def $scc, implicit $scc
     ; GFX9: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
     ; GFX9: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]]
-    ; GFX9: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY5]], 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1)
+    ; GFX9: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY5]], 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1)
     ; GFX9: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]]
     ; GFX10-LABEL: name: load_global_s32_from_sgpr_base_offset_neg4097
     ; GFX10: liveins: $sgpr0_sgpr1
@@ -367,7 +367,7 @@ body: |
     ; GFX10: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY3]], [[COPY4]], implicit-def $scc, implicit $scc
     ; GFX10: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
     ; GFX10: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]]
-    ; GFX10: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY5]], 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1)
+    ; GFX10: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY5]], 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1)
     ; GFX10: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]]
     %0:sgpr(p1) = COPY $sgpr0_sgpr1
     %1:sgpr(s64) = G_CONSTANT i64 -4097
@@ -392,13 +392,13 @@ body: |
     ; GFX9: liveins: $sgpr0_sgpr1
     ; GFX9: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
     ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GFX9: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[COPY]], [[V_MOV_B32_e32_]], 2049, 0, 0, 0, implicit $exec :: (load 4, addrspace 1)
+    ; GFX9: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[COPY]], [[V_MOV_B32_e32_]], 2049, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1)
     ; GFX9: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD_SADDR]]
     ; GFX10-LABEL: name: load_global_s32_from_sgpr_base_offset_2049
     ; GFX10: liveins: $sgpr0_sgpr1
     ; GFX10: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
     ; GFX10: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2048, implicit $exec
-    ; GFX10: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[COPY]], [[V_MOV_B32_e32_]], 1, 0, 0, 0, implicit $exec :: (load 4, addrspace 1)
+    ; GFX10: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[COPY]], [[V_MOV_B32_e32_]], 1, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1)
     ; GFX10: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD_SADDR]]
     %0:sgpr(p1) = COPY $sgpr0_sgpr1
     %1:sgpr(s64) = G_CONSTANT i64 2049
@@ -423,7 +423,7 @@ body: |
     ; GFX9: liveins: $sgpr0_sgpr1
     ; GFX9: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
     ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GFX9: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[COPY]], [[V_MOV_B32_e32_]], -2049, 0, 0, 0, implicit $exec :: (load 4, addrspace 1)
+    ; GFX9: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[COPY]], [[V_MOV_B32_e32_]], -2049, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1)
     ; GFX9: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD_SADDR]]
     ; GFX10-LABEL: name: load_global_s32_from_sgpr_base_offset_neg2049
     ; GFX10: liveins: $sgpr0_sgpr1
@@ -439,7 +439,7 @@ body: |
     ; GFX10: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY3]], [[COPY4]], implicit-def $scc, implicit $scc
     ; GFX10: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
     ; GFX10: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]]
-    ; GFX10: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY5]], 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1)
+    ; GFX10: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY5]], 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1)
     ; GFX10: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]]
     %0:sgpr(p1) = COPY $sgpr0_sgpr1
     %1:sgpr(s64) = G_CONSTANT i64 -2049
@@ -463,13 +463,13 @@ body: |
     ; GFX9: liveins: $sgpr0_sgpr1
     ; GFX9: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
     ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294963200, implicit $exec
-    ; GFX9: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[COPY]], [[V_MOV_B32_e32_]], 4095, 0, 0, 0, implicit $exec :: (load 4, addrspace 1)
+    ; GFX9: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[COPY]], [[V_MOV_B32_e32_]], 4095, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1)
     ; GFX9: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD_SADDR]]
     ; GFX10-LABEL: name: load_global_s32_from_sgpr_base_offset_4294967295
     ; GFX10: liveins: $sgpr0_sgpr1
     ; GFX10: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
     ; GFX10: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294965248, implicit $exec
-    ; GFX10: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[COPY]], [[V_MOV_B32_e32_]], 2047, 0, 0, 0, implicit $exec :: (load 4, addrspace 1)
+    ; GFX10: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[COPY]], [[V_MOV_B32_e32_]], 2047, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1)
     ; GFX10: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD_SADDR]]
     %0:sgpr(p1) = COPY $sgpr0_sgpr1
     %1:sgpr(s64) = G_CONSTANT i64 4294967295
@@ -503,7 +503,7 @@ body: |
     ; GFX9: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY3]], [[COPY4]], implicit-def $scc, implicit $scc
     ; GFX9: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
     ; GFX9: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]]
-    ; GFX9: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY5]], 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1)
+    ; GFX9: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY5]], 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1)
     ; GFX9: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]]
     ; GFX10-LABEL: name: load_global_s32_from_sgpr_base_offset_4294967296
     ; GFX10: liveins: $sgpr0_sgpr1
@@ -519,7 +519,7 @@ body: |
     ; GFX10: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY3]], [[COPY4]], implicit-def $scc, implicit $scc
     ; GFX10: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
     ; GFX10: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]]
-    ; GFX10: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY5]], 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1)
+    ; GFX10: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY5]], 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1)
     ; GFX10: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]]
     %0:sgpr(p1) = COPY $sgpr0_sgpr1
     %1:sgpr(s64) = G_CONSTANT i64 4294967296
@@ -554,7 +554,7 @@ body: |
     ; GFX9: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY3]], [[COPY4]], implicit-def $scc, implicit $scc
     ; GFX9: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
     ; GFX9: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]]
-    ; GFX9: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY5]], 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1)
+    ; GFX9: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY5]], 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1)
     ; GFX9: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]]
     ; GFX10-LABEL: name: load_global_s32_from_sgpr_base_offset_4294971390
     ; GFX10: liveins: $sgpr0_sgpr1
@@ -570,7 +570,7 @@ body: |
     ; GFX10: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY3]], [[COPY4]], implicit-def $scc, implicit $scc
     ; GFX10: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
     ; GFX10: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]]
-    ; GFX10: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY5]], 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1)
+    ; GFX10: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY5]], 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1)
     ; GFX10: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]]
     %0:sgpr(p1) = COPY $sgpr0_sgpr1
     %1:sgpr(s64) = G_CONSTANT i64 4294971390
@@ -605,7 +605,7 @@ body: |
     ; GFX9: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY3]], [[COPY4]], implicit-def $scc, implicit $scc
     ; GFX9: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
     ; GFX9: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]]
-    ; GFX9: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY5]], 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1)
+    ; GFX9: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY5]], 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1)
     ; GFX9: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]]
     ; GFX10-LABEL: name: load_global_s32_from_sgpr_base_offset_neg4294967295
     ; GFX10: liveins: $sgpr0_sgpr1
@@ -621,7 +621,7 @@ body: |
     ; GFX10: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY3]], [[COPY4]], implicit-def $scc, implicit $scc
     ; GFX10: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
     ; GFX10: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]]
-    ; GFX10: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY5]], 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1)
+    ; GFX10: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY5]], 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1)
     ; GFX10: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]]
     %0:sgpr(p1) = COPY $sgpr0_sgpr1
     %1:sgpr(s64) = G_CONSTANT i64 -4294967295
@@ -655,7 +655,7 @@ body: |
     ; GFX9: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY3]], [[COPY4]], implicit-def $scc, implicit $scc
     ; GFX9: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
     ; GFX9: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]]
-    ; GFX9: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY5]], 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1)
+    ; GFX9: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY5]], 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1)
     ; GFX9: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]]
     ; GFX10-LABEL: name: load_global_s32_from_sgpr_base_offset_neg4294967296
     ; GFX10: liveins: $sgpr0_sgpr1
@@ -671,7 +671,7 @@ body: |
     ; GFX10: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY3]], [[COPY4]], implicit-def $scc, implicit $scc
     ; GFX10: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
     ; GFX10: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]]
-    ; GFX10: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY5]], 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1)
+    ; GFX10: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY5]], 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1)
     ; GFX10: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]]
     %0:sgpr(p1) = COPY $sgpr0_sgpr1
     %1:sgpr(s64) = G_CONSTANT i64 -4294967296
@@ -693,12 +693,12 @@ body: |
     ; GFX9-LABEL: name: load_global_s32_from_copy_undef_sgpr
     ; GFX9: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
     ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY [[DEF]]
-    ; GFX9: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1)
+    ; GFX9: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1)
     ; GFX9: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]]
     ; GFX10-LABEL: name: load_global_s32_from_copy_undef_sgpr
     ; GFX10: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
     ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY [[DEF]]
-    ; GFX10: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1)
+    ; GFX10: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1)
     ; GFX10: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]]
     %0:sgpr(p1) = G_IMPLICIT_DEF
     %1:vgpr(p1) = COPY %0
@@ -717,11 +717,11 @@ body: |
   bb.0:
     ; GFX9-LABEL: name: load_global_s32_from_undef_vgpr
     ; GFX9: [[DEF:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
-    ; GFX9: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[DEF]], 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1)
+    ; GFX9: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[DEF]], 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1)
     ; GFX9: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]]
     ; GFX10-LABEL: name: load_global_s32_from_undef_vgpr
     ; GFX10: [[DEF:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
-    ; GFX10: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[DEF]], 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1)
+    ; GFX10: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[DEF]], 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1)
     ; GFX10: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]]
     %0:vgpr(p1) = G_IMPLICIT_DEF
     %1:vgpr(s32) = G_LOAD %0 :: (load 4, align 4, addrspace 1)

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-global.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-global.mir
index 1871cfbc9ce7..ec7076cec4db 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-global.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-global.mir
@@ -27,7 +27,7 @@ body: |
     ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
     ; GFX6: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
     ; GFX6: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
-    ; GFX6: [[BUFFER_LOAD_DWORD_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1)
+    ; GFX6: [[BUFFER_LOAD_DWORD_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1)
     ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_ADDR64_]]
     ; GFX7-LABEL: name: load_global_s32_from_4
     ; GFX7: liveins: $vgpr0_vgpr1
@@ -37,27 +37,27 @@ body: |
     ; GFX7: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
     ; GFX7: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
     ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
-    ; GFX7: [[BUFFER_LOAD_DWORD_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1)
+    ; GFX7: [[BUFFER_LOAD_DWORD_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1)
     ; GFX7: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_ADDR64_]]
     ; GFX7-FLAT-LABEL: name: load_global_s32_from_4
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1
     ; GFX7-FLAT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX7-FLAT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4, addrspace 1)
+    ; GFX7-FLAT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4, addrspace 1)
     ; GFX7-FLAT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]]
     ; GFX8-LABEL: name: load_global_s32_from_4
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX8: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4, addrspace 1)
+    ; GFX8: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4, addrspace 1)
     ; GFX8: $vgpr0 = COPY [[FLAT_LOAD_DWORD]]
     ; GFX9-LABEL: name: load_global_s32_from_4
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX9: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1)
+    ; GFX9: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1)
     ; GFX9: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]]
     ; GFX10-LABEL: name: load_global_s32_from_4
     ; GFX10: liveins: $vgpr0_vgpr1
     ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX10: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1)
+    ; GFX10: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1)
     ; GFX10: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]]
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s32) = G_LOAD %0 :: (load 4, align 4, addrspace 1)
@@ -84,7 +84,7 @@ body: |
     ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
     ; GFX6: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
     ; GFX6: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
-    ; GFX6: [[BUFFER_LOAD_USHORT_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 2, addrspace 1)
+    ; GFX6: [[BUFFER_LOAD_USHORT_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 2, addrspace 1)
     ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_USHORT_ADDR64_]]
     ; GFX7-LABEL: name: load_global_s32_from_2
     ; GFX7: liveins: $vgpr0_vgpr1
@@ -94,27 +94,27 @@ body: |
     ; GFX7: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
     ; GFX7: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
     ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
-    ; GFX7: [[BUFFER_LOAD_USHORT_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 2, addrspace 1)
+    ; GFX7: [[BUFFER_LOAD_USHORT_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 2, addrspace 1)
     ; GFX7: $vgpr0 = COPY [[BUFFER_LOAD_USHORT_ADDR64_]]
     ; GFX7-FLAT-LABEL: name: load_global_s32_from_2
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1
     ; GFX7-FLAT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX7-FLAT: [[FLAT_LOAD_USHORT:%[0-9]+]]:vgpr_32 = FLAT_LOAD_USHORT [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 2, addrspace 1)
+    ; GFX7-FLAT: [[FLAT_LOAD_USHORT:%[0-9]+]]:vgpr_32 = FLAT_LOAD_USHORT [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 2, addrspace 1)
     ; GFX7-FLAT: $vgpr0 = COPY [[FLAT_LOAD_USHORT]]
     ; GFX8-LABEL: name: load_global_s32_from_2
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX8: [[FLAT_LOAD_USHORT:%[0-9]+]]:vgpr_32 = FLAT_LOAD_USHORT [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 2, addrspace 1)
+    ; GFX8: [[FLAT_LOAD_USHORT:%[0-9]+]]:vgpr_32 = FLAT_LOAD_USHORT [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 2, addrspace 1)
     ; GFX8: $vgpr0 = COPY [[FLAT_LOAD_USHORT]]
     ; GFX9-LABEL: name: load_global_s32_from_2
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX9: [[GLOBAL_LOAD_USHORT:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT [[COPY]], 0, 0, 0, 0, implicit $exec :: (load 2, addrspace 1)
+    ; GFX9: [[GLOBAL_LOAD_USHORT:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT [[COPY]], 0, 0, 0, 0, 0, implicit $exec :: (load 2, addrspace 1)
     ; GFX9: $vgpr0 = COPY [[GLOBAL_LOAD_USHORT]]
     ; GFX10-LABEL: name: load_global_s32_from_2
     ; GFX10: liveins: $vgpr0_vgpr1
     ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX10: [[GLOBAL_LOAD_USHORT:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT [[COPY]], 0, 0, 0, 0, implicit $exec :: (load 2, addrspace 1)
+    ; GFX10: [[GLOBAL_LOAD_USHORT:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT [[COPY]], 0, 0, 0, 0, 0, implicit $exec :: (load 2, addrspace 1)
     ; GFX10: $vgpr0 = COPY [[GLOBAL_LOAD_USHORT]]
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s32) = G_LOAD %0 :: (load 2, align 2, addrspace 1)
@@ -141,7 +141,7 @@ body: |
     ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
     ; GFX6: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
     ; GFX6: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
-    ; GFX6: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1)
+    ; GFX6: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1)
     ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]]
     ; GFX7-LABEL: name: load_global_s32_from_1
     ; GFX7: liveins: $vgpr0_vgpr1
@@ -151,27 +151,27 @@ body: |
     ; GFX7: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
     ; GFX7: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
     ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
-    ; GFX7: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1)
+    ; GFX7: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1)
     ; GFX7: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]]
     ; GFX7-FLAT-LABEL: name: load_global_s32_from_1
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1
     ; GFX7-FLAT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX7-FLAT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1, addrspace 1)
+    ; GFX7-FLAT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1, addrspace 1)
     ; GFX7-FLAT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
     ; GFX8-LABEL: name: load_global_s32_from_1
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX8: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1, addrspace 1)
+    ; GFX8: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1, addrspace 1)
     ; GFX8: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
     ; GFX9-LABEL: name: load_global_s32_from_1
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX9: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1)
+    ; GFX9: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1)
     ; GFX9: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]]
     ; GFX10-LABEL: name: load_global_s32_from_1
     ; GFX10: liveins: $vgpr0_vgpr1
     ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX10: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1)
+    ; GFX10: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1)
     ; GFX10: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]]
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s32) = G_LOAD %0 :: (load 1, align 1, addrspace 1)
@@ -198,7 +198,7 @@ body: |
     ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
     ; GFX6: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
     ; GFX6: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
-    ; GFX6: [[BUFFER_LOAD_DWORDX2_ADDR64_:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 8, addrspace 1)
+    ; GFX6: [[BUFFER_LOAD_DWORDX2_ADDR64_:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 8, addrspace 1)
     ; GFX6: $vgpr0_vgpr1 = COPY [[BUFFER_LOAD_DWORDX2_ADDR64_]]
     ; GFX7-LABEL: name: load_global_v2s32
     ; GFX7: liveins: $vgpr0_vgpr1
@@ -208,27 +208,27 @@ body: |
     ; GFX7: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
     ; GFX7: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
     ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
-    ; GFX7: [[BUFFER_LOAD_DWORDX2_ADDR64_:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 8, addrspace 1)
+    ; GFX7: [[BUFFER_LOAD_DWORDX2_ADDR64_:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 8, addrspace 1)
     ; GFX7: $vgpr0_vgpr1 = COPY [[BUFFER_LOAD_DWORDX2_ADDR64_]]
     ; GFX7-FLAT-LABEL: name: load_global_v2s32
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1
     ; GFX7-FLAT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX7-FLAT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 8, addrspace 1)
+    ; GFX7-FLAT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 8, addrspace 1)
     ; GFX7-FLAT: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]]
     ; GFX8-LABEL: name: load_global_v2s32
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX8: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 8, addrspace 1)
+    ; GFX8: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 8, addrspace 1)
     ; GFX8: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]]
     ; GFX9-LABEL: name: load_global_v2s32
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX9: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, implicit $exec :: (load 8, addrspace 1)
+    ; GFX9: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, 0, implicit $exec :: (load 8, addrspace 1)
     ; GFX9: $vgpr0_vgpr1 = COPY [[GLOBAL_LOAD_DWORDX2_]]
     ; GFX10-LABEL: name: load_global_v2s32
     ; GFX10: liveins: $vgpr0_vgpr1
     ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX10: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, implicit $exec :: (load 8, addrspace 1)
+    ; GFX10: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, 0, implicit $exec :: (load 8, addrspace 1)
     ; GFX10: $vgpr0_vgpr1 = COPY [[GLOBAL_LOAD_DWORDX2_]]
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(<2 x s32>) = G_LOAD %0 :: (load 8, align 8, addrspace 1)
@@ -255,7 +255,7 @@ body: |
     ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
     ; GFX6: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
     ; GFX6: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
-    ; GFX6: [[BUFFER_LOAD_DWORDX4_ADDR64_:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 16, align 4, addrspace 1)
+    ; GFX6: [[BUFFER_LOAD_DWORDX4_ADDR64_:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 16, align 4, addrspace 1)
     ; GFX6: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUFFER_LOAD_DWORDX4_ADDR64_]]
     ; GFX7-LABEL: name: load_global_v4s32
     ; GFX7: liveins: $vgpr0_vgpr1
@@ -265,27 +265,27 @@ body: |
     ; GFX7: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
     ; GFX7: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
     ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
-    ; GFX7: [[BUFFER_LOAD_DWORDX4_ADDR64_:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 16, align 4, addrspace 1)
+    ; GFX7: [[BUFFER_LOAD_DWORDX4_ADDR64_:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 16, align 4, addrspace 1)
     ; GFX7: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUFFER_LOAD_DWORDX4_ADDR64_]]
     ; GFX7-FLAT-LABEL: name: load_global_v4s32
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1
     ; GFX7-FLAT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX7-FLAT: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = FLAT_LOAD_DWORDX4 [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 16, align 4, addrspace 1)
+    ; GFX7-FLAT: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = FLAT_LOAD_DWORDX4 [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 16, align 4, addrspace 1)
     ; GFX7-FLAT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[FLAT_LOAD_DWORDX4_]]
     ; GFX8-LABEL: name: load_global_v4s32
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX8: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = FLAT_LOAD_DWORDX4 [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 16, align 4, addrspace 1)
+    ; GFX8: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = FLAT_LOAD_DWORDX4 [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 16, align 4, addrspace 1)
     ; GFX8: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[FLAT_LOAD_DWORDX4_]]
     ; GFX9-LABEL: name: load_global_v4s32
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX9: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = GLOBAL_LOAD_DWORDX4 [[COPY]], 0, 0, 0, 0, implicit $exec :: (load 16, align 4, addrspace 1)
+    ; GFX9: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = GLOBAL_LOAD_DWORDX4 [[COPY]], 0, 0, 0, 0, 0, implicit $exec :: (load 16, align 4, addrspace 1)
     ; GFX9: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[GLOBAL_LOAD_DWORDX4_]]
     ; GFX10-LABEL: name: load_global_v4s32
     ; GFX10: liveins: $vgpr0_vgpr1
     ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX10: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = GLOBAL_LOAD_DWORDX4 [[COPY]], 0, 0, 0, 0, implicit $exec :: (load 16, align 4, addrspace 1)
+    ; GFX10: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = GLOBAL_LOAD_DWORDX4 [[COPY]], 0, 0, 0, 0, 0, implicit $exec :: (load 16, align 4, addrspace 1)
     ; GFX10: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[GLOBAL_LOAD_DWORDX4_]]
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(<4 x  s32>) = G_LOAD %0 :: (load 16, align 4, addrspace 1)
@@ -312,27 +312,27 @@ body: |
     ; GFX7-LABEL: name: load_global_s64
     ; GFX7: liveins: $vgpr0_vgpr1
     ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX7: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 8, addrspace 1)
+    ; GFX7: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 8, addrspace 1)
     ; GFX7: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]]
     ; GFX7-FLAT-LABEL: name: load_global_s64
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1
     ; GFX7-FLAT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX7-FLAT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 8, addrspace 1)
+    ; GFX7-FLAT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 8, addrspace 1)
     ; GFX7-FLAT: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]]
     ; GFX8-LABEL: name: load_global_s64
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX8: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 8, addrspace 1)
+    ; GFX8: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 8, addrspace 1)
     ; GFX8: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]]
     ; GFX9-LABEL: name: load_global_s64
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX9: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, implicit $exec :: (load 8, addrspace 1)
+    ; GFX9: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, 0, implicit $exec :: (load 8, addrspace 1)
     ; GFX9: $vgpr0_vgpr1 = COPY [[GLOBAL_LOAD_DWORDX2_]]
     ; GFX10-LABEL: name: load_global_s64
     ; GFX10: liveins: $vgpr0_vgpr1
     ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX10: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, implicit $exec :: (load 8, addrspace 1)
+    ; GFX10: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, 0, implicit $exec :: (load 8, addrspace 1)
     ; GFX10: $vgpr0_vgpr1 = COPY [[GLOBAL_LOAD_DWORDX2_]]
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s64) = G_LOAD %0 :: (load 8, align 8, addrspace 1)
@@ -359,27 +359,27 @@ body: |
     ; GFX7-LABEL: name: load_global_v2s64
     ; GFX7: liveins: $vgpr0_vgpr1
     ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX7: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = FLAT_LOAD_DWORDX4 [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 16, align 4, addrspace 1)
+    ; GFX7: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = FLAT_LOAD_DWORDX4 [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 16, align 4, addrspace 1)
     ; GFX7: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[FLAT_LOAD_DWORDX4_]]
     ; GFX7-FLAT-LABEL: name: load_global_v2s64
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1
     ; GFX7-FLAT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX7-FLAT: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = FLAT_LOAD_DWORDX4 [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 16, align 4, addrspace 1)
+    ; GFX7-FLAT: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = FLAT_LOAD_DWORDX4 [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 16, align 4, addrspace 1)
     ; GFX7-FLAT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[FLAT_LOAD_DWORDX4_]]
     ; GFX8-LABEL: name: load_global_v2s64
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX8: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = FLAT_LOAD_DWORDX4 [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 16, align 4, addrspace 1)
+    ; GFX8: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = FLAT_LOAD_DWORDX4 [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 16, align 4, addrspace 1)
     ; GFX8: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[FLAT_LOAD_DWORDX4_]]
     ; GFX9-LABEL: name: load_global_v2s64
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX9: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = GLOBAL_LOAD_DWORDX4 [[COPY]], 0, 0, 0, 0, implicit $exec :: (load 16, align 4, addrspace 1)
+    ; GFX9: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = GLOBAL_LOAD_DWORDX4 [[COPY]], 0, 0, 0, 0, 0, implicit $exec :: (load 16, align 4, addrspace 1)
     ; GFX9: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[GLOBAL_LOAD_DWORDX4_]]
     ; GFX10-LABEL: name: load_global_v2s64
     ; GFX10: liveins: $vgpr0_vgpr1
     ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX10: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = GLOBAL_LOAD_DWORDX4 [[COPY]], 0, 0, 0, 0, implicit $exec :: (load 16, align 4, addrspace 1)
+    ; GFX10: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = GLOBAL_LOAD_DWORDX4 [[COPY]], 0, 0, 0, 0, 0, implicit $exec :: (load 16, align 4, addrspace 1)
     ; GFX10: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[GLOBAL_LOAD_DWORDX4_]]
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(<2 x s64>) = G_LOAD %0 :: (load 16, align 4, addrspace 1)
@@ -500,27 +500,27 @@ body: |
     ; GFX7-LABEL: name: load_global_p3_from_4
     ; GFX7: liveins: $vgpr0_vgpr1
     ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX7: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4, addrspace 1)
+    ; GFX7: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4, addrspace 1)
     ; GFX7: $vgpr0 = COPY [[FLAT_LOAD_DWORD]]
     ; GFX7-FLAT-LABEL: name: load_global_p3_from_4
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1
     ; GFX7-FLAT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX7-FLAT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4, addrspace 1)
+    ; GFX7-FLAT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4, addrspace 1)
     ; GFX7-FLAT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]]
     ; GFX8-LABEL: name: load_global_p3_from_4
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX8: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4, addrspace 1)
+    ; GFX8: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4, addrspace 1)
     ; GFX8: $vgpr0 = COPY [[FLAT_LOAD_DWORD]]
     ; GFX9-LABEL: name: load_global_p3_from_4
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX9: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1)
+    ; GFX9: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1)
     ; GFX9: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]]
     ; GFX10-LABEL: name: load_global_p3_from_4
     ; GFX10: liveins: $vgpr0_vgpr1
     ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX10: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1)
+    ; GFX10: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1)
     ; GFX10: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]]
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(p3) = G_LOAD %0 :: (load 4, align 4, addrspace 1)
@@ -547,27 +547,27 @@ body: |
     ; GFX7-LABEL: name: load_global_p1_from_8
     ; GFX7: liveins: $vgpr0_vgpr1
     ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX7: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 8, addrspace 1)
+    ; GFX7: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 8, addrspace 1)
     ; GFX7: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]]
     ; GFX7-FLAT-LABEL: name: load_global_p1_from_8
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1
     ; GFX7-FLAT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX7-FLAT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 8, addrspace 1)
+    ; GFX7-FLAT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 8, addrspace 1)
     ; GFX7-FLAT: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]]
     ; GFX8-LABEL: name: load_global_p1_from_8
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX8: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 8, addrspace 1)
+    ; GFX8: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 8, addrspace 1)
     ; GFX8: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]]
     ; GFX9-LABEL: name: load_global_p1_from_8
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX9: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, implicit $exec :: (load 8, addrspace 1)
+    ; GFX9: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, 0, implicit $exec :: (load 8, addrspace 1)
     ; GFX9: $vgpr0_vgpr1 = COPY [[GLOBAL_LOAD_DWORDX2_]]
     ; GFX10-LABEL: name: load_global_p1_from_8
     ; GFX10: liveins: $vgpr0_vgpr1
     ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX10: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, implicit $exec :: (load 8, addrspace 1)
+    ; GFX10: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, 0, implicit $exec :: (load 8, addrspace 1)
     ; GFX10: $vgpr0_vgpr1 = COPY [[GLOBAL_LOAD_DWORDX2_]]
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(p1) = G_LOAD %0 :: (load 8, align 8, addrspace 1)
@@ -688,27 +688,27 @@ body: |
     ; GFX7-LABEL: name: load_global_v2s16
     ; GFX7: liveins: $vgpr0_vgpr1
     ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX7: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4, addrspace 1)
+    ; GFX7: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4, addrspace 1)
     ; GFX7: $vgpr0 = COPY [[FLAT_LOAD_DWORD]]
     ; GFX7-FLAT-LABEL: name: load_global_v2s16
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1
     ; GFX7-FLAT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX7-FLAT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4, addrspace 1)
+    ; GFX7-FLAT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4, addrspace 1)
     ; GFX7-FLAT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]]
     ; GFX8-LABEL: name: load_global_v2s16
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX8: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4, addrspace 1)
+    ; GFX8: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4, addrspace 1)
     ; GFX8: $vgpr0 = COPY [[FLAT_LOAD_DWORD]]
     ; GFX9-LABEL: name: load_global_v2s16
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX9: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1)
+    ; GFX9: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1)
     ; GFX9: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]]
     ; GFX10-LABEL: name: load_global_v2s16
     ; GFX10: liveins: $vgpr0_vgpr1
     ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX10: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1)
+    ; GFX10: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1)
     ; GFX10: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]]
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(<2 x s16>) = G_LOAD %0 :: (load 4, align 4, addrspace 1)
@@ -735,27 +735,27 @@ body: |
     ; GFX7-LABEL: name: load_global_v4s16
     ; GFX7: liveins: $vgpr0_vgpr1
     ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX7: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 8, addrspace 1)
+    ; GFX7: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 8, addrspace 1)
     ; GFX7: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]]
     ; GFX7-FLAT-LABEL: name: load_global_v4s16
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1
     ; GFX7-FLAT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX7-FLAT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 8, addrspace 1)
+    ; GFX7-FLAT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 8, addrspace 1)
     ; GFX7-FLAT: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]]
     ; GFX8-LABEL: name: load_global_v4s16
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX8: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 8, addrspace 1)
+    ; GFX8: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 8, addrspace 1)
     ; GFX8: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]]
     ; GFX9-LABEL: name: load_global_v4s16
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX9: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, implicit $exec :: (load 8, addrspace 1)
+    ; GFX9: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, 0, implicit $exec :: (load 8, addrspace 1)
     ; GFX9: $vgpr0_vgpr1 = COPY [[GLOBAL_LOAD_DWORDX2_]]
     ; GFX10-LABEL: name: load_global_v4s16
     ; GFX10: liveins: $vgpr0_vgpr1
     ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX10: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, implicit $exec :: (load 8, addrspace 1)
+    ; GFX10: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, 0, implicit $exec :: (load 8, addrspace 1)
     ; GFX10: $vgpr0_vgpr1 = COPY [[GLOBAL_LOAD_DWORDX2_]]
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(<4 x s16>) = G_LOAD %0 :: (load 8, align 8, addrspace 1)
@@ -833,7 +833,7 @@ body: |
     ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
     ; GFX6: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
     ; GFX6: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
-    ; GFX6: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 2047, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1)
+    ; GFX6: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 2047, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1)
     ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]]
     ; GFX7-LABEL: name: load_global_s32_from_1_gep_2047
     ; GFX7: liveins: $vgpr0_vgpr1
@@ -843,7 +843,7 @@ body: |
     ; GFX7: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
     ; GFX7: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
     ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
-    ; GFX7: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 2047, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1)
+    ; GFX7: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 2047, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1)
     ; GFX7: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]]
     ; GFX7-FLAT-LABEL: name: load_global_s32_from_1_gep_2047
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1
@@ -858,7 +858,7 @@ body: |
     ; GFX7-FLAT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX7-FLAT: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX7-FLAT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1
-    ; GFX7-FLAT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1, addrspace 1)
+    ; GFX7-FLAT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1, addrspace 1)
     ; GFX7-FLAT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
     ; GFX8-LABEL: name: load_global_s32_from_1_gep_2047
     ; GFX8: liveins: $vgpr0_vgpr1
@@ -873,17 +873,17 @@ body: |
     ; GFX8: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX8: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX8: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1
-    ; GFX8: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1, addrspace 1)
+    ; GFX8: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1, addrspace 1)
     ; GFX8: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
     ; GFX9-LABEL: name: load_global_s32_from_1_gep_2047
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX9: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], 2047, 0, 0, 0, implicit $exec :: (load 1, addrspace 1)
+    ; GFX9: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], 2047, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1)
     ; GFX9: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]]
     ; GFX10-LABEL: name: load_global_s32_from_1_gep_2047
     ; GFX10: liveins: $vgpr0_vgpr1
     ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX10: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], 2047, 0, 0, 0, implicit $exec :: (load 1, addrspace 1)
+    ; GFX10: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], 2047, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1)
     ; GFX10: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]]
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s64) = G_CONSTANT i64 2047
@@ -912,7 +912,7 @@ body: |
     ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
     ; GFX6: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
     ; GFX6: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
-    ; GFX6: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 2048, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1)
+    ; GFX6: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 2048, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1)
     ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]]
     ; GFX7-LABEL: name: load_global_s32_from_1_gep_2048
     ; GFX7: liveins: $vgpr0_vgpr1
@@ -922,7 +922,7 @@ body: |
     ; GFX7: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
     ; GFX7: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
     ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
-    ; GFX7: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 2048, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1)
+    ; GFX7: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 2048, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1)
     ; GFX7: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]]
     ; GFX7-FLAT-LABEL: name: load_global_s32_from_1_gep_2048
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1
@@ -937,7 +937,7 @@ body: |
     ; GFX7-FLAT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX7-FLAT: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX7-FLAT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1
-    ; GFX7-FLAT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1, addrspace 1)
+    ; GFX7-FLAT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1, addrspace 1)
     ; GFX7-FLAT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
     ; GFX8-LABEL: name: load_global_s32_from_1_gep_2048
     ; GFX8: liveins: $vgpr0_vgpr1
@@ -952,12 +952,12 @@ body: |
     ; GFX8: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX8: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX8: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1
-    ; GFX8: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1, addrspace 1)
+    ; GFX8: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1, addrspace 1)
     ; GFX8: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
     ; GFX9-LABEL: name: load_global_s32_from_1_gep_2048
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX9: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], 2048, 0, 0, 0, implicit $exec :: (load 1, addrspace 1)
+    ; GFX9: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], 2048, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1)
     ; GFX9: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]]
     ; GFX10-LABEL: name: load_global_s32_from_1_gep_2048
     ; GFX10: liveins: $vgpr0_vgpr1
@@ -972,7 +972,7 @@ body: |
     ; GFX10: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX10: %9:vgpr_32, dead %11:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX10: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1
-    ; GFX10: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1)
+    ; GFX10: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1)
     ; GFX10: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]]
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s64) = G_CONSTANT i64 2048
@@ -1011,7 +1011,7 @@ body: |
     ; GFX6: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
     ; GFX6: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
     ; GFX6: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE2]], %subreg.sub2_sub3
-    ; GFX6: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1)
+    ; GFX6: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1)
     ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]]
     ; GFX7-LABEL: name: load_global_s32_from_1_gep_m2047
     ; GFX7: liveins: $vgpr0_vgpr1
@@ -1031,7 +1031,7 @@ body: |
     ; GFX7: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
     ; GFX7: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
     ; GFX7: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE2]], %subreg.sub2_sub3
-    ; GFX7: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1)
+    ; GFX7: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1)
     ; GFX7: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]]
     ; GFX7-FLAT-LABEL: name: load_global_s32_from_1_gep_m2047
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1
@@ -1046,7 +1046,7 @@ body: |
     ; GFX7-FLAT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX7-FLAT: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX7-FLAT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1
-    ; GFX7-FLAT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1, addrspace 1)
+    ; GFX7-FLAT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1, addrspace 1)
     ; GFX7-FLAT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
     ; GFX8-LABEL: name: load_global_s32_from_1_gep_m2047
     ; GFX8: liveins: $vgpr0_vgpr1
@@ -1061,17 +1061,17 @@ body: |
     ; GFX8: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX8: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX8: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1
-    ; GFX8: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1, addrspace 1)
+    ; GFX8: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1, addrspace 1)
     ; GFX8: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
     ; GFX9-LABEL: name: load_global_s32_from_1_gep_m2047
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX9: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], -2047, 0, 0, 0, implicit $exec :: (load 1, addrspace 1)
+    ; GFX9: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], -2047, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1)
     ; GFX9: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]]
     ; GFX10-LABEL: name: load_global_s32_from_1_gep_m2047
     ; GFX10: liveins: $vgpr0_vgpr1
     ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX10: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], -2047, 0, 0, 0, implicit $exec :: (load 1, addrspace 1)
+    ; GFX10: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], -2047, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1)
     ; GFX10: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]]
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s64) = G_CONSTANT i64 -2047
@@ -1110,7 +1110,7 @@ body: |
     ; GFX6: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
     ; GFX6: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
     ; GFX6: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE2]], %subreg.sub2_sub3
-    ; GFX6: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1)
+    ; GFX6: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1)
     ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]]
     ; GFX7-LABEL: name: load_global_s32_from_1_gep_m2048
     ; GFX7: liveins: $vgpr0_vgpr1
@@ -1130,7 +1130,7 @@ body: |
     ; GFX7: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
     ; GFX7: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
     ; GFX7: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE2]], %subreg.sub2_sub3
-    ; GFX7: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1)
+    ; GFX7: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1)
     ; GFX7: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]]
     ; GFX7-FLAT-LABEL: name: load_global_s32_from_1_gep_m2048
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1
@@ -1145,7 +1145,7 @@ body: |
     ; GFX7-FLAT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX7-FLAT: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX7-FLAT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1
-    ; GFX7-FLAT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1, addrspace 1)
+    ; GFX7-FLAT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1, addrspace 1)
     ; GFX7-FLAT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
     ; GFX8-LABEL: name: load_global_s32_from_1_gep_m2048
     ; GFX8: liveins: $vgpr0_vgpr1
@@ -1160,17 +1160,17 @@ body: |
     ; GFX8: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX8: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX8: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1
-    ; GFX8: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1, addrspace 1)
+    ; GFX8: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1, addrspace 1)
     ; GFX8: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
     ; GFX9-LABEL: name: load_global_s32_from_1_gep_m2048
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX9: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], -2048, 0, 0, 0, implicit $exec :: (load 1, addrspace 1)
+    ; GFX9: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], -2048, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1)
     ; GFX9: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]]
     ; GFX10-LABEL: name: load_global_s32_from_1_gep_m2048
     ; GFX10: liveins: $vgpr0_vgpr1
     ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX10: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], -2048, 0, 0, 0, implicit $exec :: (load 1, addrspace 1)
+    ; GFX10: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], -2048, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1)
     ; GFX10: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]]
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s64) = G_CONSTANT i64 -2048
@@ -1199,7 +1199,7 @@ body: |
     ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
     ; GFX6: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
     ; GFX6: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
-    ; GFX6: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 4095, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1)
+    ; GFX6: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 4095, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1)
     ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]]
     ; GFX7-LABEL: name: load_global_s32_from_1_gep_4095
     ; GFX7: liveins: $vgpr0_vgpr1
@@ -1209,7 +1209,7 @@ body: |
     ; GFX7: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
     ; GFX7: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
     ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
-    ; GFX7: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 4095, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1)
+    ; GFX7: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 4095, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1)
     ; GFX7: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]]
     ; GFX7-FLAT-LABEL: name: load_global_s32_from_1_gep_4095
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1
@@ -1224,7 +1224,7 @@ body: |
     ; GFX7-FLAT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX7-FLAT: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX7-FLAT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1
-    ; GFX7-FLAT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1, addrspace 1)
+    ; GFX7-FLAT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1, addrspace 1)
     ; GFX7-FLAT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
     ; GFX8-LABEL: name: load_global_s32_from_1_gep_4095
     ; GFX8: liveins: $vgpr0_vgpr1
@@ -1239,12 +1239,12 @@ body: |
     ; GFX8: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX8: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX8: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1
-    ; GFX8: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1, addrspace 1)
+    ; GFX8: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1, addrspace 1)
     ; GFX8: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
     ; GFX9-LABEL: name: load_global_s32_from_1_gep_4095
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX9: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], 4095, 0, 0, 0, implicit $exec :: (load 1, addrspace 1)
+    ; GFX9: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], 4095, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1)
     ; GFX9: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]]
     ; GFX10-LABEL: name: load_global_s32_from_1_gep_4095
     ; GFX10: liveins: $vgpr0_vgpr1
@@ -1259,7 +1259,7 @@ body: |
     ; GFX10: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX10: %9:vgpr_32, dead %11:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX10: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1
-    ; GFX10: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1)
+    ; GFX10: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1)
     ; GFX10: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]]
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s64) = G_CONSTANT i64 4095
@@ -1289,7 +1289,7 @@ body: |
     ; GFX6: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
     ; GFX6: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
     ; GFX6: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
-    ; GFX6: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_2]], 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1)
+    ; GFX6: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_2]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1)
     ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]]
     ; GFX7-LABEL: name: load_global_s32_from_1_gep_4096
     ; GFX7: liveins: $vgpr0_vgpr1
@@ -1300,7 +1300,7 @@ body: |
     ; GFX7: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
     ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
     ; GFX7: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
-    ; GFX7: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_2]], 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1)
+    ; GFX7: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_2]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1)
     ; GFX7: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]]
     ; GFX7-FLAT-LABEL: name: load_global_s32_from_1_gep_4096
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1
@@ -1315,7 +1315,7 @@ body: |
     ; GFX7-FLAT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX7-FLAT: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX7-FLAT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1
-    ; GFX7-FLAT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1, addrspace 1)
+    ; GFX7-FLAT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1, addrspace 1)
     ; GFX7-FLAT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
     ; GFX8-LABEL: name: load_global_s32_from_1_gep_4096
     ; GFX8: liveins: $vgpr0_vgpr1
@@ -1330,7 +1330,7 @@ body: |
     ; GFX8: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX8: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX8: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1
-    ; GFX8: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1, addrspace 1)
+    ; GFX8: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1, addrspace 1)
     ; GFX8: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
     ; GFX9-LABEL: name: load_global_s32_from_1_gep_4096
     ; GFX9: liveins: $vgpr0_vgpr1
@@ -1345,7 +1345,7 @@ body: |
     ; GFX9: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX9: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX9: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1
-    ; GFX9: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1)
+    ; GFX9: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1)
     ; GFX9: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]]
     ; GFX10-LABEL: name: load_global_s32_from_1_gep_4096
     ; GFX10: liveins: $vgpr0_vgpr1
@@ -1360,7 +1360,7 @@ body: |
     ; GFX10: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX10: %9:vgpr_32, dead %11:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX10: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1
-    ; GFX10: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1)
+    ; GFX10: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1)
     ; GFX10: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]]
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s64) = G_CONSTANT i64 4096
@@ -1399,7 +1399,7 @@ body: |
     ; GFX6: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
     ; GFX6: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
     ; GFX6: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE2]], %subreg.sub2_sub3
-    ; GFX6: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1)
+    ; GFX6: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1)
     ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]]
     ; GFX7-LABEL: name: load_global_s32_from_1_gep_m4095
     ; GFX7: liveins: $vgpr0_vgpr1
@@ -1419,7 +1419,7 @@ body: |
     ; GFX7: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
     ; GFX7: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
     ; GFX7: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE2]], %subreg.sub2_sub3
-    ; GFX7: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1)
+    ; GFX7: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1)
     ; GFX7: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]]
     ; GFX7-FLAT-LABEL: name: load_global_s32_from_1_gep_m4095
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1
@@ -1434,7 +1434,7 @@ body: |
     ; GFX7-FLAT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX7-FLAT: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX7-FLAT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1
-    ; GFX7-FLAT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1, addrspace 1)
+    ; GFX7-FLAT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1, addrspace 1)
     ; GFX7-FLAT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
     ; GFX8-LABEL: name: load_global_s32_from_1_gep_m4095
     ; GFX8: liveins: $vgpr0_vgpr1
@@ -1449,12 +1449,12 @@ body: |
     ; GFX8: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX8: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX8: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1
-    ; GFX8: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1, addrspace 1)
+    ; GFX8: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1, addrspace 1)
     ; GFX8: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
     ; GFX9-LABEL: name: load_global_s32_from_1_gep_m4095
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX9: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], -4095, 0, 0, 0, implicit $exec :: (load 1, addrspace 1)
+    ; GFX9: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], -4095, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1)
     ; GFX9: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]]
     ; GFX10-LABEL: name: load_global_s32_from_1_gep_m4095
     ; GFX10: liveins: $vgpr0_vgpr1
@@ -1469,7 +1469,7 @@ body: |
     ; GFX10: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX10: %9:vgpr_32, dead %11:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX10: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1
-    ; GFX10: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1)
+    ; GFX10: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1)
     ; GFX10: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]]
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s64) = G_CONSTANT i64 -4095
@@ -1508,7 +1508,7 @@ body: |
     ; GFX6: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
     ; GFX6: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
     ; GFX6: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE2]], %subreg.sub2_sub3
-    ; GFX6: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1)
+    ; GFX6: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1)
     ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]]
     ; GFX7-LABEL: name: load_global_s32_from_1_gep_m4096
     ; GFX7: liveins: $vgpr0_vgpr1
@@ -1528,7 +1528,7 @@ body: |
     ; GFX7: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
     ; GFX7: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
     ; GFX7: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE2]], %subreg.sub2_sub3
-    ; GFX7: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1)
+    ; GFX7: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1)
     ; GFX7: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]]
     ; GFX7-FLAT-LABEL: name: load_global_s32_from_1_gep_m4096
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1
@@ -1543,7 +1543,7 @@ body: |
     ; GFX7-FLAT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX7-FLAT: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX7-FLAT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1
-    ; GFX7-FLAT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1, addrspace 1)
+    ; GFX7-FLAT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1, addrspace 1)
     ; GFX7-FLAT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
     ; GFX8-LABEL: name: load_global_s32_from_1_gep_m4096
     ; GFX8: liveins: $vgpr0_vgpr1
@@ -1558,12 +1558,12 @@ body: |
     ; GFX8: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX8: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX8: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1
-    ; GFX8: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1, addrspace 1)
+    ; GFX8: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1, addrspace 1)
     ; GFX8: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
     ; GFX9-LABEL: name: load_global_s32_from_1_gep_m4096
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX9: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], -4096, 0, 0, 0, implicit $exec :: (load 1, addrspace 1)
+    ; GFX9: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], -4096, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1)
     ; GFX9: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]]
     ; GFX10-LABEL: name: load_global_s32_from_1_gep_m4096
     ; GFX10: liveins: $vgpr0_vgpr1
@@ -1578,7 +1578,7 @@ body: |
     ; GFX10: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX10: %9:vgpr_32, dead %11:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX10: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1
-    ; GFX10: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1)
+    ; GFX10: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1)
     ; GFX10: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]]
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s64) = G_CONSTANT i64 -4096
@@ -1608,7 +1608,7 @@ body: |
     ; GFX6: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
     ; GFX6: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
     ; GFX6: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 8191
-    ; GFX6: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_2]], 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1)
+    ; GFX6: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_2]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1)
     ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]]
     ; GFX7-LABEL: name: load_global_s32_from_1_gep_8191
     ; GFX7: liveins: $vgpr0_vgpr1
@@ -1619,7 +1619,7 @@ body: |
     ; GFX7: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
     ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
     ; GFX7: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 8191
-    ; GFX7: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_2]], 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1)
+    ; GFX7: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_2]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1)
     ; GFX7: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]]
     ; GFX7-FLAT-LABEL: name: load_global_s32_from_1_gep_8191
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1
@@ -1634,7 +1634,7 @@ body: |
     ; GFX7-FLAT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX7-FLAT: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX7-FLAT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1
-    ; GFX7-FLAT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1, addrspace 1)
+    ; GFX7-FLAT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1, addrspace 1)
     ; GFX7-FLAT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
     ; GFX8-LABEL: name: load_global_s32_from_1_gep_8191
     ; GFX8: liveins: $vgpr0_vgpr1
@@ -1649,7 +1649,7 @@ body: |
     ; GFX8: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX8: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX8: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1
-    ; GFX8: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1, addrspace 1)
+    ; GFX8: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1, addrspace 1)
     ; GFX8: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
     ; GFX9-LABEL: name: load_global_s32_from_1_gep_8191
     ; GFX9: liveins: $vgpr0_vgpr1
@@ -1664,7 +1664,7 @@ body: |
     ; GFX9: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX9: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX9: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1
-    ; GFX9: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1)
+    ; GFX9: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1)
     ; GFX9: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]]
     ; GFX10-LABEL: name: load_global_s32_from_1_gep_8191
     ; GFX10: liveins: $vgpr0_vgpr1
@@ -1679,7 +1679,7 @@ body: |
     ; GFX10: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX10: %9:vgpr_32, dead %11:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX10: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1
-    ; GFX10: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1)
+    ; GFX10: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1)
     ; GFX10: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]]
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s64) = G_CONSTANT i64 8191
@@ -1709,7 +1709,7 @@ body: |
     ; GFX6: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
     ; GFX6: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
     ; GFX6: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 8192
-    ; GFX6: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_2]], 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1)
+    ; GFX6: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_2]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1)
     ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]]
     ; GFX7-LABEL: name: load_global_s32_from_1_gep_8192
     ; GFX7: liveins: $vgpr0_vgpr1
@@ -1720,7 +1720,7 @@ body: |
     ; GFX7: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
     ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
     ; GFX7: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 8192
-    ; GFX7: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_2]], 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1)
+    ; GFX7: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_2]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1)
     ; GFX7: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]]
     ; GFX7-FLAT-LABEL: name: load_global_s32_from_1_gep_8192
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1
@@ -1735,7 +1735,7 @@ body: |
     ; GFX7-FLAT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX7-FLAT: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX7-FLAT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1
-    ; GFX7-FLAT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1, addrspace 1)
+    ; GFX7-FLAT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1, addrspace 1)
     ; GFX7-FLAT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
     ; GFX8-LABEL: name: load_global_s32_from_1_gep_8192
     ; GFX8: liveins: $vgpr0_vgpr1
@@ -1750,7 +1750,7 @@ body: |
     ; GFX8: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX8: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX8: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1
-    ; GFX8: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1, addrspace 1)
+    ; GFX8: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1, addrspace 1)
     ; GFX8: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
     ; GFX9-LABEL: name: load_global_s32_from_1_gep_8192
     ; GFX9: liveins: $vgpr0_vgpr1
@@ -1765,7 +1765,7 @@ body: |
     ; GFX9: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX9: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX9: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1
-    ; GFX9: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1)
+    ; GFX9: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1)
     ; GFX9: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]]
     ; GFX10-LABEL: name: load_global_s32_from_1_gep_8192
     ; GFX10: liveins: $vgpr0_vgpr1
@@ -1780,7 +1780,7 @@ body: |
     ; GFX10: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX10: %9:vgpr_32, dead %11:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX10: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1
-    ; GFX10: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1)
+    ; GFX10: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1)
     ; GFX10: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]]
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s64) = G_CONSTANT i64 8192
@@ -1819,7 +1819,7 @@ body: |
     ; GFX6: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
     ; GFX6: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
     ; GFX6: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE2]], %subreg.sub2_sub3
-    ; GFX6: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1)
+    ; GFX6: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1)
     ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]]
     ; GFX7-LABEL: name: load_global_s32_from_1_gep_m8191
     ; GFX7: liveins: $vgpr0_vgpr1
@@ -1839,7 +1839,7 @@ body: |
     ; GFX7: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
     ; GFX7: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
     ; GFX7: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE2]], %subreg.sub2_sub3
-    ; GFX7: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1)
+    ; GFX7: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1)
     ; GFX7: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]]
     ; GFX7-FLAT-LABEL: name: load_global_s32_from_1_gep_m8191
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1
@@ -1854,7 +1854,7 @@ body: |
     ; GFX7-FLAT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX7-FLAT: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX7-FLAT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1
-    ; GFX7-FLAT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1, addrspace 1)
+    ; GFX7-FLAT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1, addrspace 1)
     ; GFX7-FLAT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
     ; GFX8-LABEL: name: load_global_s32_from_1_gep_m8191
     ; GFX8: liveins: $vgpr0_vgpr1
@@ -1869,7 +1869,7 @@ body: |
     ; GFX8: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX8: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX8: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1
-    ; GFX8: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1, addrspace 1)
+    ; GFX8: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1, addrspace 1)
     ; GFX8: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
     ; GFX9-LABEL: name: load_global_s32_from_1_gep_m8191
     ; GFX9: liveins: $vgpr0_vgpr1
@@ -1884,7 +1884,7 @@ body: |
     ; GFX9: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX9: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX9: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1
-    ; GFX9: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1)
+    ; GFX9: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1)
     ; GFX9: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]]
     ; GFX10-LABEL: name: load_global_s32_from_1_gep_m8191
     ; GFX10: liveins: $vgpr0_vgpr1
@@ -1899,7 +1899,7 @@ body: |
     ; GFX10: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX10: %9:vgpr_32, dead %11:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX10: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1
-    ; GFX10: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1)
+    ; GFX10: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1)
     ; GFX10: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]]
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s64) = G_CONSTANT i64 -8191
@@ -1938,7 +1938,7 @@ body: |
     ; GFX6: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
     ; GFX6: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
     ; GFX6: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE2]], %subreg.sub2_sub3
-    ; GFX6: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1)
+    ; GFX6: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1)
     ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]]
     ; GFX7-LABEL: name: load_global_s32_from_1_gep_m8192
     ; GFX7: liveins: $vgpr0_vgpr1
@@ -1958,7 +1958,7 @@ body: |
     ; GFX7: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
     ; GFX7: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
     ; GFX7: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE2]], %subreg.sub2_sub3
-    ; GFX7: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1)
+    ; GFX7: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1)
     ; GFX7: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]]
     ; GFX7-FLAT-LABEL: name: load_global_s32_from_1_gep_m8192
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1
@@ -1973,7 +1973,7 @@ body: |
     ; GFX7-FLAT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX7-FLAT: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX7-FLAT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1
-    ; GFX7-FLAT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1, addrspace 1)
+    ; GFX7-FLAT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1, addrspace 1)
     ; GFX7-FLAT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
     ; GFX8-LABEL: name: load_global_s32_from_1_gep_m8192
     ; GFX8: liveins: $vgpr0_vgpr1
@@ -1988,7 +1988,7 @@ body: |
     ; GFX8: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX8: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX8: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1
-    ; GFX8: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1, addrspace 1)
+    ; GFX8: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1, addrspace 1)
     ; GFX8: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
     ; GFX9-LABEL: name: load_global_s32_from_1_gep_m8192
     ; GFX9: liveins: $vgpr0_vgpr1
@@ -2003,7 +2003,7 @@ body: |
     ; GFX9: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX9: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX9: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1
-    ; GFX9: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1)
+    ; GFX9: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1)
     ; GFX9: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]]
     ; GFX10-LABEL: name: load_global_s32_from_1_gep_m8192
     ; GFX10: liveins: $vgpr0_vgpr1
@@ -2018,7 +2018,7 @@ body: |
     ; GFX10: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX10: %9:vgpr_32, dead %11:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX10: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1
-    ; GFX10: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1)
+    ; GFX10: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1)
     ; GFX10: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]]
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s64) = G_CONSTANT i64 -8192

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-global.s96.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-global.s96.mir
index 75742e47e05e..ba20a503abef 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-global.s96.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-global.s96.mir
@@ -1,3 +1,4 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
 # RUN: llc -amdgpu-global-isel-new-legality -march=amdgcn -mcpu=hawaii -run-pass=instruction-select -verify-machineinstrs  -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX7 %s
 # RUN: llc -amdgpu-global-isel-new-legality -march=amdgcn -mcpu=hawaii -mattr=+flat-for-global -run-pass=instruction-select -verify-machineinstrs  -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX7-FLAT %s
 # RUN: llc -amdgpu-global-isel-new-legality -march=amdgcn -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX8 %s
@@ -23,27 +24,27 @@ body: |
     ; GFX7: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
     ; GFX7: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
     ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
-    ; GFX7: [[BUFFER_LOAD_DWORDX3_ADDR64_:%[0-9]+]]:vreg_96 = BUFFER_LOAD_DWORDX3_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 12, align 4, addrspace 1)
+    ; GFX7: [[BUFFER_LOAD_DWORDX3_ADDR64_:%[0-9]+]]:vreg_96 = BUFFER_LOAD_DWORDX3_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 12, align 4, addrspace 1)
     ; GFX7: $vgpr0_vgpr1_vgpr2 = COPY [[BUFFER_LOAD_DWORDX3_ADDR64_]]
     ; GFX7-FLAT-LABEL: name: load_global_v3s32
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1
     ; GFX7-FLAT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX7-FLAT: [[FLAT_LOAD_DWORDX3_:%[0-9]+]]:vreg_96 = FLAT_LOAD_DWORDX3 [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 12, align 4, addrspace 1)
+    ; GFX7-FLAT: [[FLAT_LOAD_DWORDX3_:%[0-9]+]]:vreg_96 = FLAT_LOAD_DWORDX3 [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 12, align 4, addrspace 1)
     ; GFX7-FLAT: $vgpr0_vgpr1_vgpr2 = COPY [[FLAT_LOAD_DWORDX3_]]
     ; GFX8-LABEL: name: load_global_v3s32
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX8: [[FLAT_LOAD_DWORDX3_:%[0-9]+]]:vreg_96 = FLAT_LOAD_DWORDX3 [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 12, align 4, addrspace 1)
+    ; GFX8: [[FLAT_LOAD_DWORDX3_:%[0-9]+]]:vreg_96 = FLAT_LOAD_DWORDX3 [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 12, align 4, addrspace 1)
     ; GFX8: $vgpr0_vgpr1_vgpr2 = COPY [[FLAT_LOAD_DWORDX3_]]
     ; GFX9-LABEL: name: load_global_v3s32
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX9: [[GLOBAL_LOAD_DWORDX3_:%[0-9]+]]:vreg_96 = GLOBAL_LOAD_DWORDX3 [[COPY]], 0, 0, 0, 0, implicit $exec :: (load 12, align 4, addrspace 1)
+    ; GFX9: [[GLOBAL_LOAD_DWORDX3_:%[0-9]+]]:vreg_96 = GLOBAL_LOAD_DWORDX3 [[COPY]], 0, 0, 0, 0, 0, implicit $exec :: (load 12, align 4, addrspace 1)
     ; GFX9: $vgpr0_vgpr1_vgpr2 = COPY [[GLOBAL_LOAD_DWORDX3_]]
     ; GFX10-LABEL: name: load_global_v3s32
     ; GFX10: liveins: $vgpr0_vgpr1
     ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX10: [[GLOBAL_LOAD_DWORDX3_:%[0-9]+]]:vreg_96 = GLOBAL_LOAD_DWORDX3 [[COPY]], 0, 0, 0, 0, implicit $exec :: (load 12, align 4, addrspace 1)
+    ; GFX10: [[GLOBAL_LOAD_DWORDX3_:%[0-9]+]]:vreg_96 = GLOBAL_LOAD_DWORDX3 [[COPY]], 0, 0, 0, 0, 0, implicit $exec :: (load 12, align 4, addrspace 1)
     ; GFX10: $vgpr0_vgpr1_vgpr2 = COPY [[GLOBAL_LOAD_DWORDX3_]]
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(<3 x  s32>) = G_LOAD %0 :: (load 12, align 4, addrspace 1)
@@ -134,4 +135,3 @@ body: |
     $vgpr0_vgpr1_vgpr2 = COPY %1
 
 ...
-

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-private.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-private.mir
index 4758118a2255..77bbf8375f2f 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-private.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-private.mir
@@ -19,12 +19,12 @@ body: |
     ; GFX6-LABEL: name: load_private_s32_from_4
     ; GFX6: liveins: $vgpr0
     ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX6: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5)
+    ; GFX6: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5)
     ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
     ; GFX9-LABEL: name: load_private_s32_from_4
     ; GFX9: liveins: $vgpr0
     ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX9: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5)
+    ; GFX9: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5)
     ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
     %0:vgpr(p5) = COPY $vgpr0
     %1:vgpr(s32) = G_LOAD %0 :: (load 4, align 4, addrspace 5)
@@ -49,12 +49,12 @@ body: |
     ; GFX6-LABEL: name: load_private_s32_from_2
     ; GFX6: liveins: $vgpr0
     ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX6: [[BUFFER_LOAD_USHORT_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 2, addrspace 5)
+    ; GFX6: [[BUFFER_LOAD_USHORT_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 2, addrspace 5)
     ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_USHORT_OFFEN]]
     ; GFX9-LABEL: name: load_private_s32_from_2
     ; GFX9: liveins: $vgpr0
     ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX9: [[BUFFER_LOAD_USHORT_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 2, addrspace 5)
+    ; GFX9: [[BUFFER_LOAD_USHORT_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 2, addrspace 5)
     ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_USHORT_OFFEN]]
     %0:vgpr(p5) = COPY $vgpr0
     %1:vgpr(s32) = G_LOAD %0 :: (load 2, align 2, addrspace 5)
@@ -79,12 +79,12 @@ body: |
     ; GFX6-LABEL: name: load_private_s32_from_1
     ; GFX6: liveins: $vgpr0
     ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
+    ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
     ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
     ; GFX9-LABEL: name: load_private_s32_from_1
     ; GFX9: liveins: $vgpr0
     ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
+    ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
     ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
     %0:vgpr(p5) = COPY $vgpr0
     %1:vgpr(s32) = G_LOAD %0 :: (load 1, align 1, addrspace 5)
@@ -109,12 +109,12 @@ body: |
     ; GFX6-LABEL: name: load_private_p3_from_4
     ; GFX6: liveins: $vgpr0
     ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX6: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5)
+    ; GFX6: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5)
     ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
     ; GFX9-LABEL: name: load_private_p3_from_4
     ; GFX9: liveins: $vgpr0
     ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX9: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5)
+    ; GFX9: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5)
     ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
     %0:vgpr(p5) = COPY $vgpr0
     %1:vgpr(p3) = G_LOAD %0 :: (load 4, align 4, addrspace 5)
@@ -139,12 +139,12 @@ body: |
     ; GFX6-LABEL: name: load_private_p5_from_4
     ; GFX6: liveins: $vgpr0
     ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX6: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5)
+    ; GFX6: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5)
     ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
     ; GFX9-LABEL: name: load_private_p5_from_4
     ; GFX9: liveins: $vgpr0
     ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX9: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5)
+    ; GFX9: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5)
     ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
     %0:vgpr(p5) = COPY $vgpr0
     %1:vgpr(p5) = G_LOAD %0 :: (load 4, align 4, addrspace 5)
@@ -170,12 +170,12 @@ body: |
     ; GFX6-LABEL: name: load_private_v2s16
     ; GFX6: liveins: $vgpr0
     ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX6: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5)
+    ; GFX6: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5)
     ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
     ; GFX9-LABEL: name: load_private_v2s16
     ; GFX9: liveins: $vgpr0
     ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX9: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5)
+    ; GFX9: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5)
     ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
     %0:vgpr(p5) = COPY $vgpr0
     %1:vgpr(<2 x s16>) = G_LOAD %0 :: (load 4, align 4, addrspace 5)
@@ -206,12 +206,12 @@ body: |
     ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2047, implicit $exec
     ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
-    ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
+    ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
     ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
     ; GFX9-LABEL: name: load_private_s32_from_1_gep_2047
     ; GFX9: liveins: $vgpr0
     ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 2047, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
+    ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 2047, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
     ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
     %0:vgpr(p5) = COPY $vgpr0
     %1:vgpr(s32) = G_CONSTANT i32 2047
@@ -240,14 +240,14 @@ body: |
     ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2147483647, implicit $exec
     ; GFX6: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY]], [[V_MOV_B32_e32_]], implicit $exec
-    ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_AND_B32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 2047, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
+    ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_AND_B32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 2047, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
     ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
     ; GFX9-LABEL: name: load_private_s32_from_1_gep_2047_known_bits
     ; GFX9: liveins: $vgpr0
     ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2147483647, implicit $exec
     ; GFX9: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY]], [[V_MOV_B32_e32_]], implicit $exec
-    ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_AND_B32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 2047, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
+    ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_AND_B32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 2047, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
     ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(s32) = G_CONSTANT i32 2147483647
@@ -279,12 +279,12 @@ body: |
     ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2048, implicit $exec
     ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
-    ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
+    ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
     ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
     ; GFX9-LABEL: name: load_private_s32_from_1_gep_2048
     ; GFX9: liveins: $vgpr0
     ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 2048, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
+    ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 2048, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
     ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
     %0:vgpr(p5) = COPY $vgpr0
     %1:vgpr(s32) = G_CONSTANT i32 2048
@@ -313,14 +313,14 @@ body: |
     ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -2047, implicit $exec
     ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
-    ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
+    ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
     ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
     ; GFX9-LABEL: name: load_private_s32_from_1_gep_m2047
     ; GFX9: liveins: $vgpr0
     ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -2047, implicit $exec
     ; GFX9: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
-    ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
+    ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
     ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
     %0:vgpr(p5) = COPY $vgpr0
     %1:vgpr(s32) = G_CONSTANT i32 -2047
@@ -349,14 +349,14 @@ body: |
     ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -2048, implicit $exec
     ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
-    ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
+    ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
     ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
     ; GFX9-LABEL: name: load_private_s32_from_1_gep_m2048
     ; GFX9: liveins: $vgpr0
     ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -2048, implicit $exec
     ; GFX9: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
-    ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
+    ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
     ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
     %0:vgpr(p5) = COPY $vgpr0
     %1:vgpr(s32) = G_CONSTANT i32 -2048
@@ -385,12 +385,12 @@ body: |
     ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4095, implicit $exec
     ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
-    ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
+    ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
     ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
     ; GFX9-LABEL: name: load_private_s32_from_1_gep_4095
     ; GFX9: liveins: $vgpr0
     ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4095, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
+    ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4095, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
     ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
     %0:vgpr(p5) = COPY $vgpr0
     %1:vgpr(s32) = G_CONSTANT i32 4095
@@ -419,14 +419,14 @@ body: |
     ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec
     ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
-    ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
+    ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
     ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
     ; GFX9-LABEL: name: load_private_s32_from_1_gep_4096
     ; GFX9: liveins: $vgpr0
     ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec
     ; GFX9: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
-    ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
+    ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
     ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
     %0:vgpr(p5) = COPY $vgpr0
     %1:vgpr(s32) = G_CONSTANT i32 4096
@@ -455,14 +455,14 @@ body: |
     ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -4095, implicit $exec
     ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
-    ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
+    ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
     ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
     ; GFX9-LABEL: name: load_private_s32_from_1_gep_m4095
     ; GFX9: liveins: $vgpr0
     ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -4095, implicit $exec
     ; GFX9: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
-    ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
+    ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
     ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
     %0:vgpr(p5) = COPY $vgpr0
     %1:vgpr(s32) = G_CONSTANT i32 -4095
@@ -491,14 +491,14 @@ body: |
     ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -4096, implicit $exec
     ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
-    ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
+    ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
     ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
     ; GFX9-LABEL: name: load_private_s32_from_1_gep_m4096
     ; GFX9: liveins: $vgpr0
     ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -4096, implicit $exec
     ; GFX9: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
-    ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
+    ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
     ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
     %0:vgpr(p5) = COPY $vgpr0
     %1:vgpr(s32) = G_CONSTANT i32 -4096
@@ -527,14 +527,14 @@ body: |
     ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 8191, implicit $exec
     ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
-    ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
+    ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
     ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
     ; GFX9-LABEL: name: load_private_s32_from_1_gep_8191
     ; GFX9: liveins: $vgpr0
     ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 8191, implicit $exec
     ; GFX9: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
-    ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
+    ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
     ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
     %0:vgpr(p5) = COPY $vgpr0
     %1:vgpr(s32) = G_CONSTANT i32 8191
@@ -563,14 +563,14 @@ body: |
     ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 8192, implicit $exec
     ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
-    ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
+    ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
     ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
     ; GFX9-LABEL: name: load_private_s32_from_1_gep_8192
     ; GFX9: liveins: $vgpr0
     ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 8192, implicit $exec
     ; GFX9: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
-    ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
+    ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
     ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
     %0:vgpr(p5) = COPY $vgpr0
     %1:vgpr(s32) = G_CONSTANT i32 8192
@@ -599,14 +599,14 @@ body: |
     ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -8191, implicit $exec
     ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
-    ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
+    ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
     ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
     ; GFX9-LABEL: name: load_private_s32_from_1_gep_m8191
     ; GFX9: liveins: $vgpr0
     ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -8191, implicit $exec
     ; GFX9: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
-    ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
+    ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
     ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
     %0:vgpr(p5) = COPY $vgpr0
     %1:vgpr(s32) = G_CONSTANT i32 -8191
@@ -635,14 +635,14 @@ body: |
     ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -8192, implicit $exec
     ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
-    ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
+    ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
     ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
     ; GFX9-LABEL: name: load_private_s32_from_1_gep_m8192
     ; GFX9: liveins: $vgpr0
     ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -8192, implicit $exec
     ; GFX9: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
-    ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
+    ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
     ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
     %0:vgpr(p5) = COPY $vgpr0
     %1:vgpr(s32) = G_CONSTANT i32 -8192
@@ -666,10 +666,10 @@ body: |
   bb.0:
 
     ; GFX6-LABEL: name: load_private_s32_from_4_constant_0
-    ; GFX6: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5)
+    ; GFX6: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5)
     ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFSET]]
     ; GFX9-LABEL: name: load_private_s32_from_4_constant_0
-    ; GFX9: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5)
+    ; GFX9: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5)
     ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFSET]]
     %0:vgpr(p5) = G_CONSTANT i32 0
     %1:vgpr(s32) = G_LOAD %0 :: (load 4, align 4, addrspace 5)
@@ -691,10 +691,10 @@ body: |
   bb.0:
 
     ; GFX6-LABEL: name: load_private_s32_from_4_constant_sgpr_16
-    ; GFX6: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 16, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5)
+    ; GFX6: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 16, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5)
     ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFSET]]
     ; GFX9-LABEL: name: load_private_s32_from_4_constant_sgpr_16
-    ; GFX9: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 16, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5)
+    ; GFX9: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 16, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5)
     ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFSET]]
     %0:sgpr(p5) = G_CONSTANT i32 16
     %1:vgpr(s32) = G_LOAD %0 :: (load 4, align 4, addrspace 5)
@@ -716,10 +716,10 @@ body: |
   bb.0:
 
     ; GFX6-LABEL: name: load_private_s32_from_1_constant_4095
-    ; GFX6: [[BUFFER_LOAD_UBYTE_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4095, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
+    ; GFX6: [[BUFFER_LOAD_UBYTE_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4095, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
     ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFSET]]
     ; GFX9-LABEL: name: load_private_s32_from_1_constant_4095
-    ; GFX9: [[BUFFER_LOAD_UBYTE_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4095, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
+    ; GFX9: [[BUFFER_LOAD_UBYTE_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4095, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
     ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFSET]]
     %0:vgpr(p5) = G_CONSTANT i32 4095
     %1:vgpr(s32) = G_LOAD %0 :: (load 1, align 1, addrspace 5)
@@ -742,11 +742,11 @@ body: |
 
     ; GFX6-LABEL: name: load_private_s32_from_1_constant_4096
     ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec
-    ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_MOV_B32_e32_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
+    ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_MOV_B32_e32_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
     ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
     ; GFX9-LABEL: name: load_private_s32_from_1_constant_4096
     ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec
-    ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_MOV_B32_e32_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
+    ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_MOV_B32_e32_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
     ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
     %0:vgpr(p5) = G_CONSTANT i32 4096
     %1:vgpr(s32) = G_LOAD %0 :: (load 1, align 1, addrspace 5)
@@ -770,10 +770,10 @@ body: |
   bb.0:
 
     ; GFX6-LABEL: name: load_private_s32_from_fi
-    ; GFX6: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5)
+    ; GFX6: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5)
     ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
     ; GFX9-LABEL: name: load_private_s32_from_fi
-    ; GFX9: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5)
+    ; GFX9: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5)
     ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
     %0:vgpr(p5) = G_FRAME_INDEX %stack.0
     %1:vgpr(s32) = G_LOAD %0 :: (load 4, align 4, addrspace 5)
@@ -797,10 +797,10 @@ body: |
   bb.0:
 
     ; GFX6-LABEL: name: load_private_s32_from_1_fi_offset_4095
-    ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4095, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
+    ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4095, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
     ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
     ; GFX9-LABEL: name: load_private_s32_from_1_fi_offset_4095
-    ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4095, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
+    ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4095, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
     ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
     %0:vgpr(p5) = G_FRAME_INDEX %stack.0
     %1:vgpr(s32) = G_CONSTANT i32 4095
@@ -829,13 +829,13 @@ body: |
     ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
     ; GFX6: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec
     ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_CO_U32_e64 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], 0, implicit $exec
-    ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
+    ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
     ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
     ; GFX9-LABEL: name: load_private_s32_from_1_fi_offset_4096
     ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
     ; GFX9: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec
     ; GFX9: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], 0, implicit $exec
-    ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
+    ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
     ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
     %0:vgpr(p5) = G_FRAME_INDEX %stack.0
     %1:vgpr(s32) = G_CONSTANT i32 4096
@@ -861,11 +861,11 @@ body: |
 
     ; GFX6-LABEL: name: load_private_s32_from_neg1
     ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
-    ; GFX6: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_MOV_B32_e32_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, -1, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5)
+    ; GFX6: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_MOV_B32_e32_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5)
     ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
     ; GFX9-LABEL: name: load_private_s32_from_neg1
     ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
-    ; GFX9: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_MOV_B32_e32_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, -1, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5)
+    ; GFX9: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_MOV_B32_e32_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5)
     ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
     %0:vgpr(p5) = G_CONSTANT i32 -1
     %1:vgpr(s32) = G_LOAD %0 :: (load 4, align 4, addrspace 5)

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-sitofp.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-sitofp.mir
index 2af468ea9baa..0bb46b364eaf 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-sitofp.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-sitofp.mir
@@ -18,16 +18,16 @@ body: |
     ; WAVE64: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr3_vgpr4
     ; WAVE64: [[V_CVT_F32_I32_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_I32_e64 [[COPY]], 0, 0, implicit $mode, implicit $exec
     ; WAVE64: [[V_CVT_F32_I32_e64_1:%[0-9]+]]:vgpr_32 = V_CVT_F32_I32_e64 [[COPY1]], 0, 0, implicit $mode, implicit $exec
-    ; WAVE64: FLAT_STORE_DWORD [[COPY2]], [[V_CVT_F32_I32_e64_]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
-    ; WAVE64: FLAT_STORE_DWORD [[COPY2]], [[V_CVT_F32_I32_e64_1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
+    ; WAVE64: FLAT_STORE_DWORD [[COPY2]], [[V_CVT_F32_I32_e64_]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
+    ; WAVE64: FLAT_STORE_DWORD [[COPY2]], [[V_CVT_F32_I32_e64_1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
     ; WAVE32-LABEL: name: sitofp
     ; WAVE32: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
     ; WAVE32: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; WAVE32: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr3_vgpr4
     ; WAVE32: [[V_CVT_F32_I32_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_I32_e64 [[COPY]], 0, 0, implicit $mode, implicit $exec
     ; WAVE32: [[V_CVT_F32_I32_e64_1:%[0-9]+]]:vgpr_32 = V_CVT_F32_I32_e64 [[COPY1]], 0, 0, implicit $mode, implicit $exec
-    ; WAVE32: GLOBAL_STORE_DWORD [[COPY2]], [[V_CVT_F32_I32_e64_]], 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
-    ; WAVE32: GLOBAL_STORE_DWORD [[COPY2]], [[V_CVT_F32_I32_e64_1]], 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
+    ; WAVE32: GLOBAL_STORE_DWORD [[COPY2]], [[V_CVT_F32_I32_e64_]], 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
+    ; WAVE32: GLOBAL_STORE_DWORD [[COPY2]], [[V_CVT_F32_I32_e64_1]], 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
     %0:sgpr(s32) = COPY $sgpr0
 
     %1:vgpr(s32) = COPY $vgpr0

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-atomic-flat.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-atomic-flat.mir
index 379936c640e7..83e455b9af5b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-atomic-flat.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-atomic-flat.mir
@@ -17,12 +17,12 @@ body: |
     ; GFX7: liveins: $vgpr0, $vgpr1_vgpr2
     ; GFX7: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX7: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr1_vgpr2
-    ; GFX7: FLAT_STORE_DWORD [[COPY1]], [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store seq_cst 4)
+    ; GFX7: FLAT_STORE_DWORD [[COPY1]], [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store seq_cst 4)
     ; GFX9-LABEL: name: atomic_store_flat_s32_seq_cst
     ; GFX9: liveins: $vgpr0, $vgpr1_vgpr2
     ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX9: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr1_vgpr2
-    ; GFX9: FLAT_STORE_DWORD [[COPY1]], [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store seq_cst 4)
+    ; GFX9: FLAT_STORE_DWORD [[COPY1]], [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store seq_cst 4)
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(p0) = COPY $vgpr1_vgpr2
     G_STORE %0, %1 :: (store seq_cst 4, align 4, addrspace 0)
@@ -152,12 +152,12 @@ body: |
     ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
-    ; GFX7: FLAT_STORE_DWORDX2 [[COPY1]], [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store seq_cst 8)
+    ; GFX7: FLAT_STORE_DWORDX2 [[COPY1]], [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store seq_cst 8)
     ; GFX9-LABEL: name: atomic_store_flat_s64_seq_cst
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
-    ; GFX9: FLAT_STORE_DWORDX2 [[COPY1]], [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store seq_cst 8)
+    ; GFX9: FLAT_STORE_DWORDX2 [[COPY1]], [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store seq_cst 8)
     %0:vgpr(s64) = COPY $vgpr0_vgpr1
     %1:vgpr(p0) = COPY $vgpr2_vgpr3
     G_STORE %0, %1 :: (store seq_cst 8, align 8, addrspace 0)

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-flat.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-flat.mir
index 143f17cc5bc3..5b53dbea6b23 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-flat.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-flat.mir
@@ -19,22 +19,22 @@ body: |
     ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX7: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4)
+    ; GFX7: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4)
     ; GFX8-LABEL: name: store_flat_s32_to_4
     ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX8: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX8: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4)
+    ; GFX8: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4)
     ; GFX9-LABEL: name: store_flat_s32_to_4
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX9: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4)
+    ; GFX9: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4)
     ; GFX10-LABEL: name: store_flat_s32_to_4
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX10: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4)
+    ; GFX10: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4)
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s32) = COPY $vgpr2
     G_STORE %1, %0 :: (store 4, align 4, addrspace 0)
@@ -55,22 +55,22 @@ body: |
     ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX7: FLAT_STORE_SHORT [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 2)
+    ; GFX7: FLAT_STORE_SHORT [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 2)
     ; GFX8-LABEL: name: store_flat_s32_to_2
     ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX8: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX8: FLAT_STORE_SHORT [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 2)
+    ; GFX8: FLAT_STORE_SHORT [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 2)
     ; GFX9-LABEL: name: store_flat_s32_to_2
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX9: FLAT_STORE_SHORT [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 2)
+    ; GFX9: FLAT_STORE_SHORT [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 2)
     ; GFX10-LABEL: name: store_flat_s32_to_2
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX10: FLAT_STORE_SHORT [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 2)
+    ; GFX10: FLAT_STORE_SHORT [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 2)
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s32) = COPY $vgpr2
     G_STORE %1, %0 :: (store 2, align 2, addrspace 0)
@@ -91,22 +91,22 @@ body: |
     ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX7: FLAT_STORE_BYTE [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 1)
+    ; GFX7: FLAT_STORE_BYTE [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 1)
     ; GFX8-LABEL: name: store_flat_s32_to_1
     ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX8: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX8: FLAT_STORE_BYTE [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 1)
+    ; GFX8: FLAT_STORE_BYTE [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 1)
     ; GFX9-LABEL: name: store_flat_s32_to_1
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX9: FLAT_STORE_BYTE [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 1)
+    ; GFX9: FLAT_STORE_BYTE [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 1)
     ; GFX10-LABEL: name: store_flat_s32_to_1
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX10: FLAT_STORE_BYTE [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 1)
+    ; GFX10: FLAT_STORE_BYTE [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 1)
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s32) = COPY $vgpr2
     G_STORE %1, %0 :: (store 1, align 1, addrspace 0)
@@ -128,22 +128,22 @@ body: |
     ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
-    ; GFX7: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 8)
+    ; GFX7: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 8)
     ; GFX8-LABEL: name: store_flat_s64
     ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX8: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
-    ; GFX8: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 8)
+    ; GFX8: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 8)
     ; GFX9-LABEL: name: store_flat_s64
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
-    ; GFX9: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 8)
+    ; GFX9: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 8)
     ; GFX10-LABEL: name: store_flat_s64
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
-    ; GFX10: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 8)
+    ; GFX10: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 8)
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s64) = COPY $vgpr2_vgpr3
     G_STORE %1, %0 :: (store 8, align 8, addrspace 0)
@@ -237,22 +237,22 @@ body: |
     ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
-    ; GFX7: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 8)
+    ; GFX7: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 8)
     ; GFX8-LABEL: name: store_flat_v2s32
     ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX8: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
-    ; GFX8: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 8)
+    ; GFX8: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 8)
     ; GFX9-LABEL: name: store_flat_v2s32
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
-    ; GFX9: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 8)
+    ; GFX9: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 8)
     ; GFX10-LABEL: name: store_flat_v2s32
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
-    ; GFX10: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 8)
+    ; GFX10: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 8)
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(<2 x s32>) = COPY $vgpr2_vgpr3
     G_STORE %1, %0 :: (store 8, align 8, addrspace 0)
@@ -273,22 +273,22 @@ body: |
     ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4
     ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7: [[COPY1:%[0-9]+]]:vreg_96 = COPY $vgpr2_vgpr3_vgpr4
-    ; GFX7: FLAT_STORE_DWORDX3 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 12, align 16)
+    ; GFX7: FLAT_STORE_DWORDX3 [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 12, align 16)
     ; GFX8-LABEL: name: store_flat_v3s32
     ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4
     ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX8: [[COPY1:%[0-9]+]]:vreg_96 = COPY $vgpr2_vgpr3_vgpr4
-    ; GFX8: FLAT_STORE_DWORDX3 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 12, align 16)
+    ; GFX8: FLAT_STORE_DWORDX3 [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 12, align 16)
     ; GFX9-LABEL: name: store_flat_v3s32
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4
     ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9: [[COPY1:%[0-9]+]]:vreg_96 = COPY $vgpr2_vgpr3_vgpr4
-    ; GFX9: FLAT_STORE_DWORDX3 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 12, align 16)
+    ; GFX9: FLAT_STORE_DWORDX3 [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 12, align 16)
     ; GFX10-LABEL: name: store_flat_v3s32
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4
     ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10: [[COPY1:%[0-9]+]]:vreg_96 = COPY $vgpr2_vgpr3_vgpr4
-    ; GFX10: FLAT_STORE_DWORDX3 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 12, align 16)
+    ; GFX10: FLAT_STORE_DWORDX3 [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 12, align 16)
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(<3 x s32>) = COPY $vgpr2_vgpr3_vgpr4
     G_STORE %1, %0 :: (store 12, align 16, addrspace 0)
@@ -309,22 +309,22 @@ body: |
     ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr2_vgpr3_vgpr4_vgpr5
-    ; GFX7: FLAT_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 16)
+    ; GFX7: FLAT_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 16)
     ; GFX8-LABEL: name: store_flat_v4s32
     ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX8: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr2_vgpr3_vgpr4_vgpr5
-    ; GFX8: FLAT_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 16)
+    ; GFX8: FLAT_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 16)
     ; GFX9-LABEL: name: store_flat_v4s32
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr2_vgpr3_vgpr4_vgpr5
-    ; GFX9: FLAT_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 16)
+    ; GFX9: FLAT_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 16)
     ; GFX10-LABEL: name: store_flat_v4s32
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr2_vgpr3_vgpr4_vgpr5
-    ; GFX10: FLAT_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 16)
+    ; GFX10: FLAT_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 16)
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(<4 x s32>) = COPY $vgpr2_vgpr3_vgpr4_vgpr5
     G_STORE %1, %0 :: (store 16, align 16, addrspace 0)
@@ -346,22 +346,22 @@ body: |
     ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX7: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4)
+    ; GFX7: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4)
     ; GFX8-LABEL: name: store_flat_v2s16
     ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX8: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX8: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4)
+    ; GFX8: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4)
     ; GFX9-LABEL: name: store_flat_v2s16
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX9: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4)
+    ; GFX9: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4)
     ; GFX10-LABEL: name: store_flat_v2s16
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX10: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4)
+    ; GFX10: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4)
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(<2 x s16>) = COPY $vgpr2
     G_STORE %1, %0 :: (store 4, align 4, addrspace 0)
@@ -383,22 +383,22 @@ body: |
     ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
-    ; GFX7: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 8)
+    ; GFX7: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 8)
     ; GFX8-LABEL: name: store_flat_v4s16
     ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX8: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
-    ; GFX8: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 8)
+    ; GFX8: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 8)
     ; GFX9-LABEL: name: store_flat_v4s16
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
-    ; GFX9: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 8)
+    ; GFX9: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 8)
     ; GFX10-LABEL: name: store_flat_v4s16
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
-    ; GFX10: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 8)
+    ; GFX10: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 8)
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(<4 x s16>) = COPY $vgpr2_vgpr3
     G_STORE %1, %0 :: (store 8, align 8, addrspace 0)
@@ -493,22 +493,22 @@ body: |
     ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr2_vgpr3_vgpr4_vgpr5
-    ; GFX7: FLAT_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 16)
+    ; GFX7: FLAT_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 16)
     ; GFX8-LABEL: name: store_flat_v2s64
     ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX8: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr2_vgpr3_vgpr4_vgpr5
-    ; GFX8: FLAT_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 16)
+    ; GFX8: FLAT_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 16)
     ; GFX9-LABEL: name: store_flat_v2s64
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr2_vgpr3_vgpr4_vgpr5
-    ; GFX9: FLAT_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 16)
+    ; GFX9: FLAT_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 16)
     ; GFX10-LABEL: name: store_flat_v2s64
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr2_vgpr3_vgpr4_vgpr5
-    ; GFX10: FLAT_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 16)
+    ; GFX10: FLAT_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 16)
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(<2 x s64>) = COPY $vgpr2_vgpr3_vgpr4_vgpr5
     G_STORE %1, %0 :: (store 16, align 16, addrspace 0)
@@ -530,22 +530,22 @@ body: |
     ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
-    ; GFX7: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 8)
+    ; GFX7: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 8)
     ; GFX8-LABEL: name: store_flat_p1
     ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX8: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
-    ; GFX8: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 8)
+    ; GFX8: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 8)
     ; GFX9-LABEL: name: store_flat_p1
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
-    ; GFX9: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 8)
+    ; GFX9: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 8)
     ; GFX10-LABEL: name: store_flat_p1
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
-    ; GFX10: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 8)
+    ; GFX10: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 8)
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(p1) = COPY $vgpr2_vgpr3
     G_STORE %1, %0 :: (store 8, align 8, addrspace 0)
@@ -604,22 +604,22 @@ body: |
     ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX7: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4)
+    ; GFX7: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4)
     ; GFX8-LABEL: name: store_flat_p3
     ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX8: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX8: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4)
+    ; GFX8: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4)
     ; GFX9-LABEL: name: store_flat_p3
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX9: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4)
+    ; GFX9: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4)
     ; GFX10-LABEL: name: store_flat_p3
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX10: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4)
+    ; GFX10: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4)
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(p3) = COPY $vgpr2
     G_STORE %1, %0 :: (store 4, align 4, addrspace 0)
@@ -677,22 +677,22 @@ body: |
     ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX7: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store monotonic 4)
+    ; GFX7: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store monotonic 4)
     ; GFX8-LABEL: name: store_atomic_flat_s32
     ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX8: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX8: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store monotonic 4)
+    ; GFX8: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store monotonic 4)
     ; GFX9-LABEL: name: store_atomic_flat_s32
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX9: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store monotonic 4)
+    ; GFX9: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store monotonic 4)
     ; GFX10-LABEL: name: store_atomic_flat_s32
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX10: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store monotonic 4)
+    ; GFX10: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store monotonic 4)
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s32) = COPY $vgpr2
     G_STORE %1, %0 :: (store monotonic 4, align 4, addrspace 0)
@@ -714,22 +714,22 @@ body: |
     ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
-    ; GFX7: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store monotonic 8)
+    ; GFX7: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store monotonic 8)
     ; GFX8-LABEL: name: store_atomic_flat_s64
     ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX8: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
-    ; GFX8: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store monotonic 8)
+    ; GFX8: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store monotonic 8)
     ; GFX9-LABEL: name: store_atomic_flat_s64
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
-    ; GFX9: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store monotonic 8)
+    ; GFX9: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store monotonic 8)
     ; GFX10-LABEL: name: store_atomic_flat_s64
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
-    ; GFX10: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store monotonic 8)
+    ; GFX10: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store monotonic 8)
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s64) = COPY $vgpr2_vgpr3
     G_STORE %1, %0 :: (store monotonic 8, align 8, addrspace 0)
@@ -761,7 +761,7 @@ body: |
     ; GFX7: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
     ; GFX7: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1
-    ; GFX7: FLAT_STORE_DWORD [[REG_SEQUENCE1]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4)
+    ; GFX7: FLAT_STORE_DWORD [[REG_SEQUENCE1]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4)
     ; GFX8-LABEL: name: store_flat_s32_gep_2047
     ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
@@ -776,12 +776,12 @@ body: |
     ; GFX8: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
     ; GFX8: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX8: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1
-    ; GFX8: FLAT_STORE_DWORD [[REG_SEQUENCE1]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4)
+    ; GFX8: FLAT_STORE_DWORD [[REG_SEQUENCE1]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4)
     ; GFX9-LABEL: name: store_flat_s32_gep_2047
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX9: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 2047, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4)
+    ; GFX9: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 2047, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4)
     ; GFX10-LABEL: name: store_flat_s32_gep_2047
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
@@ -796,7 +796,7 @@ body: |
     ; GFX10: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
     ; GFX10: %9:vgpr_32, dead %11:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX10: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1
-    ; GFX10: FLAT_STORE_DWORD [[REG_SEQUENCE1]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4)
+    ; GFX10: FLAT_STORE_DWORD [[REG_SEQUENCE1]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4)
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s32) = COPY $vgpr2
     %2:vgpr(s64) = G_CONSTANT i64 2047

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-global.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-global.mir
index 740894a5a747..009353d52bd6 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-global.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-global.mir
@@ -26,7 +26,7 @@ body: |
     ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
     ; GFX6: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
     ; GFX6: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
-    ; GFX6: BUFFER_STORE_DWORD_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
+    ; GFX6: BUFFER_STORE_DWORD_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
     ; GFX7-LABEL: name: store_global_s32_to_4
     ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
@@ -36,27 +36,27 @@ body: |
     ; GFX7: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
     ; GFX7: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
     ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
-    ; GFX7: BUFFER_STORE_DWORD_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
+    ; GFX7: BUFFER_STORE_DWORD_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
     ; GFX7-FLAT-LABEL: name: store_global_s32_to_4
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX7-FLAT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7-FLAT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX7-FLAT: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
+    ; GFX7-FLAT: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
     ; GFX8-LABEL: name: store_global_s32_to_4
     ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX8: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX8: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
+    ; GFX8: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
     ; GFX9-LABEL: name: store_global_s32_to_4
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX9: GLOBAL_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
+    ; GFX9: GLOBAL_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
     ; GFX10-LABEL: name: store_global_s32_to_4
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX10: GLOBAL_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
+    ; GFX10: GLOBAL_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s32) = COPY $vgpr2
     G_STORE %1, %0 :: (store 4, align 4, addrspace 1)
@@ -82,7 +82,7 @@ body: |
     ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
     ; GFX6: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
     ; GFX6: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
-    ; GFX6: BUFFER_STORE_SHORT_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 2, addrspace 1)
+    ; GFX6: BUFFER_STORE_SHORT_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 2, addrspace 1)
     ; GFX7-LABEL: name: store_global_s32_to_2
     ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
@@ -92,27 +92,27 @@ body: |
     ; GFX7: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
     ; GFX7: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
     ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
-    ; GFX7: BUFFER_STORE_SHORT_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 2, addrspace 1)
+    ; GFX7: BUFFER_STORE_SHORT_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 2, addrspace 1)
     ; GFX7-FLAT-LABEL: name: store_global_s32_to_2
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX7-FLAT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7-FLAT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX7-FLAT: FLAT_STORE_SHORT [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 2, addrspace 1)
+    ; GFX7-FLAT: FLAT_STORE_SHORT [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 2, addrspace 1)
     ; GFX8-LABEL: name: store_global_s32_to_2
     ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX8: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX8: FLAT_STORE_SHORT [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 2, addrspace 1)
+    ; GFX8: FLAT_STORE_SHORT [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 2, addrspace 1)
     ; GFX9-LABEL: name: store_global_s32_to_2
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX9: GLOBAL_STORE_SHORT [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec :: (store 2, addrspace 1)
+    ; GFX9: GLOBAL_STORE_SHORT [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec :: (store 2, addrspace 1)
     ; GFX10-LABEL: name: store_global_s32_to_2
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX10: GLOBAL_STORE_SHORT [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec :: (store 2, addrspace 1)
+    ; GFX10: GLOBAL_STORE_SHORT [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec :: (store 2, addrspace 1)
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s32) = COPY $vgpr2
     G_STORE %1, %0 :: (store 2, align 2, addrspace 1)
@@ -138,7 +138,7 @@ body: |
     ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
     ; GFX6: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
     ; GFX6: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
-    ; GFX6: BUFFER_STORE_BYTE_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 1)
+    ; GFX6: BUFFER_STORE_BYTE_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 1)
     ; GFX7-LABEL: name: store_global_s32_to_1
     ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
@@ -148,27 +148,27 @@ body: |
     ; GFX7: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
     ; GFX7: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
     ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
-    ; GFX7: BUFFER_STORE_BYTE_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 1)
+    ; GFX7: BUFFER_STORE_BYTE_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 1)
     ; GFX7-FLAT-LABEL: name: store_global_s32_to_1
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX7-FLAT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7-FLAT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX7-FLAT: FLAT_STORE_BYTE [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 1, addrspace 1)
+    ; GFX7-FLAT: FLAT_STORE_BYTE [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 1, addrspace 1)
     ; GFX8-LABEL: name: store_global_s32_to_1
     ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX8: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX8: FLAT_STORE_BYTE [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 1, addrspace 1)
+    ; GFX8: FLAT_STORE_BYTE [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 1, addrspace 1)
     ; GFX9-LABEL: name: store_global_s32_to_1
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX9: GLOBAL_STORE_BYTE [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 1)
+    ; GFX9: GLOBAL_STORE_BYTE [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 1)
     ; GFX10-LABEL: name: store_global_s32_to_1
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX10: GLOBAL_STORE_BYTE [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 1)
+    ; GFX10: GLOBAL_STORE_BYTE [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 1)
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s32) = COPY $vgpr2
     G_STORE %1, %0 :: (store 1, align 1, addrspace 1)
@@ -195,27 +195,27 @@ body: |
     ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
-    ; GFX7: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 8, addrspace 1)
+    ; GFX7: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 8, addrspace 1)
     ; GFX7-FLAT-LABEL: name: store_global_s64
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX7-FLAT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7-FLAT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
-    ; GFX7-FLAT: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 8, addrspace 1)
+    ; GFX7-FLAT: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 8, addrspace 1)
     ; GFX8-LABEL: name: store_global_s64
     ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX8: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
-    ; GFX8: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 8, addrspace 1)
+    ; GFX8: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 8, addrspace 1)
     ; GFX9-LABEL: name: store_global_s64
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
-    ; GFX9: GLOBAL_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec :: (store 8, addrspace 1)
+    ; GFX9: GLOBAL_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec :: (store 8, addrspace 1)
     ; GFX10-LABEL: name: store_global_s64
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
-    ; GFX10: GLOBAL_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec :: (store 8, addrspace 1)
+    ; GFX10: GLOBAL_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec :: (store 8, addrspace 1)
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s64) = COPY $vgpr2_vgpr3
     G_STORE %1, %0 :: (store 8, align 8, addrspace 1)
@@ -288,7 +288,7 @@ body: |
     ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
     ; GFX6: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
     ; GFX6: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
-    ; GFX6: BUFFER_STORE_DWORDX2_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 8, addrspace 1)
+    ; GFX6: BUFFER_STORE_DWORDX2_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 8, addrspace 1)
     ; GFX7-LABEL: name: store_global_v2s32
     ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
@@ -298,27 +298,27 @@ body: |
     ; GFX7: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
     ; GFX7: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
     ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
-    ; GFX7: BUFFER_STORE_DWORDX2_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 8, addrspace 1)
+    ; GFX7: BUFFER_STORE_DWORDX2_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 8, addrspace 1)
     ; GFX7-FLAT-LABEL: name: store_global_v2s32
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX7-FLAT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7-FLAT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
-    ; GFX7-FLAT: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 8, addrspace 1)
+    ; GFX7-FLAT: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 8, addrspace 1)
     ; GFX8-LABEL: name: store_global_v2s32
     ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX8: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
-    ; GFX8: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 8, addrspace 1)
+    ; GFX8: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 8, addrspace 1)
     ; GFX9-LABEL: name: store_global_v2s32
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
-    ; GFX9: GLOBAL_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec :: (store 8, addrspace 1)
+    ; GFX9: GLOBAL_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec :: (store 8, addrspace 1)
     ; GFX10-LABEL: name: store_global_v2s32
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
-    ; GFX10: GLOBAL_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec :: (store 8, addrspace 1)
+    ; GFX10: GLOBAL_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec :: (store 8, addrspace 1)
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(<2 x s32>) = COPY $vgpr2_vgpr3
     G_STORE %1, %0 :: (store 8, align 8, addrspace 1)
@@ -344,7 +344,7 @@ body: |
     ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
     ; GFX6: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
     ; GFX6: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
-    ; GFX6: BUFFER_STORE_DWORDX4_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1)
+    ; GFX6: BUFFER_STORE_DWORDX4_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1)
     ; GFX7-LABEL: name: store_global_v4s32
     ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
@@ -354,27 +354,27 @@ body: |
     ; GFX7: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
     ; GFX7: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
     ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
-    ; GFX7: BUFFER_STORE_DWORDX4_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1)
+    ; GFX7: BUFFER_STORE_DWORDX4_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1)
     ; GFX7-FLAT-LABEL: name: store_global_v4s32
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX7-FLAT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7-FLAT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr2_vgpr3_vgpr4_vgpr5
-    ; GFX7-FLAT: FLAT_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 16, addrspace 1)
+    ; GFX7-FLAT: FLAT_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 16, addrspace 1)
     ; GFX8-LABEL: name: store_global_v4s32
     ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX8: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr2_vgpr3_vgpr4_vgpr5
-    ; GFX8: FLAT_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 16, addrspace 1)
+    ; GFX8: FLAT_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 16, addrspace 1)
     ; GFX9-LABEL: name: store_global_v4s32
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr2_vgpr3_vgpr4_vgpr5
-    ; GFX9: GLOBAL_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1)
+    ; GFX9: GLOBAL_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1)
     ; GFX10-LABEL: name: store_global_v4s32
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr2_vgpr3_vgpr4_vgpr5
-    ; GFX10: GLOBAL_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1)
+    ; GFX10: GLOBAL_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1)
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(<4 x s32>) = COPY $vgpr2_vgpr3_vgpr4_vgpr5
     G_STORE %1, %0 :: (store 16, align 16, addrspace 1)
@@ -401,27 +401,27 @@ body: |
     ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX7: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
+    ; GFX7: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
     ; GFX7-FLAT-LABEL: name: store_global_v2s16
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX7-FLAT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7-FLAT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX7-FLAT: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
+    ; GFX7-FLAT: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
     ; GFX8-LABEL: name: store_global_v2s16
     ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX8: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX8: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
+    ; GFX8: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
     ; GFX9-LABEL: name: store_global_v2s16
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX9: GLOBAL_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
+    ; GFX9: GLOBAL_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
     ; GFX10-LABEL: name: store_global_v2s16
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX10: GLOBAL_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
+    ; GFX10: GLOBAL_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(<2 x s16>) = COPY $vgpr2
     G_STORE %1, %0 :: (store 4, align 4, addrspace 1)
@@ -448,27 +448,27 @@ body: |
     ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
-    ; GFX7: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 8, addrspace 1)
+    ; GFX7: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 8, addrspace 1)
     ; GFX7-FLAT-LABEL: name: store_global_v4s16
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX7-FLAT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7-FLAT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
-    ; GFX7-FLAT: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 8, addrspace 1)
+    ; GFX7-FLAT: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 8, addrspace 1)
     ; GFX8-LABEL: name: store_global_v4s16
     ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX8: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
-    ; GFX8: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 8, addrspace 1)
+    ; GFX8: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 8, addrspace 1)
     ; GFX9-LABEL: name: store_global_v4s16
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
-    ; GFX9: GLOBAL_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec :: (store 8, addrspace 1)
+    ; GFX9: GLOBAL_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec :: (store 8, addrspace 1)
     ; GFX10-LABEL: name: store_global_v4s16
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
-    ; GFX10: GLOBAL_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec :: (store 8, addrspace 1)
+    ; GFX10: GLOBAL_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec :: (store 8, addrspace 1)
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(<4 x s16>) = COPY $vgpr2_vgpr3
     G_STORE %1, %0 :: (store 8, align 8, addrspace 1)
@@ -542,27 +542,27 @@ body: |
     ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr2_vgpr3_vgpr4_vgpr5
-    ; GFX7: FLAT_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 16, addrspace 1)
+    ; GFX7: FLAT_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 16, addrspace 1)
     ; GFX7-FLAT-LABEL: name: store_global_v2s64
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX7-FLAT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7-FLAT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr2_vgpr3_vgpr4_vgpr5
-    ; GFX7-FLAT: FLAT_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 16, addrspace 1)
+    ; GFX7-FLAT: FLAT_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 16, addrspace 1)
     ; GFX8-LABEL: name: store_global_v2s64
     ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX8: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr2_vgpr3_vgpr4_vgpr5
-    ; GFX8: FLAT_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 16, addrspace 1)
+    ; GFX8: FLAT_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 16, addrspace 1)
     ; GFX9-LABEL: name: store_global_v2s64
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr2_vgpr3_vgpr4_vgpr5
-    ; GFX9: GLOBAL_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1)
+    ; GFX9: GLOBAL_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1)
     ; GFX10-LABEL: name: store_global_v2s64
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr2_vgpr3_vgpr4_vgpr5
-    ; GFX10: GLOBAL_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1)
+    ; GFX10: GLOBAL_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1)
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(<2 x s64>) = COPY $vgpr2_vgpr3_vgpr4_vgpr5
     G_STORE %1, %0 :: (store 16, align 16, addrspace 1)
@@ -589,27 +589,27 @@ body: |
     ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
-    ; GFX7: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 8, addrspace 1)
+    ; GFX7: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 8, addrspace 1)
     ; GFX7-FLAT-LABEL: name: store_global_p1
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX7-FLAT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7-FLAT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
-    ; GFX7-FLAT: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 8, addrspace 1)
+    ; GFX7-FLAT: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 8, addrspace 1)
     ; GFX8-LABEL: name: store_global_p1
     ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX8: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
-    ; GFX8: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 8, addrspace 1)
+    ; GFX8: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 8, addrspace 1)
     ; GFX9-LABEL: name: store_global_p1
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
-    ; GFX9: GLOBAL_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec :: (store 8, addrspace 1)
+    ; GFX9: GLOBAL_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec :: (store 8, addrspace 1)
     ; GFX10-LABEL: name: store_global_p1
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
-    ; GFX10: GLOBAL_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec :: (store 8, addrspace 1)
+    ; GFX10: GLOBAL_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec :: (store 8, addrspace 1)
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(p1) = COPY $vgpr2_vgpr3
     G_STORE %1, %0 :: (store 8, align 8, addrspace 1)
@@ -683,27 +683,27 @@ body: |
     ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX7: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
+    ; GFX7: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
     ; GFX7-FLAT-LABEL: name: store_global_p3
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX7-FLAT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7-FLAT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX7-FLAT: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
+    ; GFX7-FLAT: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
     ; GFX8-LABEL: name: store_global_p3
     ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX8: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX8: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
+    ; GFX8: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
     ; GFX9-LABEL: name: store_global_p3
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX9: GLOBAL_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
+    ; GFX9: GLOBAL_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
     ; GFX10-LABEL: name: store_global_p3
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX10: GLOBAL_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
+    ; GFX10: GLOBAL_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(p3) = COPY $vgpr2
     G_STORE %1, %0 :: (store 4, align 4, addrspace 1)
@@ -776,27 +776,27 @@ body: |
     ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX7: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store monotonic 4, addrspace 1)
+    ; GFX7: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store monotonic 4, addrspace 1)
     ; GFX7-FLAT-LABEL: name: store_atomic_global_s32
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX7-FLAT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7-FLAT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX7-FLAT: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store monotonic 4, addrspace 1)
+    ; GFX7-FLAT: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store monotonic 4, addrspace 1)
     ; GFX8-LABEL: name: store_atomic_global_s32
     ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX8: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX8: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store monotonic 4, addrspace 1)
+    ; GFX8: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store monotonic 4, addrspace 1)
     ; GFX9-LABEL: name: store_atomic_global_s32
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX9: GLOBAL_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec :: (store monotonic 4, addrspace 1)
+    ; GFX9: GLOBAL_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec :: (store monotonic 4, addrspace 1)
     ; GFX10-LABEL: name: store_atomic_global_s32
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX10: GLOBAL_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec :: (store monotonic 4, addrspace 1)
+    ; GFX10: GLOBAL_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec :: (store monotonic 4, addrspace 1)
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s32) = COPY $vgpr2
     G_STORE %1, %0 :: (store monotonic 4, align 4, addrspace 1)
@@ -823,27 +823,27 @@ body: |
     ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
-    ; GFX7: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store monotonic 8, addrspace 1)
+    ; GFX7: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store monotonic 8, addrspace 1)
     ; GFX7-FLAT-LABEL: name: store_atomic_global_s64
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX7-FLAT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7-FLAT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
-    ; GFX7-FLAT: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store monotonic 8, addrspace 1)
+    ; GFX7-FLAT: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store monotonic 8, addrspace 1)
     ; GFX8-LABEL: name: store_atomic_global_s64
     ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX8: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
-    ; GFX8: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store monotonic 8, addrspace 1)
+    ; GFX8: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store monotonic 8, addrspace 1)
     ; GFX9-LABEL: name: store_atomic_global_s64
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
-    ; GFX9: GLOBAL_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec :: (store monotonic 8, addrspace 1)
+    ; GFX9: GLOBAL_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec :: (store monotonic 8, addrspace 1)
     ; GFX10-LABEL: name: store_atomic_global_s64
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
-    ; GFX10: GLOBAL_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec :: (store monotonic 8, addrspace 1)
+    ; GFX10: GLOBAL_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec :: (store monotonic 8, addrspace 1)
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s64) = COPY $vgpr2_vgpr3
     G_STORE %1, %0 :: (store monotonic 8, align 8, addrspace 1)
@@ -870,7 +870,7 @@ body: |
     ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
     ; GFX6: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
     ; GFX6: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
-    ; GFX6: BUFFER_STORE_DWORD_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 2047, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
+    ; GFX6: BUFFER_STORE_DWORD_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 2047, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
     ; GFX7-LABEL: name: store_global_s32_gep_2047
     ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
@@ -880,7 +880,7 @@ body: |
     ; GFX7: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
     ; GFX7: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
     ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
-    ; GFX7: BUFFER_STORE_DWORD_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 2047, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
+    ; GFX7: BUFFER_STORE_DWORD_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 2047, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
     ; GFX7-FLAT-LABEL: name: store_global_s32_gep_2047
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX7-FLAT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
@@ -895,7 +895,7 @@ body: |
     ; GFX7-FLAT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
     ; GFX7-FLAT: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX7-FLAT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1
-    ; GFX7-FLAT: FLAT_STORE_DWORD [[REG_SEQUENCE1]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
+    ; GFX7-FLAT: FLAT_STORE_DWORD [[REG_SEQUENCE1]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
     ; GFX8-LABEL: name: store_global_s32_gep_2047
     ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
@@ -910,17 +910,17 @@ body: |
     ; GFX8: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
     ; GFX8: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX8: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1
-    ; GFX8: FLAT_STORE_DWORD [[REG_SEQUENCE1]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
+    ; GFX8: FLAT_STORE_DWORD [[REG_SEQUENCE1]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
     ; GFX9-LABEL: name: store_global_s32_gep_2047
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX9: GLOBAL_STORE_DWORD [[COPY]], [[COPY1]], 2047, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
+    ; GFX9: GLOBAL_STORE_DWORD [[COPY]], [[COPY1]], 2047, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
     ; GFX10-LABEL: name: store_global_s32_gep_2047
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX10: GLOBAL_STORE_DWORD [[COPY]], [[COPY1]], 2047, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
+    ; GFX10: GLOBAL_STORE_DWORD [[COPY]], [[COPY1]], 2047, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s32) = COPY $vgpr2
     %2:vgpr(s64) = G_CONSTANT i64 2047

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-global.s96.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-global.s96.mir
index 3f00d0a92593..34038925fe4f 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-global.s96.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-global.s96.mir
@@ -26,27 +26,27 @@ body: |
     ; GFX7: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
     ; GFX7: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
     ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
-    ; GFX7: BUFFER_STORE_DWORDX3_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 12, align 16, addrspace 1)
+    ; GFX7: BUFFER_STORE_DWORDX3_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 12, align 16, addrspace 1)
     ; GFX7-FLAT-LABEL: name: store_global_v3s32
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4
     ; GFX7-FLAT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7-FLAT: [[COPY1:%[0-9]+]]:vreg_96 = COPY $vgpr2_vgpr3_vgpr4
-    ; GFX7-FLAT: FLAT_STORE_DWORDX3 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 12, align 16, addrspace 1)
+    ; GFX7-FLAT: FLAT_STORE_DWORDX3 [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 12, align 16, addrspace 1)
     ; GFX8-LABEL: name: store_global_v3s32
     ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4
     ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX8: [[COPY1:%[0-9]+]]:vreg_96 = COPY $vgpr2_vgpr3_vgpr4
-    ; GFX8: FLAT_STORE_DWORDX3 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 12, align 16, addrspace 1)
+    ; GFX8: FLAT_STORE_DWORDX3 [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 12, align 16, addrspace 1)
     ; GFX9-LABEL: name: store_global_v3s32
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4
     ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9: [[COPY1:%[0-9]+]]:vreg_96 = COPY $vgpr2_vgpr3_vgpr4
-    ; GFX9: GLOBAL_STORE_DWORDX3 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec :: (store 12, align 16, addrspace 1)
+    ; GFX9: GLOBAL_STORE_DWORDX3 [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec :: (store 12, align 16, addrspace 1)
     ; GFX10-LABEL: name: store_global_v3s32
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4
     ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10: [[COPY1:%[0-9]+]]:vreg_96 = COPY $vgpr2_vgpr3_vgpr4
-    ; GFX10: GLOBAL_STORE_DWORDX3 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec :: (store 12, align 16, addrspace 1)
+    ; GFX10: GLOBAL_STORE_DWORDX3 [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec :: (store 12, align 16, addrspace 1)
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(<3 x s32>) = COPY $vgpr2_vgpr3_vgpr4
     G_STORE %1, %0 :: (store 12, align 16, addrspace 1)

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-private.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-private.mir
index c5312f8ada51..992a76795bf9 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-private.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-private.mir
@@ -21,12 +21,12 @@ body: |
     ; GFX6: liveins: $vgpr0, $vgpr1
     ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX6: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5)
+    ; GFX6: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5)
     ; GFX9-LABEL: name: function_store_private_s32_to_4
     ; GFX9: liveins: $vgpr0, $vgpr1
     ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX9: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5)
+    ; GFX9: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5)
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(p5) = COPY $vgpr1
     G_STORE %0, %1 :: (store 4, align 4, addrspace 5)
@@ -52,12 +52,12 @@ body: |
     ; GFX6: liveins: $vgpr0, $vgpr1
     ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX6: BUFFER_STORE_SHORT_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 2, addrspace 5)
+    ; GFX6: BUFFER_STORE_SHORT_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 2, addrspace 5)
     ; GFX9-LABEL: name: function_store_private_s32_to_2
     ; GFX9: liveins: $vgpr0, $vgpr1
     ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX9: BUFFER_STORE_SHORT_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 2, addrspace 5)
+    ; GFX9: BUFFER_STORE_SHORT_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 2, addrspace 5)
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(p5) = COPY $vgpr1
     G_STORE %0, %1 :: (store 2, align 2, addrspace 5)
@@ -83,12 +83,12 @@ body: |
     ; GFX6: liveins: $vgpr0, $vgpr1
     ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX6: BUFFER_STORE_BYTE_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5)
+    ; GFX6: BUFFER_STORE_BYTE_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5)
     ; GFX9-LABEL: name: function_store_private_s32_to_1
     ; GFX9: liveins: $vgpr0, $vgpr1
     ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX9: BUFFER_STORE_BYTE_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5)
+    ; GFX9: BUFFER_STORE_BYTE_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5)
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(p5) = COPY $vgpr1
     G_STORE %0, %1 :: (store 1, align 1, addrspace 5)
@@ -114,12 +114,12 @@ body: |
     ; GFX6: liveins: $vgpr0, $vgpr1
     ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX6: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5)
+    ; GFX6: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5)
     ; GFX9-LABEL: name: function_store_private_v2s16
     ; GFX9: liveins: $vgpr0, $vgpr1
     ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX9: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5)
+    ; GFX9: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5)
     %0:vgpr(<2 x s16>) = COPY $vgpr0
     %1:vgpr(p5) = COPY $vgpr1
     G_STORE %0, %1 :: (store 4, align 4, addrspace 5)
@@ -145,12 +145,12 @@ body: |
     ; GFX6: liveins: $vgpr0, $vgpr1
     ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX6: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5)
+    ; GFX6: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5)
     ; GFX9-LABEL: name: function_store_private_p3
     ; GFX9: liveins: $vgpr0, $vgpr1
     ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX9: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5)
+    ; GFX9: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5)
     %0:vgpr(p3) = COPY $vgpr0
     %1:vgpr(p5) = COPY $vgpr1
     G_STORE %0, %1 :: (store 4, align 4, addrspace 5)
@@ -176,12 +176,12 @@ body: |
     ; GFX6: liveins: $vgpr0, $vgpr1
     ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX6: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5)
+    ; GFX6: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5)
     ; GFX9-LABEL: name: function_store_private_p5
     ; GFX9: liveins: $vgpr0, $vgpr1
     ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX9: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5)
+    ; GFX9: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5)
     %0:vgpr(p5) = COPY $vgpr0
     %1:vgpr(p5) = COPY $vgpr1
     G_STORE %0, %1 :: (store 4, align 4, addrspace 5)
@@ -206,10 +206,10 @@ body: |
 
     ; GFX6-LABEL: name: function_store_private_s32_to_1_fi_offset_4095
     ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GFX6: BUFFER_STORE_BYTE_OFFEN [[V_MOV_B32_e32_]], %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4095, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5)
+    ; GFX6: BUFFER_STORE_BYTE_OFFEN [[V_MOV_B32_e32_]], %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4095, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5)
     ; GFX9-LABEL: name: function_store_private_s32_to_1_fi_offset_4095
     ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GFX9: BUFFER_STORE_BYTE_OFFEN [[V_MOV_B32_e32_]], %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4095, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5)
+    ; GFX9: BUFFER_STORE_BYTE_OFFEN [[V_MOV_B32_e32_]], %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4095, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5)
     %0:vgpr(p5) = G_FRAME_INDEX %stack.0
     %1:vgpr(s32) = G_CONSTANT i32 4095
     %2:vgpr(p5) = G_PTR_ADD %0, %1
@@ -236,10 +236,10 @@ body: |
 
     ; GFX6-LABEL: name: function_store_private_s32_to_1_constant_4095
     ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GFX6: BUFFER_STORE_BYTE_OFFSET [[V_MOV_B32_e32_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4095, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5)
+    ; GFX6: BUFFER_STORE_BYTE_OFFSET [[V_MOV_B32_e32_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4095, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5)
     ; GFX9-LABEL: name: function_store_private_s32_to_1_constant_4095
     ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GFX9: BUFFER_STORE_BYTE_OFFSET [[V_MOV_B32_e32_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4095, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5)
+    ; GFX9: BUFFER_STORE_BYTE_OFFSET [[V_MOV_B32_e32_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4095, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5)
     %0:vgpr(p5) = G_CONSTANT i32 4095
     %1:vgpr(s32) = G_CONSTANT i32 0
     G_STORE %1, %0 :: (store 1, align 1, addrspace 5)
@@ -265,11 +265,11 @@ body: |
     ; GFX6-LABEL: name: function_store_private_s32_to_1_constant_4096
     ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
     ; GFX6: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec
-    ; GFX6: BUFFER_STORE_BYTE_OFFEN [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5)
+    ; GFX6: BUFFER_STORE_BYTE_OFFEN [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5)
     ; GFX9-LABEL: name: function_store_private_s32_to_1_constant_4096
     ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
     ; GFX9: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec
-    ; GFX9: BUFFER_STORE_BYTE_OFFEN [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5)
+    ; GFX9: BUFFER_STORE_BYTE_OFFEN [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5)
     %0:vgpr(p5) = G_CONSTANT i32 4096
     %1:vgpr(s32) = G_CONSTANT i32 0
     G_STORE %1, %0 :: (store 1, align 1, addrspace 5)
@@ -294,12 +294,12 @@ body: |
     ; GFX6: liveins: $vgpr0, $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3
     ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX6: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5)
+    ; GFX6: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5)
     ; GFX9-LABEL: name: kernel_store_private_s32_to_4
     ; GFX9: liveins: $vgpr0, $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3
     ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX9: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5)
+    ; GFX9: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5)
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(p5) = COPY $vgpr1
     G_STORE %0, %1 :: (store 4, align 4, addrspace 5)
@@ -324,12 +324,12 @@ body: |
     ; GFX6: liveins: $vgpr0, $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3
     ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX6: BUFFER_STORE_SHORT_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 2, addrspace 5)
+    ; GFX6: BUFFER_STORE_SHORT_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 2, addrspace 5)
     ; GFX9-LABEL: name: kernel_store_private_s32_to_2
     ; GFX9: liveins: $vgpr0, $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3
     ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX9: BUFFER_STORE_SHORT_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 2, addrspace 5)
+    ; GFX9: BUFFER_STORE_SHORT_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 2, addrspace 5)
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(p5) = COPY $vgpr1
     G_STORE %0, %1 :: (store 2, align 2, addrspace 5)
@@ -354,12 +354,12 @@ body: |
     ; GFX6: liveins: $vgpr0, $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3
     ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX6: BUFFER_STORE_BYTE_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5)
+    ; GFX6: BUFFER_STORE_BYTE_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5)
     ; GFX9-LABEL: name: kernel_store_private_s32_to_1
     ; GFX9: liveins: $vgpr0, $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3
     ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX9: BUFFER_STORE_BYTE_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5)
+    ; GFX9: BUFFER_STORE_BYTE_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5)
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(p5) = COPY $vgpr1
     G_STORE %0, %1 :: (store 1, align 1, addrspace 5)
@@ -384,12 +384,12 @@ body: |
     ; GFX6: liveins: $vgpr0, $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3
     ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX6: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5)
+    ; GFX6: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5)
     ; GFX9-LABEL: name: kernel_store_private_v2s16
     ; GFX9: liveins: $vgpr0, $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3
     ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX9: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5)
+    ; GFX9: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5)
     %0:vgpr(<2 x s16>) = COPY $vgpr0
     %1:vgpr(p5) = COPY $vgpr1
     G_STORE %0, %1 :: (store 4, align 4, addrspace 5)
@@ -414,12 +414,12 @@ body: |
     ; GFX6: liveins: $vgpr0, $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3
     ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX6: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5)
+    ; GFX6: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5)
     ; GFX9-LABEL: name: kernel_store_private_p3
     ; GFX9: liveins: $vgpr0, $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3
     ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX9: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5)
+    ; GFX9: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5)
     %0:vgpr(p3) = COPY $vgpr0
     %1:vgpr(p5) = COPY $vgpr1
     G_STORE %0, %1 :: (store 4, align 4, addrspace 5)
@@ -444,12 +444,12 @@ body: |
     ; GFX6: liveins: $vgpr0, $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3
     ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX6: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5)
+    ; GFX6: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5)
     ; GFX9-LABEL: name: kernel_store_private_p5
     ; GFX9: liveins: $vgpr0, $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3
     ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX9: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5)
+    ; GFX9: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5)
     %0:vgpr(p5) = COPY $vgpr0
     %1:vgpr(p5) = COPY $vgpr1
     G_STORE %0, %1 :: (store 4, align 4, addrspace 5)
@@ -475,11 +475,11 @@ body: |
     ; GFX6-LABEL: name: kernel_store_private_s32_to_1_fi_offset_4095
     ; GFX6: liveins: $sgpr0_sgpr1_sgpr2_sgpr3
     ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GFX6: BUFFER_STORE_BYTE_OFFEN [[V_MOV_B32_e32_]], %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4095, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5)
+    ; GFX6: BUFFER_STORE_BYTE_OFFEN [[V_MOV_B32_e32_]], %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4095, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5)
     ; GFX9-LABEL: name: kernel_store_private_s32_to_1_fi_offset_4095
     ; GFX9: liveins: $sgpr0_sgpr1_sgpr2_sgpr3
     ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GFX9: BUFFER_STORE_BYTE_OFFEN [[V_MOV_B32_e32_]], %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4095, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5)
+    ; GFX9: BUFFER_STORE_BYTE_OFFEN [[V_MOV_B32_e32_]], %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4095, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5)
     %0:vgpr(p5) = G_FRAME_INDEX %stack.0
     %1:vgpr(s32) = G_CONSTANT i32 4095
     %2:vgpr(p5) = G_PTR_ADD %0, %1
@@ -507,11 +507,11 @@ body: |
     ; GFX6-LABEL: name: kernel_store_private_s32_to_1_constant_4095
     ; GFX6: liveins: $sgpr0_sgpr1_sgpr2_sgpr3
     ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GFX6: BUFFER_STORE_BYTE_OFFSET [[V_MOV_B32_e32_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4095, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5)
+    ; GFX6: BUFFER_STORE_BYTE_OFFSET [[V_MOV_B32_e32_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4095, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5)
     ; GFX9-LABEL: name: kernel_store_private_s32_to_1_constant_4095
     ; GFX9: liveins: $sgpr0_sgpr1_sgpr2_sgpr3
     ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GFX9: BUFFER_STORE_BYTE_OFFSET [[V_MOV_B32_e32_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4095, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5)
+    ; GFX9: BUFFER_STORE_BYTE_OFFSET [[V_MOV_B32_e32_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4095, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5)
     %0:vgpr(p5) = G_CONSTANT i32 4095
     %1:vgpr(s32) = G_CONSTANT i32 0
     G_STORE %1, %0 :: (store 1, align 1, addrspace 5)
@@ -538,12 +538,12 @@ body: |
     ; GFX6: liveins: $sgpr0_sgpr1_sgpr2_sgpr3
     ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
     ; GFX6: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec
-    ; GFX6: BUFFER_STORE_BYTE_OFFEN [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5)
+    ; GFX6: BUFFER_STORE_BYTE_OFFEN [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5)
     ; GFX9-LABEL: name: kernel_store_private_s32_to_1_constant_4096
     ; GFX9: liveins: $sgpr0_sgpr1_sgpr2_sgpr3
     ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
     ; GFX9: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec
-    ; GFX9: BUFFER_STORE_BYTE_OFFEN [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5)
+    ; GFX9: BUFFER_STORE_BYTE_OFFEN [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5)
     %0:vgpr(p5) = G_CONSTANT i32 4096
     %1:vgpr(s32) = G_CONSTANT i32 0
     G_STORE %1, %0 :: (store 1, align 1, addrspace 5)

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.fadd-with-ret.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.fadd-with-ret.ll
index 22e944fc3a11..b4bca0349fdc 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.fadd-with-ret.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.fadd-with-ret.ll
@@ -1,10 +1,21 @@
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX90A %s
 ; RUN: not --crash llc -global-isel < %s -march=amdgcn -mcpu=gfx908 -verify-machineinstrs 2>&1 | FileCheck %s -check-prefix=GFX908
 
-declare float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* nocapture, float) #0
+declare float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* nocapture, float)
+declare <2 x half> @llvm.amdgcn.global.atomic.fadd.f32.p1v2f16.v2f16(<2 x half> addrspace(1)* nocapture, <2 x half>)
 
 ; GFX908: error: {{.*}} return versions of fp atomics not supported
 
+; GFX90A-LABEL: {{^}}global_atomic_fadd_f32_rtn:
+; GFX90A: global_atomic_add_f32 v0, v[0:1], v2, off glc
 define float @global_atomic_fadd_f32_rtn(float addrspace(1)* %ptr, float %data) {
   %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* %ptr, float %data)
   ret float %ret
 }
+
+; GFX90A-LABEL: {{^}}global_atomic_fadd_v2f16_rtn:
+; GFX90A: global_atomic_pk_add_f16 v0, v[0:1], v2, off glc
+define <2 x half> @global_atomic_fadd_v2f16_rtn(<2 x half> addrspace(1)* %ptr, <2 x half> %data) {
+  %ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.f32.p1v2f16.v2f16(<2 x half> addrspace(1)* %ptr, <2 x half> %data)
+  ret <2 x half> %ret
+}

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.fadd.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.fadd.ll
index 4f0c1586cad1..f129da7b346c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.fadd.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX908 %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX90A %s
 
 define void @global_atomic_fadd_f32(float addrspace(1)* %ptr, float %data) {
 ; GFX908-LABEL: global_atomic_fadd_f32:
@@ -8,6 +9,13 @@ define void @global_atomic_fadd_f32(float addrspace(1)* %ptr, float %data) {
 ; GFX908-NEXT:    global_atomic_add_f32 v[0:1], v2, off
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: global_atomic_fadd_f32:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    global_atomic_add_f32 v0, v[0:1], v2, off glc
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
   %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* %ptr, float %data)
   ret void
 }
@@ -19,6 +27,13 @@ define void @global_atomic_fadd_f32_off_2048(float addrspace(1)* %ptr, float %da
 ; GFX908-NEXT:    global_atomic_add_f32 v[0:1], v2, off offset:2048
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: global_atomic_fadd_f32_off_2048:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    global_atomic_add_f32 v0, v[0:1], v2, off offset:2048 glc
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr float, float addrspace(1)* %ptr, i64 512
   %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* %gep, float %data)
   ret void
@@ -31,6 +46,13 @@ define void @global_atomic_fadd_f32_off_neg2047(float addrspace(1)* %ptr, float
 ; GFX908-NEXT:    global_atomic_add_f32 v[0:1], v2, off offset:-2044
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: global_atomic_fadd_f32_off_neg2047:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    global_atomic_add_f32 v0, v[0:1], v2, off offset:-2044 glc
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr float, float addrspace(1)* %ptr, i64 -511
   %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* %gep, float %data)
   ret void
@@ -47,6 +69,16 @@ define amdgpu_kernel void @global_atomic_fadd_f32_off_ss(float addrspace(1)* %pt
 ; GFX908-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX908-NEXT:    global_atomic_add_f32 v[0:1], v2, off offset:2048
 ; GFX908-NEXT:    s_endpgm
+;
+; GFX90A-LABEL: global_atomic_fadd_f32_off_ss:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NEXT:    global_atomic_add_f32 v0, v[0:1], v2, off offset:2048 glc
+; GFX90A-NEXT:    s_endpgm
   %gep = getelementptr float, float addrspace(1)* %ptr, i64 512
   %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* %gep, float %data)
   ret void
@@ -59,6 +91,13 @@ define void @global_atomic_fadd_v2f16(<2 x half> addrspace(1)* %ptr, <2 x half>
 ; GFX908-NEXT:    global_atomic_pk_add_f16 v[0:1], v2, off
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: global_atomic_fadd_v2f16:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    global_atomic_pk_add_f16 v0, v[0:1], v2, off glc
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
   %ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1v2f16.v2f16(<2 x half> addrspace(1)* %ptr, <2 x half> %data)
   ret void
 }
@@ -70,6 +109,13 @@ define void @global_atomic_fadd_v2f16_off_neg2047(<2 x half> addrspace(1)* %ptr,
 ; GFX908-NEXT:    global_atomic_pk_add_f16 v[0:1], v2, off offset:-2044
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: global_atomic_fadd_v2f16_off_neg2047:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:-2044 glc
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %ptr, i64 -511
   %ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1v2f16.v2f16(<2 x half> addrspace(1)* %gep, <2 x half> %data)
   ret void

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.fadd-with-ret.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.fadd-with-ret.ll
new file mode 100644
index 000000000000..257785fc6bff
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.fadd-with-ret.ll
@@ -0,0 +1,25 @@
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX90A %s
+; RUN: not --crash llc -global-isel < %s -march=amdgcn -mcpu=gfx908 -verify-machineinstrs 2>&1 | FileCheck %s -check-prefix=GFX908
+
+declare float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i32 immarg)
+declare <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half>, <4 x i32>, i32, i32, i32 immarg)
+
+; GFX908: error: {{.*}} return versions of fp atomics not supported
+
+; GFX90A-LABEL: {{^}}buffer_atomic_add_f32_rtn:
+; GFX90A: buffer_atomic_add_f32 v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9:]+}}], s{{[0-9]+}} offen glc
+define amdgpu_kernel void @buffer_atomic_add_f32_rtn(float %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 %soffset) {
+main_body:
+  %ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
+  store float %ret, float* undef
+  ret void
+}
+
+; GFX90A-LABEL: {{^}}buffer_atomic_add_v2f16_rtn:
+; GFX90A: buffer_atomic_pk_add_f16 v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9:]+}}], s{{[0-9]+}} offen glc
+define amdgpu_kernel void @buffer_atomic_add_v2f16_rtn(<2 x half> %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
+main_body:
+  %ret = call <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
+  store <2 x half> %ret, <2 x half>* undef
+  ret void
+}

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.fadd.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.fadd.ll
index d7e87a7b2863..d3d4569243df 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.fadd.ll
@@ -1,185 +1,329 @@
 ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx908 -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx908 -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck %s -check-prefix=GFX908
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx90a -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck %s -check-prefix=GFX90A
 
 ; Natural mapping
 define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset(float %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
-  ; CHECK-LABEL: name: raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset
-  ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK:   liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
-  ; CHECK:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-  ; CHECK:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2
-  ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
-  ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
-  ; CHECK:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
-  ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-  ; CHECK:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
-  ; CHECK:   BUFFER_ATOMIC_ADD_F32_OFFEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4)
-  ; CHECK:   S_ENDPGM 0
+  ; GFX908-LABEL: name: raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset
+  ; GFX908: bb.1 (%ir-block.0):
+  ; GFX908:   liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
+  ; GFX908:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX908:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2
+  ; GFX908:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
+  ; GFX908:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
+  ; GFX908:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; GFX908:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX908:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
+  ; GFX908:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
+  ; GFX908:   BUFFER_ATOMIC_ADD_F32_OFFEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4)
+  ; GFX908:   S_ENDPGM 0
+  ; GFX90A-LABEL: name: raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset
+  ; GFX90A: bb.1 (%ir-block.0):
+  ; GFX90A:   liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
+  ; GFX90A:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX90A:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2
+  ; GFX90A:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
+  ; GFX90A:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
+  ; GFX90A:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; GFX90A:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX90A:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
+  ; GFX90A:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
+  ; GFX90A:   [[BUFFER_ATOMIC_ADD_F32_OFFEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_OFFEN_RTN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4)
+  ; GFX90A:   S_ENDPGM 0
   %ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
   ret void
 }
 
 define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_plus4095__sgpr_soffset(float %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
-  ; CHECK-LABEL: name: raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_plus4095__sgpr_soffset
-  ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK:   liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
-  ; CHECK:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-  ; CHECK:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2
-  ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
-  ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
-  ; CHECK:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
-  ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-  ; CHECK:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
-  ; CHECK:   BUFFER_ATOMIC_ADD_F32_OFFEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource" + 4095, align 1, addrspace 4)
-  ; CHECK:   S_ENDPGM 0
+  ; GFX908-LABEL: name: raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_plus4095__sgpr_soffset
+  ; GFX908: bb.1 (%ir-block.0):
+  ; GFX908:   liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
+  ; GFX908:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX908:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2
+  ; GFX908:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
+  ; GFX908:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
+  ; GFX908:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; GFX908:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX908:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
+  ; GFX908:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
+  ; GFX908:   BUFFER_ATOMIC_ADD_F32_OFFEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource" + 4095, align 1, addrspace 4)
+  ; GFX908:   S_ENDPGM 0
+  ; GFX90A-LABEL: name: raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_plus4095__sgpr_soffset
+  ; GFX90A: bb.1 (%ir-block.0):
+  ; GFX90A:   liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
+  ; GFX90A:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX90A:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2
+  ; GFX90A:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
+  ; GFX90A:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
+  ; GFX90A:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; GFX90A:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX90A:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
+  ; GFX90A:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
+  ; GFX90A:   [[BUFFER_ATOMIC_ADD_F32_OFFEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_OFFEN_RTN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 1, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource" + 4095, align 1, addrspace 4)
+  ; GFX90A:   S_ENDPGM 0
   %voffset.add = add i32 %voffset, 4095
   %ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0)
   ret void
 }
 
 define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_4095__sgpr_soffset(float %val, <4 x i32> inreg %rsrc, i32 inreg %soffset) {
-  ; CHECK-LABEL: name: raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_4095__sgpr_soffset
-  ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK:   liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0
-  ; CHECK:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-  ; CHECK:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2
-  ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
-  ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
-  ; CHECK:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
-  ; CHECK:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
-  ; CHECK:   BUFFER_ATOMIC_ADD_F32_OFFSET [[COPY]], [[REG_SEQUENCE]], [[COPY5]], 4095, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource" + 4095, align 1, addrspace 4)
-  ; CHECK:   S_ENDPGM 0
+  ; GFX908-LABEL: name: raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_4095__sgpr_soffset
+  ; GFX908: bb.1 (%ir-block.0):
+  ; GFX908:   liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0
+  ; GFX908:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX908:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2
+  ; GFX908:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
+  ; GFX908:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
+  ; GFX908:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; GFX908:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
+  ; GFX908:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
+  ; GFX908:   BUFFER_ATOMIC_ADD_F32_OFFSET [[COPY]], [[REG_SEQUENCE]], [[COPY5]], 4095, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource" + 4095, align 1, addrspace 4)
+  ; GFX908:   S_ENDPGM 0
+  ; GFX90A-LABEL: name: raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_4095__sgpr_soffset
+  ; GFX90A: bb.1 (%ir-block.0):
+  ; GFX90A:   liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0
+  ; GFX90A:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX90A:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2
+  ; GFX90A:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
+  ; GFX90A:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
+  ; GFX90A:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; GFX90A:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
+  ; GFX90A:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
+  ; GFX90A:   [[BUFFER_ATOMIC_ADD_F32_OFFSET_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_OFFSET_RTN [[COPY]], [[REG_SEQUENCE]], [[COPY5]], 4095, 1, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource" + 4095, align 1, addrspace 4)
+  ; GFX90A:   S_ENDPGM 0
   %ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 4095, i32 %soffset, i32 0)
   ret void
 }
 
 ; Natural mapping, no voffset
 define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset(float %val, <4 x i32> inreg %rsrc, i32 inreg %soffset) {
-  ; CHECK-LABEL: name: raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset
-  ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK:   liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0
-  ; CHECK:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-  ; CHECK:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2
-  ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
-  ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
-  ; CHECK:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
-  ; CHECK:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
-  ; CHECK:   BUFFER_ATOMIC_ADD_F32_OFFSET [[COPY]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4)
-  ; CHECK:   S_ENDPGM 0
+  ; GFX908-LABEL: name: raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset
+  ; GFX908: bb.1 (%ir-block.0):
+  ; GFX908:   liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0
+  ; GFX908:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX908:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2
+  ; GFX908:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
+  ; GFX908:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
+  ; GFX908:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; GFX908:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
+  ; GFX908:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
+  ; GFX908:   BUFFER_ATOMIC_ADD_F32_OFFSET [[COPY]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4)
+  ; GFX908:   S_ENDPGM 0
+  ; GFX90A-LABEL: name: raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset
+  ; GFX90A: bb.1 (%ir-block.0):
+  ; GFX90A:   liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0
+  ; GFX90A:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX90A:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2
+  ; GFX90A:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
+  ; GFX90A:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
+  ; GFX90A:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; GFX90A:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
+  ; GFX90A:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
+  ; GFX90A:   [[BUFFER_ATOMIC_ADD_F32_OFFSET_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_OFFSET_RTN [[COPY]], [[REG_SEQUENCE]], [[COPY5]], 0, 1, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4)
+  ; GFX90A:   S_ENDPGM 0
   %ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 0, i32 %soffset, i32 0)
   ret void
 }
 
 ; All operands need regbank legalization
 define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__sgpr_voffset__vgpr_soffset(float inreg %val, <4 x i32> %rsrc, i32 inreg %voffset, i32 %soffset) {
-  ; CHECK-LABEL: name: raw_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__sgpr_voffset__vgpr_soffset
-  ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK:   successors: %bb.2(0x80000000)
-  ; CHECK:   liveins: $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
-  ; CHECK:   [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
-  ; CHECK:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-  ; CHECK:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-  ; CHECK:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-  ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr3
-  ; CHECK:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3
-  ; CHECK:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr4
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
-  ; CHECK:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
-  ; CHECK:   [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
-  ; CHECK:   [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
-  ; CHECK:   [[COPY10:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
-  ; CHECK:   [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec
-  ; CHECK: bb.2:
-  ; CHECK:   successors: %bb.3(0x40000000), %bb.2(0x40000000)
-  ; CHECK:   [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]].sub0, implicit $exec
-  ; CHECK:   [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]].sub1, implicit $exec
-  ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1
-  ; CHECK:   [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[COPY9]], implicit $exec
-  ; CHECK:   [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY10]].sub0, implicit $exec
-  ; CHECK:   [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY10]].sub1, implicit $exec
-  ; CHECK:   [[REG_SEQUENCE2:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1
-  ; CHECK:   [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY10]], implicit $exec
-  ; CHECK:   [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc
-  ; CHECK:   [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
-  ; CHECK:   [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec
-  ; CHECK:   [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec
-  ; CHECK:   [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc
-  ; CHECK:   BUFFER_ATOMIC_ADD_F32_OFFEN [[COPY7]], [[COPY8]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4)
-  ; CHECK:   [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec
-  ; CHECK:   $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
-  ; CHECK:   S_CBRANCH_EXECNZ %bb.2, implicit $exec
-  ; CHECK: bb.3:
-  ; CHECK:   successors: %bb.4(0x80000000)
-  ; CHECK:   $exec = S_MOV_B64_term [[S_MOV_B64_term]]
-  ; CHECK: bb.4:
-  ; CHECK:   S_ENDPGM 0
+  ; GFX908-LABEL: name: raw_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__sgpr_voffset__vgpr_soffset
+  ; GFX908: bb.1 (%ir-block.0):
+  ; GFX908:   successors: %bb.2(0x80000000)
+  ; GFX908:   liveins: $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+  ; GFX908:   [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+  ; GFX908:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX908:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX908:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+  ; GFX908:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+  ; GFX908:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3
+  ; GFX908:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+  ; GFX908:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
+  ; GFX908:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
+  ; GFX908:   [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
+  ; GFX908:   [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
+  ; GFX908:   [[COPY10:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
+  ; GFX908:   [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec
+  ; GFX908: bb.2:
+  ; GFX908:   successors: %bb.3(0x40000000), %bb.2(0x40000000)
+  ; GFX908:   [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]].sub0, implicit $exec
+  ; GFX908:   [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]].sub1, implicit $exec
+  ; GFX908:   [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1
+  ; GFX908:   [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[COPY9]], implicit $exec
+  ; GFX908:   [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY10]].sub0, implicit $exec
+  ; GFX908:   [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY10]].sub1, implicit $exec
+  ; GFX908:   [[REG_SEQUENCE2:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1
+  ; GFX908:   [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY10]], implicit $exec
+  ; GFX908:   [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc
+  ; GFX908:   [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
+  ; GFX908:   [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec
+  ; GFX908:   [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec
+  ; GFX908:   [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc
+  ; GFX908:   BUFFER_ATOMIC_ADD_F32_OFFEN [[COPY7]], [[COPY8]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4)
+  ; GFX908:   [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX908:   $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
+  ; GFX908:   S_CBRANCH_EXECNZ %bb.2, implicit $exec
+  ; GFX908: bb.3:
+  ; GFX908:   successors: %bb.4(0x80000000)
+  ; GFX908:   $exec = S_MOV_B64_term [[S_MOV_B64_term]]
+  ; GFX908: bb.4:
+  ; GFX908:   S_ENDPGM 0
+  ; GFX90A-LABEL: name: raw_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__sgpr_voffset__vgpr_soffset
+  ; GFX90A: bb.1 (%ir-block.0):
+  ; GFX90A:   successors: %bb.2(0x80000000)
+  ; GFX90A:   liveins: $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+  ; GFX90A:   [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+  ; GFX90A:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX90A:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX90A:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+  ; GFX90A:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+  ; GFX90A:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3
+  ; GFX90A:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+  ; GFX90A:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
+  ; GFX90A:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
+  ; GFX90A:   [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
+  ; GFX90A:   [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
+  ; GFX90A:   [[COPY10:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
+  ; GFX90A:   [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec
+  ; GFX90A: bb.2:
+  ; GFX90A:   successors: %bb.3(0x40000000), %bb.2(0x40000000)
+  ; GFX90A:   [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]].sub0, implicit $exec
+  ; GFX90A:   [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]].sub1, implicit $exec
+  ; GFX90A:   [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1
+  ; GFX90A:   [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[COPY9]], implicit $exec
+  ; GFX90A:   [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY10]].sub0, implicit $exec
+  ; GFX90A:   [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY10]].sub1, implicit $exec
+  ; GFX90A:   [[REG_SEQUENCE2:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1
+  ; GFX90A:   [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY10]], implicit $exec
+  ; GFX90A:   [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc
+  ; GFX90A:   [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
+  ; GFX90A:   [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec
+  ; GFX90A:   [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec
+  ; GFX90A:   [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc
+  ; GFX90A:   [[BUFFER_ATOMIC_ADD_F32_OFFEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_OFFEN_RTN [[COPY7]], [[COPY8]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 1, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4)
+  ; GFX90A:   [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX90A:   $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
+  ; GFX90A:   S_CBRANCH_EXECNZ %bb.2, implicit $exec
+  ; GFX90A: bb.3:
+  ; GFX90A:   successors: %bb.4(0x80000000)
+  ; GFX90A:   $exec = S_MOV_B64_term [[S_MOV_B64_term]]
+  ; GFX90A: bb.4:
+  ; GFX90A:   S_ENDPGM 0
   %ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
   ret void
 }
 
 ; All operands need regbank legalization, no voffset
 define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__0_voffset__vgpr_soffset(float inreg %val, <4 x i32> %rsrc, i32 %soffset) {
-  ; CHECK-LABEL: name: raw_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__0_voffset__vgpr_soffset
-  ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK:   successors: %bb.2(0x80000000)
-  ; CHECK:   liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
-  ; CHECK:   [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
-  ; CHECK:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-  ; CHECK:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-  ; CHECK:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-  ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr3
-  ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr4
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
-  ; CHECK:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
-  ; CHECK:   [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
-  ; CHECK:   [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
-  ; CHECK:   [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec
-  ; CHECK: bb.2:
-  ; CHECK:   successors: %bb.3(0x40000000), %bb.2(0x40000000)
-  ; CHECK:   [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]].sub0, implicit $exec
-  ; CHECK:   [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]].sub1, implicit $exec
-  ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1
-  ; CHECK:   [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[COPY7]], implicit $exec
-  ; CHECK:   [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]].sub0, implicit $exec
-  ; CHECK:   [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]].sub1, implicit $exec
-  ; CHECK:   [[REG_SEQUENCE2:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1
-  ; CHECK:   [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY8]], implicit $exec
-  ; CHECK:   [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc
-  ; CHECK:   [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
-  ; CHECK:   [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
-  ; CHECK:   [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY5]], implicit $exec
-  ; CHECK:   [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc
-  ; CHECK:   BUFFER_ATOMIC_ADD_F32_OFFSET [[COPY6]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4)
-  ; CHECK:   [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec
-  ; CHECK:   $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
-  ; CHECK:   S_CBRANCH_EXECNZ %bb.2, implicit $exec
-  ; CHECK: bb.3:
-  ; CHECK:   successors: %bb.4(0x80000000)
-  ; CHECK:   $exec = S_MOV_B64_term [[S_MOV_B64_term]]
-  ; CHECK: bb.4:
-  ; CHECK:   S_ENDPGM 0
+  ; GFX908-LABEL: name: raw_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__0_voffset__vgpr_soffset
+  ; GFX908: bb.1 (%ir-block.0):
+  ; GFX908:   successors: %bb.2(0x80000000)
+  ; GFX908:   liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+  ; GFX908:   [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+  ; GFX908:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX908:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX908:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+  ; GFX908:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+  ; GFX908:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+  ; GFX908:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
+  ; GFX908:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
+  ; GFX908:   [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
+  ; GFX908:   [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
+  ; GFX908:   [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec
+  ; GFX908: bb.2:
+  ; GFX908:   successors: %bb.3(0x40000000), %bb.2(0x40000000)
+  ; GFX908:   [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]].sub0, implicit $exec
+  ; GFX908:   [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]].sub1, implicit $exec
+  ; GFX908:   [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1
+  ; GFX908:   [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[COPY7]], implicit $exec
+  ; GFX908:   [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]].sub0, implicit $exec
+  ; GFX908:   [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]].sub1, implicit $exec
+  ; GFX908:   [[REG_SEQUENCE2:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1
+  ; GFX908:   [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY8]], implicit $exec
+  ; GFX908:   [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc
+  ; GFX908:   [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
+  ; GFX908:   [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
+  ; GFX908:   [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY5]], implicit $exec
+  ; GFX908:   [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc
+  ; GFX908:   BUFFER_ATOMIC_ADD_F32_OFFSET [[COPY6]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4)
+  ; GFX908:   [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX908:   $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
+  ; GFX908:   S_CBRANCH_EXECNZ %bb.2, implicit $exec
+  ; GFX908: bb.3:
+  ; GFX908:   successors: %bb.4(0x80000000)
+  ; GFX908:   $exec = S_MOV_B64_term [[S_MOV_B64_term]]
+  ; GFX908: bb.4:
+  ; GFX908:   S_ENDPGM 0
+  ; GFX90A-LABEL: name: raw_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__0_voffset__vgpr_soffset
+  ; GFX90A: bb.1 (%ir-block.0):
+  ; GFX90A:   successors: %bb.2(0x80000000)
+  ; GFX90A:   liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+  ; GFX90A:   [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+  ; GFX90A:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX90A:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX90A:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+  ; GFX90A:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+  ; GFX90A:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+  ; GFX90A:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
+  ; GFX90A:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
+  ; GFX90A:   [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
+  ; GFX90A:   [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
+  ; GFX90A:   [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec
+  ; GFX90A: bb.2:
+  ; GFX90A:   successors: %bb.3(0x40000000), %bb.2(0x40000000)
+  ; GFX90A:   [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]].sub0, implicit $exec
+  ; GFX90A:   [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]].sub1, implicit $exec
+  ; GFX90A:   [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1
+  ; GFX90A:   [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[COPY7]], implicit $exec
+  ; GFX90A:   [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]].sub0, implicit $exec
+  ; GFX90A:   [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]].sub1, implicit $exec
+  ; GFX90A:   [[REG_SEQUENCE2:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1
+  ; GFX90A:   [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY8]], implicit $exec
+  ; GFX90A:   [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc
+  ; GFX90A:   [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
+  ; GFX90A:   [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
+  ; GFX90A:   [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY5]], implicit $exec
+  ; GFX90A:   [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc
+  ; GFX90A:   [[BUFFER_ATOMIC_ADD_F32_OFFSET_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_OFFSET_RTN [[COPY6]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 1, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4)
+  ; GFX90A:   [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX90A:   $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
+  ; GFX90A:   S_CBRANCH_EXECNZ %bb.2, implicit $exec
+  ; GFX90A: bb.3:
+  ; GFX90A:   successors: %bb.4(0x80000000)
+  ; GFX90A:   $exec = S_MOV_B64_term [[S_MOV_B64_term]]
+  ; GFX90A: bb.4:
+  ; GFX90A:   S_ENDPGM 0
   %ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 0, i32 %soffset, i32 0)
   ret void
 }
 
 define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_voffset_add4095(float %val, <4 x i32> inreg %rsrc, i32 %voffset.base, i32 inreg %soffset) {
-  ; CHECK-LABEL: name: raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_voffset_add4095
-  ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK:   liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
-  ; CHECK:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-  ; CHECK:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2
-  ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
-  ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
-  ; CHECK:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
-  ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-  ; CHECK:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
-  ; CHECK:   BUFFER_ATOMIC_ADD_F32_OFFEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource" + 4095, align 1, addrspace 4)
-  ; CHECK:   S_ENDPGM 0
+  ; GFX908-LABEL: name: raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_voffset_add4095
+  ; GFX908: bb.1 (%ir-block.0):
+  ; GFX908:   liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
+  ; GFX908:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX908:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2
+  ; GFX908:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
+  ; GFX908:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
+  ; GFX908:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; GFX908:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX908:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
+  ; GFX908:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
+  ; GFX908:   BUFFER_ATOMIC_ADD_F32_OFFEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource" + 4095, align 1, addrspace 4)
+  ; GFX908:   S_ENDPGM 0
+  ; GFX90A-LABEL: name: raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_voffset_add4095
+  ; GFX90A: bb.1 (%ir-block.0):
+  ; GFX90A:   liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
+  ; GFX90A:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX90A:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2
+  ; GFX90A:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
+  ; GFX90A:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
+  ; GFX90A:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; GFX90A:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX90A:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
+  ; GFX90A:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
+  ; GFX90A:   [[BUFFER_ATOMIC_ADD_F32_OFFEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_OFFEN_RTN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 1, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource" + 4095, align 1, addrspace 4)
+  ; GFX90A:   S_ENDPGM 0
   %voffset = add i32 %voffset.base, 4095
   %ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
   ret void
@@ -187,54 +331,92 @@ define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgp
 
 ; Natural mapping + slc
 define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc(float %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
-  ; CHECK-LABEL: name: raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc
-  ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK:   liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
-  ; CHECK:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-  ; CHECK:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2
-  ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
-  ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
-  ; CHECK:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
-  ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-  ; CHECK:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
-  ; CHECK:   BUFFER_ATOMIC_ADD_F32_OFFEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4)
-  ; CHECK:   S_ENDPGM 0
+  ; GFX908-LABEL: name: raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc
+  ; GFX908: bb.1 (%ir-block.0):
+  ; GFX908:   liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
+  ; GFX908:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX908:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2
+  ; GFX908:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
+  ; GFX908:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
+  ; GFX908:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; GFX908:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX908:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
+  ; GFX908:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
+  ; GFX908:   BUFFER_ATOMIC_ADD_F32_OFFEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4)
+  ; GFX908:   S_ENDPGM 0
+  ; GFX90A-LABEL: name: raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc
+  ; GFX90A: bb.1 (%ir-block.0):
+  ; GFX90A:   liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
+  ; GFX90A:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX90A:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2
+  ; GFX90A:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
+  ; GFX90A:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
+  ; GFX90A:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; GFX90A:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX90A:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
+  ; GFX90A:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
+  ; GFX90A:   [[BUFFER_ATOMIC_ADD_F32_OFFEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_OFFEN_RTN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, 1, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4)
+  ; GFX90A:   S_ENDPGM 0
   %ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 2)
   ret void
 }
 
 define amdgpu_ps void @raw_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<2 x half> %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
-  ; CHECK-LABEL: name: raw_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset
-  ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK:   liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
-  ; CHECK:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-  ; CHECK:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2
-  ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
-  ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
-  ; CHECK:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
-  ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-  ; CHECK:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
-  ; CHECK:   BUFFER_ATOMIC_PK_ADD_F16_OFFEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4)
-  ; CHECK:   S_ENDPGM 0
+  ; GFX908-LABEL: name: raw_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset
+  ; GFX908: bb.1 (%ir-block.0):
+  ; GFX908:   liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
+  ; GFX908:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX908:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2
+  ; GFX908:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
+  ; GFX908:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
+  ; GFX908:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; GFX908:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX908:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
+  ; GFX908:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
+  ; GFX908:   BUFFER_ATOMIC_PK_ADD_F16_OFFEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4)
+  ; GFX908:   S_ENDPGM 0
+  ; GFX90A-LABEL: name: raw_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset
+  ; GFX90A: bb.1 (%ir-block.0):
+  ; GFX90A:   liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
+  ; GFX90A:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX90A:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2
+  ; GFX90A:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
+  ; GFX90A:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
+  ; GFX90A:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; GFX90A:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX90A:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
+  ; GFX90A:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
+  ; GFX90A:   [[BUFFER_ATOMIC_PK_ADD_F16_OFFEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_PK_ADD_F16_OFFEN_RTN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4)
+  ; GFX90A:   S_ENDPGM 0
   %ret = call <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
   ret void
 }
 
 define amdgpu_ps void @raw_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset(<2 x half> %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
-  ; CHECK-LABEL: name: raw_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset
-  ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK:   liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0
-  ; CHECK:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-  ; CHECK:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2
-  ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
-  ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
-  ; CHECK:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
-  ; CHECK:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
-  ; CHECK:   BUFFER_ATOMIC_PK_ADD_F16_OFFSET [[COPY]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4)
-  ; CHECK:   S_ENDPGM 0
+  ; GFX908-LABEL: name: raw_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset
+  ; GFX908: bb.1 (%ir-block.0):
+  ; GFX908:   liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0
+  ; GFX908:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX908:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2
+  ; GFX908:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
+  ; GFX908:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
+  ; GFX908:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; GFX908:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
+  ; GFX908:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
+  ; GFX908:   BUFFER_ATOMIC_PK_ADD_F16_OFFSET [[COPY]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4)
+  ; GFX908:   S_ENDPGM 0
+  ; GFX90A-LABEL: name: raw_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset
+  ; GFX90A: bb.1 (%ir-block.0):
+  ; GFX90A:   liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0
+  ; GFX90A:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX90A:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2
+  ; GFX90A:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
+  ; GFX90A:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
+  ; GFX90A:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; GFX90A:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
+  ; GFX90A:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
+  ; GFX90A:   [[BUFFER_ATOMIC_PK_ADD_F16_OFFSET_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_PK_ADD_F16_OFFSET_RTN [[COPY]], [[REG_SEQUENCE]], [[COPY5]], 0, 1, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4)
+  ; GFX90A:   S_ENDPGM 0
   %ret = call <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 0, i32 %soffset, i32 0)
   ret void
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.format.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.format.f16.ll
index 24bab342ab89..1343cb3cd791 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.format.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.format.f16.ll
@@ -14,7 +14,7 @@ define amdgpu_ps half @raw_buffer_load_format_f16__sgpr_rsrc__vgpr_voffset__sgpr
   ; PACKED:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; PACKED:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-  ; PACKED:   [[BUFFER_LOAD_FORMAT_D16_X_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_D16_X_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource", align 1, addrspace 4)
+  ; PACKED:   [[BUFFER_LOAD_FORMAT_D16_X_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_D16_X_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource", align 1, addrspace 4)
   ; PACKED:   $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_D16_X_OFFEN]]
   ; PACKED:   SI_RETURN_TO_EPILOG implicit $vgpr0
   ; UNPACKED-LABEL: name: raw_buffer_load_format_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset
@@ -27,7 +27,7 @@ define amdgpu_ps half @raw_buffer_load_format_f16__sgpr_rsrc__vgpr_voffset__sgpr
   ; UNPACKED:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; UNPACKED:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; UNPACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-  ; UNPACKED:   [[BUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource", align 1, addrspace 4)
+  ; UNPACKED:   [[BUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource", align 1, addrspace 4)
   ; UNPACKED:   $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN]]
   ; UNPACKED:   SI_RETURN_TO_EPILOG implicit $vgpr0
   %val = call half @llvm.amdgcn.raw.buffer.load.format.f16(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
@@ -45,7 +45,7 @@ define amdgpu_ps <2 x half> @raw_buffer_load_format_v2f16__sgpr_rsrc__vgpr_voffs
   ; PACKED:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; PACKED:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-  ; PACKED:   [[BUFFER_LOAD_FORMAT_D16_XY_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_D16_XY_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
+  ; PACKED:   [[BUFFER_LOAD_FORMAT_D16_XY_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_D16_XY_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
   ; PACKED:   $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_D16_XY_OFFEN]]
   ; PACKED:   SI_RETURN_TO_EPILOG implicit $vgpr0
   ; UNPACKED-LABEL: name: raw_buffer_load_format_v2f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset
@@ -58,7 +58,7 @@ define amdgpu_ps <2 x half> @raw_buffer_load_format_v2f16__sgpr_rsrc__vgpr_voffs
   ; UNPACKED:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; UNPACKED:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; UNPACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-  ; UNPACKED:   [[BUFFER_LOAD_FORMAT_D16_XY_gfx80_OFFEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_FORMAT_D16_XY_gfx80_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
+  ; UNPACKED:   [[BUFFER_LOAD_FORMAT_D16_XY_gfx80_OFFEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_FORMAT_D16_XY_gfx80_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
   ; UNPACKED:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XY_gfx80_OFFEN]].sub0
   ; UNPACKED:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XY_gfx80_OFFEN]].sub1
   ; UNPACKED:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
@@ -93,7 +93,7 @@ define amdgpu_ps <4 x half> @raw_buffer_load_format_v4f16__sgpr_rsrc__vgpr_voffs
   ; PACKED:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; PACKED:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-  ; PACKED:   [[BUFFER_LOAD_FORMAT_D16_XYZW_OFFEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_FORMAT_D16_XYZW_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8 from custom "BufferResource", align 1, addrspace 4)
+  ; PACKED:   [[BUFFER_LOAD_FORMAT_D16_XYZW_OFFEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_FORMAT_D16_XYZW_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8 from custom "BufferResource", align 1, addrspace 4)
   ; PACKED:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XYZW_OFFEN]].sub0
   ; PACKED:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XYZW_OFFEN]].sub1
   ; PACKED:   $vgpr0 = COPY [[COPY6]]
@@ -109,7 +109,7 @@ define amdgpu_ps <4 x half> @raw_buffer_load_format_v4f16__sgpr_rsrc__vgpr_voffs
   ; UNPACKED:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; UNPACKED:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; UNPACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-  ; UNPACKED:   [[BUFFER_LOAD_FORMAT_D16_XYZW_gfx80_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_D16_XYZW_gfx80_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8 from custom "BufferResource", align 1, addrspace 4)
+  ; UNPACKED:   [[BUFFER_LOAD_FORMAT_D16_XYZW_gfx80_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_D16_XYZW_gfx80_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8 from custom "BufferResource", align 1, addrspace 4)
   ; UNPACKED:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XYZW_gfx80_OFFEN]].sub0
   ; UNPACKED:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XYZW_gfx80_OFFEN]].sub1
   ; UNPACKED:   [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XYZW_gfx80_OFFEN]].sub2
@@ -169,7 +169,7 @@ define amdgpu_ps half @raw_buffer_load_format_f16__vgpr_rsrc__sgpr_voffset__vgpr
   ; PACKED:   [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
   ; PACKED:   [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY5]], implicit $exec
   ; PACKED:   [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc
-  ; PACKED:   [[BUFFER_LOAD_FORMAT_D16_X_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_D16_X_OFFEN [[COPY6]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource", align 1, addrspace 4)
+  ; PACKED:   [[BUFFER_LOAD_FORMAT_D16_X_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_D16_X_OFFEN [[COPY6]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource", align 1, addrspace 4)
   ; PACKED:   [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec
   ; PACKED:   $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
   ; PACKED:   S_CBRANCH_EXECNZ %bb.2, implicit $exec
@@ -209,7 +209,7 @@ define amdgpu_ps half @raw_buffer_load_format_f16__vgpr_rsrc__sgpr_voffset__vgpr
   ; UNPACKED:   [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
   ; UNPACKED:   [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY5]], implicit $exec
   ; UNPACKED:   [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc
-  ; UNPACKED:   [[BUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN [[COPY6]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource", align 1, addrspace 4)
+  ; UNPACKED:   [[BUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN [[COPY6]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource", align 1, addrspace 4)
   ; UNPACKED:   [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec
   ; UNPACKED:   $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
   ; UNPACKED:   S_CBRANCH_EXECNZ %bb.2, implicit $exec
@@ -234,7 +234,7 @@ define amdgpu_ps <4 x half> @raw_buffer_load_format_v4f16__sgpr_rsrc__vgpr_voffs
   ; PACKED:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; PACKED:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-  ; PACKED:   [[BUFFER_LOAD_FORMAT_D16_XYZW_OFFEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_FORMAT_D16_XYZW_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8 from custom "BufferResource" + 4095, align 1, addrspace 4)
+  ; PACKED:   [[BUFFER_LOAD_FORMAT_D16_XYZW_OFFEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_FORMAT_D16_XYZW_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8 from custom "BufferResource" + 4095, align 1, addrspace 4)
   ; PACKED:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XYZW_OFFEN]].sub0
   ; PACKED:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XYZW_OFFEN]].sub1
   ; PACKED:   $vgpr0 = COPY [[COPY6]]
@@ -250,7 +250,7 @@ define amdgpu_ps <4 x half> @raw_buffer_load_format_v4f16__sgpr_rsrc__vgpr_voffs
   ; UNPACKED:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; UNPACKED:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; UNPACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-  ; UNPACKED:   [[BUFFER_LOAD_FORMAT_D16_XYZW_gfx80_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_D16_XYZW_gfx80_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8 from custom "BufferResource" + 4095, align 1, addrspace 4)
+  ; UNPACKED:   [[BUFFER_LOAD_FORMAT_D16_XYZW_gfx80_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_D16_XYZW_gfx80_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8 from custom "BufferResource" + 4095, align 1, addrspace 4)
   ; UNPACKED:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XYZW_gfx80_OFFEN]].sub0
   ; UNPACKED:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XYZW_gfx80_OFFEN]].sub1
   ; UNPACKED:   [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XYZW_gfx80_OFFEN]].sub2

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.format.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.format.ll
index 3674d4065744..6285932fc751 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.format.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.format.ll
@@ -13,7 +13,7 @@ define amdgpu_ps float @raw_buffer_load_format_f32__sgpr_rsrc__vgpr_voffset__sgp
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-  ; CHECK:   [[BUFFER_LOAD_FORMAT_X_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:   [[BUFFER_LOAD_FORMAT_X_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_X_OFFEN]]
   ; CHECK:   SI_RETURN_TO_EPILOG implicit $vgpr0
   %val = call float @llvm.amdgcn.raw.buffer.load.format.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
@@ -31,7 +31,7 @@ define amdgpu_ps <2 x float> @raw_buffer_load_format_v2f32__sgpr_rsrc__vgpr_voff
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-  ; CHECK:   [[BUFFER_LOAD_FORMAT_XY_OFFEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_FORMAT_XY_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8 from custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:   [[BUFFER_LOAD_FORMAT_XY_OFFEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_FORMAT_XY_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8 from custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XY_OFFEN]].sub0
   ; CHECK:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XY_OFFEN]].sub1
   ; CHECK:   $vgpr0 = COPY [[COPY6]]
@@ -52,7 +52,7 @@ define amdgpu_ps <3 x float> @raw_buffer_load_format_v3f32__sgpr_rsrc__vgpr_voff
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-  ; CHECK:   [[BUFFER_LOAD_FORMAT_XYZ_OFFEN:%[0-9]+]]:vreg_96 = BUFFER_LOAD_FORMAT_XYZ_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12 from custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:   [[BUFFER_LOAD_FORMAT_XYZ_OFFEN:%[0-9]+]]:vreg_96 = BUFFER_LOAD_FORMAT_XYZ_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12 from custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZ_OFFEN]].sub0
   ; CHECK:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZ_OFFEN]].sub1
   ; CHECK:   [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZ_OFFEN]].sub2
@@ -75,7 +75,7 @@ define amdgpu_ps <4 x float> @raw_buffer_load_format_v4f32__sgpr_rsrc__vgpr_voff
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-  ; CHECK:   [[BUFFER_LOAD_FORMAT_XYZW_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:   [[BUFFER_LOAD_FORMAT_XYZW_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_OFFEN]].sub0
   ; CHECK:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_OFFEN]].sub1
   ; CHECK:   [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_OFFEN]].sub2
@@ -121,7 +121,7 @@ define amdgpu_ps float @raw_buffer_load_format_f32__vgpr_rsrc__sgpr_voffset__vgp
   ; CHECK:   [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
   ; CHECK:   [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY5]], implicit $exec
   ; CHECK:   [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc
-  ; CHECK:   [[BUFFER_LOAD_FORMAT_X_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_OFFEN [[COPY6]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:   [[BUFFER_LOAD_FORMAT_X_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_OFFEN [[COPY6]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec
   ; CHECK:   $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
   ; CHECK:   S_CBRANCH_EXECNZ %bb.2, implicit $exec
@@ -146,7 +146,7 @@ define amdgpu_ps <4 x float> @raw_buffer_load_format_v4f32__sgpr_rsrc__vgpr_voff
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-  ; CHECK:   [[BUFFER_LOAD_FORMAT_XYZW_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource" + 4095, align 1, addrspace 4)
+  ; CHECK:   [[BUFFER_LOAD_FORMAT_XYZW_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource" + 4095, align 1, addrspace 4)
   ; CHECK:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_OFFEN]].sub0
   ; CHECK:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_OFFEN]].sub1
   ; CHECK:   [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_OFFEN]].sub2

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.ll
index bd83f73976ff..b5d0b1725be0 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.ll
@@ -14,7 +14,7 @@ define amdgpu_ps float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffs
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-  ; CHECK:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
   ; CHECK:   SI_RETURN_TO_EPILOG implicit $vgpr0
   %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
@@ -34,7 +34,7 @@ define amdgpu_ps float @raw_buffer_load_f32__sgpr_rsrc__sgpr_voffset__sgpr_soffs
   ; CHECK:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr7
   ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY4]]
-  ; CHECK:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY6]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY6]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
   ; CHECK:   SI_RETURN_TO_EPILOG implicit $vgpr0
   %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
@@ -69,7 +69,7 @@ define amdgpu_ps float @raw_buffer_load_f32__vgpr_rsrc__vgpr_voffset__sgpr_soffs
   ; CHECK:   [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY7]], implicit $exec
   ; CHECK:   [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc
   ; CHECK:   [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
-  ; CHECK:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE3]], [[COPY5]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE3]], [[COPY5]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
   ; CHECK:   $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
   ; CHECK:   S_CBRANCH_EXECNZ %bb.2, implicit $exec
@@ -114,7 +114,7 @@ define amdgpu_ps float @raw_buffer_load_f32__vgpr_rsrc__vgpr_voffset__vgpr_soffs
   ; CHECK:   [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
   ; CHECK:   [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY5]], implicit $exec
   ; CHECK:   [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc
-  ; CHECK:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec
   ; CHECK:   $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
   ; CHECK:   S_CBRANCH_EXECNZ %bb.2, implicit $exec
@@ -140,7 +140,7 @@ define amdgpu_ps float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffs
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-  ; CHECK:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 1, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 1, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
   ; CHECK:   SI_RETURN_TO_EPILOG implicit $vgpr0
   %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 1)
@@ -159,7 +159,7 @@ define amdgpu_ps float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffs
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-  ; CHECK:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 1, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 1, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
   ; CHECK:   SI_RETURN_TO_EPILOG implicit $vgpr0
   %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 2)
@@ -178,7 +178,7 @@ define amdgpu_ps float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffs
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-  ; CHECK:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 1, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 1, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
   ; CHECK:   SI_RETURN_TO_EPILOG implicit $vgpr0
   %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 4)
@@ -197,7 +197,7 @@ define amdgpu_ps float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffs
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-  ; CHECK:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 1, 0, 1, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 1, 0, 1, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
   ; CHECK:   SI_RETURN_TO_EPILOG implicit $vgpr0
   %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 6)
@@ -216,7 +216,7 @@ define amdgpu_ps float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffs
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-  ; CHECK:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 1, 0, 0, 1, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 1, 0, 0, 1, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
   ; CHECK:   SI_RETURN_TO_EPILOG implicit $vgpr0
   %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 5)
@@ -235,7 +235,7 @@ define amdgpu_ps float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffs
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-  ; CHECK:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 1, 1, 0, 1, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 1, 1, 0, 1, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
   ; CHECK:   SI_RETURN_TO_EPILOG implicit $vgpr0
   %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 7)
@@ -254,7 +254,7 @@ define amdgpu_ps <2 x float> @raw_buffer_load_v2f32__sgpr_rsrc__vgpr_voffset__sg
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-  ; CHECK:   [[BUFFER_LOAD_DWORDX2_OFFEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8 from custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:   [[BUFFER_LOAD_DWORDX2_OFFEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8 from custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_OFFEN]].sub0
   ; CHECK:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_OFFEN]].sub1
   ; CHECK:   $vgpr0 = COPY [[COPY6]]
@@ -275,7 +275,7 @@ define amdgpu_ps <3 x float> @raw_buffer_load_v3f32__sgpr_rsrc__vgpr_voffset__sg
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-  ; CHECK:   [[BUFFER_LOAD_DWORDX3_OFFEN:%[0-9]+]]:vreg_96 = BUFFER_LOAD_DWORDX3_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12 from custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:   [[BUFFER_LOAD_DWORDX3_OFFEN:%[0-9]+]]:vreg_96 = BUFFER_LOAD_DWORDX3_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12 from custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_OFFEN]].sub0
   ; CHECK:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_OFFEN]].sub1
   ; CHECK:   [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_OFFEN]].sub2
@@ -298,7 +298,7 @@ define amdgpu_ps <4 x float> @raw_buffer_load_v4f32__sgpr_rsrc__vgpr_voffset__sg
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-  ; CHECK:   [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:   [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub0
   ; CHECK:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub1
   ; CHECK:   [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub2
@@ -323,7 +323,7 @@ define amdgpu_ps half @raw_buffer_load_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffse
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-  ; CHECK:   [[BUFFER_LOAD_USHORT_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:   [[BUFFER_LOAD_USHORT_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   $vgpr0 = COPY [[BUFFER_LOAD_USHORT_OFFEN]]
   ; CHECK:   SI_RETURN_TO_EPILOG implicit $vgpr0
   %val = call half @llvm.amdgcn.raw.buffer.load.f16(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
@@ -341,7 +341,7 @@ define amdgpu_ps <2 x half> @raw_buffer_load_v2f16__sgpr_rsrc__vgpr_voffset__sgp
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-  ; CHECK:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
   ; CHECK:   SI_RETURN_TO_EPILOG implicit $vgpr0
   %val = call <2 x half> @llvm.amdgcn.raw.buffer.load.v2f16(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
@@ -365,7 +365,7 @@ define amdgpu_ps <4 x half> @raw_buffer_load_v4f16__sgpr_rsrc__vgpr_voffset__sgp
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-  ; CHECK:   [[BUFFER_LOAD_DWORDX2_OFFEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8 from custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:   [[BUFFER_LOAD_DWORDX2_OFFEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8 from custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_OFFEN]].sub0
   ; CHECK:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_OFFEN]].sub1
   ; CHECK:   $vgpr0 = COPY [[COPY6]]
@@ -386,7 +386,7 @@ define amdgpu_ps float @raw_buffer_load_i8__sgpr_rsrc__vgpr_voffset__sgpr_soffse
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-  ; CHECK:   [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 1 from custom "BufferResource", addrspace 4)
+  ; CHECK:   [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 1 from custom "BufferResource", addrspace 4)
   ; CHECK:   $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
   ; CHECK:   SI_RETURN_TO_EPILOG implicit $vgpr0
   %val = call i8 @llvm.amdgcn.raw.buffer.load.i8(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
@@ -406,7 +406,7 @@ define amdgpu_ps float @raw_buffer_load_i8__sgpr_rsrc__vgpr_voffset__sgpr_soffse
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-  ; CHECK:   [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 1 from custom "BufferResource", addrspace 4)
+  ; CHECK:   [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 1 from custom "BufferResource", addrspace 4)
   ; CHECK:   [[V_BFE_I32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_I32_e64 [[BUFFER_LOAD_UBYTE_OFFEN]], 0, 8, implicit $exec
   ; CHECK:   $vgpr0 = COPY [[V_BFE_I32_e64_]]
   ; CHECK:   SI_RETURN_TO_EPILOG implicit $vgpr0
@@ -444,7 +444,7 @@ define amdgpu_ps half @raw_buffer_load_f16__vgpr_rsrc__vgpr_voffset__sgpr_soffse
   ; CHECK:   [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY7]], implicit $exec
   ; CHECK:   [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc
   ; CHECK:   [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
-  ; CHECK:   [[BUFFER_LOAD_USHORT_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[COPY4]], [[REG_SEQUENCE3]], [[COPY5]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:   [[BUFFER_LOAD_USHORT_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[COPY4]], [[REG_SEQUENCE3]], [[COPY5]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
   ; CHECK:   $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
   ; CHECK:   S_CBRANCH_EXECNZ %bb.2, implicit $exec
@@ -486,7 +486,7 @@ define amdgpu_ps float @raw_buffer_load_i8__vgpr_rsrc__vgpr_voffset__sgpr_soffse
   ; CHECK:   [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY7]], implicit $exec
   ; CHECK:   [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc
   ; CHECK:   [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
-  ; CHECK:   [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[COPY4]], [[REG_SEQUENCE3]], [[COPY5]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 1 from custom "BufferResource", addrspace 4)
+  ; CHECK:   [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[COPY4]], [[REG_SEQUENCE3]], [[COPY5]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 1 from custom "BufferResource", addrspace 4)
   ; CHECK:   [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
   ; CHECK:   $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
   ; CHECK:   S_CBRANCH_EXECNZ %bb.2, implicit $exec
@@ -512,7 +512,7 @@ define amdgpu_ps float @raw_buffer_load_f32__sgpr_rsrc__vdpr_voffset__sgpr_soffs
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
   ; CHECK:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-  ; CHECK:   [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[REG_SEQUENCE]], [[COPY4]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:   [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[REG_SEQUENCE]], [[COPY4]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFSET]]
   ; CHECK:   SI_RETURN_TO_EPILOG implicit $vgpr0
   %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 %soffset, i32 0)
@@ -529,7 +529,7 @@ define amdgpu_ps float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffs
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
   ; CHECK:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-  ; CHECK:   [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[REG_SEQUENCE]], [[COPY4]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource" + 4095, align 1, addrspace 4)
+  ; CHECK:   [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[REG_SEQUENCE]], [[COPY4]], 4095, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource" + 4095, align 1, addrspace 4)
   ; CHECK:   $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFSET]]
   ; CHECK:   SI_RETURN_TO_EPILOG implicit $vgpr0
   %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 4095, i32 %soffset, i32 0)
@@ -548,7 +548,7 @@ define amdgpu_ps float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffs
   ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; CHECK:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY5]], [[REG_SEQUENCE]], [[COPY4]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource" + 4096, align 1, addrspace 4)
+  ; CHECK:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY5]], [[REG_SEQUENCE]], [[COPY4]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource" + 4096, align 1, addrspace 4)
   ; CHECK:   $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
   ; CHECK:   SI_RETURN_TO_EPILOG implicit $vgpr0
   %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 4096, i32 %soffset, i32 0)
@@ -566,7 +566,7 @@ define amdgpu_ps float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffs
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-  ; CHECK:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource" + 16, align 1, addrspace 4)
+  ; CHECK:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 16, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource" + 16, align 1, addrspace 4)
   ; CHECK:   $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
   ; CHECK:   SI_RETURN_TO_EPILOG implicit $vgpr0
   %voffset = add i32 %voffset.base, 16
@@ -585,7 +585,7 @@ define amdgpu_ps float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffs
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-  ; CHECK:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource" + 4095, align 1, addrspace 4)
+  ; CHECK:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource" + 4095, align 1, addrspace 4)
   ; CHECK:   $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
   ; CHECK:   SI_RETURN_TO_EPILOG implicit $vgpr0
   %voffset = add i32 %voffset.base, 4095
@@ -607,7 +607,7 @@ define amdgpu_ps float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffs
   ; CHECK:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
   ; CHECK:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
   ; CHECK:   %10:vgpr_32, dead %15:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY4]], [[COPY6]], 0, implicit $exec
-  ; CHECK:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %10, [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource" + 4096, align 1, addrspace 4)
+  ; CHECK:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %10, [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource" + 4096, align 1, addrspace 4)
   ; CHECK:   $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
   ; CHECK:   SI_RETURN_TO_EPILOG implicit $vgpr0
   %voffset = add i32 %voffset.base, 4096
@@ -626,7 +626,7 @@ define amdgpu_ps float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffs
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4095
-  ; CHECK:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
   ; CHECK:   SI_RETURN_TO_EPILOG implicit $vgpr0
   %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 4095, i32 0)
@@ -644,7 +644,7 @@ define amdgpu_ps float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffs
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
-  ; CHECK:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
   ; CHECK:   SI_RETURN_TO_EPILOG implicit $vgpr0
   %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 4096, i32 0)
@@ -664,7 +664,7 @@ define amdgpu_ps float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffs
   ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 16
   ; CHECK:   [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY5]], [[S_MOV_B32_]], implicit-def $scc
-  ; CHECK:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_ADD_I32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_ADD_I32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
   ; CHECK:   SI_RETURN_TO_EPILOG implicit $vgpr0
   %soffset = add i32 %soffset.base, 16
@@ -685,7 +685,7 @@ define amdgpu_ps float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffs
   ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4095
   ; CHECK:   [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY5]], [[S_MOV_B32_]], implicit-def $scc
-  ; CHECK:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_ADD_I32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_ADD_I32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
   ; CHECK:   SI_RETURN_TO_EPILOG implicit $vgpr0
   %soffset = add i32 %soffset.base, 4095
@@ -706,7 +706,7 @@ define amdgpu_ps float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffs
   ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
   ; CHECK:   [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY5]], [[S_MOV_B32_]], implicit-def $scc
-  ; CHECK:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_ADD_I32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_ADD_I32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
   ; CHECK:   SI_RETURN_TO_EPILOG implicit $vgpr0
   %soffset = add i32 %soffset.base, 4096
@@ -744,7 +744,7 @@ define amdgpu_ps float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffs
   ; CHECK:   [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY7]], implicit $exec
   ; CHECK:   [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc
   ; CHECK:   [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
-  ; CHECK:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE3]], [[S_ADD_I32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE3]], [[S_ADD_I32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
   ; CHECK:   $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
   ; CHECK:   S_CBRANCH_EXECNZ %bb.2, implicit $exec
@@ -790,7 +790,7 @@ define amdgpu_ps float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffs
   ; CHECK:   [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY8]], implicit $exec
   ; CHECK:   [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc
   ; CHECK:   [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
-  ; CHECK:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %13, [[REG_SEQUENCE3]], [[COPY5]], 904, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource" + 5000, align 1, addrspace 4)
+  ; CHECK:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %13, [[REG_SEQUENCE3]], [[COPY5]], 904, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource" + 5000, align 1, addrspace 4)
   ; CHECK:   [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
   ; CHECK:   $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
   ; CHECK:   S_CBRANCH_EXECNZ %bb.2, implicit $exec

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f16.ll
index 54f1e8adcdc2..b084e93ab65d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f16.ll
@@ -14,7 +14,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse
   ; UNPACKED:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; UNPACKED:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; UNPACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-  ; UNPACKED:   BUFFER_STORE_FORMAT_D16_X_gfx80_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "BufferResource", align 1, addrspace 4)
+  ; UNPACKED:   BUFFER_STORE_FORMAT_D16_X_gfx80_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "BufferResource", align 1, addrspace 4)
   ; UNPACKED:   S_ENDPGM 0
   ; PACKED-LABEL: name: raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_f16
   ; PACKED: bb.1 (%ir-block.0):
@@ -27,7 +27,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse
   ; PACKED:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; PACKED:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-  ; PACKED:   BUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "BufferResource", align 1, addrspace 4)
+  ; PACKED:   BUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "BufferResource", align 1, addrspace 4)
   ; PACKED:   S_ENDPGM 0
   call void @llvm.amdgcn.raw.buffer.store.format.f16(half %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
   ret void
@@ -44,7 +44,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__voffset_409
   ; UNPACKED:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; UNPACKED:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; UNPACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-  ; UNPACKED:   BUFFER_STORE_FORMAT_D16_X_gfx80_OFFSET_exact [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "BufferResource" + 4095, align 1, addrspace 4)
+  ; UNPACKED:   BUFFER_STORE_FORMAT_D16_X_gfx80_OFFSET_exact [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "BufferResource" + 4095, align 1, addrspace 4)
   ; UNPACKED:   S_ENDPGM 0
   ; PACKED-LABEL: name: raw_buffer_store_format__sgpr_rsrc__vgpr_val__voffset_4095__sgpr_soffset_f16
   ; PACKED: bb.1 (%ir-block.0):
@@ -56,7 +56,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__voffset_409
   ; PACKED:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; PACKED:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-  ; PACKED:   BUFFER_STORE_FORMAT_D16_X_OFFSET_exact [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "BufferResource" + 4095, align 1, addrspace 4)
+  ; PACKED:   BUFFER_STORE_FORMAT_D16_X_OFFSET_exact [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "BufferResource" + 4095, align 1, addrspace 4)
   ; PACKED:   S_ENDPGM 0
   call void @llvm.amdgcn.raw.buffer.store.format.f16(half %val, <4 x i32> %rsrc, i32 4095, i32 %soffset, i32 0)
   ret void
@@ -78,7 +78,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse
   ; UNPACKED:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
   ; UNPACKED:   [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY7]], [[COPY4]], implicit $exec
   ; UNPACKED:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1
-  ; UNPACKED:   BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4)
+  ; UNPACKED:   BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4)
   ; UNPACKED:   S_ENDPGM 0
   ; PACKED-LABEL: name: raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f16
   ; PACKED: bb.1 (%ir-block.0):
@@ -91,7 +91,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse
   ; PACKED:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; PACKED:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-  ; PACKED:   BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4)
+  ; PACKED:   BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4)
   ; PACKED:   S_ENDPGM 0
   call void @llvm.amdgcn.raw.buffer.store.format.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
   ret void
@@ -116,7 +116,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse
   ; UNPACKED:   [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
   ; UNPACKED:   [[V_LSHRREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY9]], [[COPY5]], implicit $exec
   ; UNPACKED:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[V_LSHRREV_B32_e64_1]], %subreg.sub3
-  ; UNPACKED:   BUFFER_STORE_FORMAT_D16_XYZW_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "BufferResource", align 1, addrspace 4)
+  ; UNPACKED:   BUFFER_STORE_FORMAT_D16_XYZW_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "BufferResource", align 1, addrspace 4)
   ; UNPACKED:   S_ENDPGM 0
   ; PACKED-LABEL: name: raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v4f16
   ; PACKED: bb.1 (%ir-block.0):
@@ -131,7 +131,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse
   ; PACKED:   [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; PACKED:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
-  ; PACKED:   BUFFER_STORE_FORMAT_D16_XYZW_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "BufferResource", align 1, addrspace 4)
+  ; PACKED:   BUFFER_STORE_FORMAT_D16_XYZW_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "BufferResource", align 1, addrspace 4)
   ; PACKED:   S_ENDPGM 0
   call void @llvm.amdgcn.raw.buffer.store.format.v4f16(<4 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
   ret void
@@ -173,7 +173,7 @@ define amdgpu_ps void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffse
   ; UNPACKED:   [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[COPY11]], implicit $exec
   ; UNPACKED:   [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc
   ; UNPACKED:   [[REG_SEQUENCE4:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
-  ; UNPACKED:   BUFFER_STORE_FORMAT_D16_XYZW_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE4]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "BufferResource", align 1, addrspace 4)
+  ; UNPACKED:   BUFFER_STORE_FORMAT_D16_XYZW_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE4]], [[COPY7]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "BufferResource", align 1, addrspace 4)
   ; UNPACKED:   [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
   ; UNPACKED:   $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
   ; UNPACKED:   S_CBRANCH_EXECNZ %bb.2, implicit $exec
@@ -211,7 +211,7 @@ define amdgpu_ps void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffse
   ; PACKED:   [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[COPY9]], implicit $exec
   ; PACKED:   [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc
   ; PACKED:   [[REG_SEQUENCE4:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
-  ; PACKED:   BUFFER_STORE_FORMAT_D16_XYZW_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE4]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "BufferResource", align 1, addrspace 4)
+  ; PACKED:   BUFFER_STORE_FORMAT_D16_XYZW_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE4]], [[COPY7]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "BufferResource", align 1, addrspace 4)
   ; PACKED:   [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
   ; PACKED:   $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
   ; PACKED:   S_CBRANCH_EXECNZ %bb.2, implicit $exec
@@ -240,7 +240,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse
   ; UNPACKED:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
   ; UNPACKED:   [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY6]], [[COPY4]], implicit $exec
   ; UNPACKED:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1
-  ; UNPACKED:   BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4)
+  ; UNPACKED:   BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4)
   ; UNPACKED:   S_ENDPGM 0
   ; PACKED-LABEL: name: raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f16_soffset4095
   ; PACKED: bb.1 (%ir-block.0):
@@ -253,7 +253,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse
   ; PACKED:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; PACKED:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4095
-  ; PACKED:   BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4)
+  ; PACKED:   BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4)
   ; PACKED:   S_ENDPGM 0
   call void @llvm.amdgcn.raw.buffer.store.format.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 4095, i32 0)
   ret void
@@ -275,7 +275,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse
   ; UNPACKED:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
   ; UNPACKED:   [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY6]], [[COPY4]], implicit $exec
   ; UNPACKED:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1
-  ; UNPACKED:   BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4)
+  ; UNPACKED:   BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4)
   ; UNPACKED:   S_ENDPGM 0
   ; PACKED-LABEL: name: raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f16_soffset4096
   ; PACKED: bb.1 (%ir-block.0):
@@ -288,7 +288,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse
   ; PACKED:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; PACKED:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
-  ; PACKED:   BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4)
+  ; PACKED:   BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4)
   ; PACKED:   S_ENDPGM 0
   call void @llvm.amdgcn.raw.buffer.store.format.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 4096, i32 0)
   ret void
@@ -310,7 +310,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse
   ; UNPACKED:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
   ; UNPACKED:   [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY7]], [[COPY4]], implicit $exec
   ; UNPACKED:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1
-  ; UNPACKED:   BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource" + 16, align 1, addrspace 4)
+  ; UNPACKED:   BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 16, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource" + 16, align 1, addrspace 4)
   ; UNPACKED:   S_ENDPGM 0
   ; PACKED-LABEL: name: raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f16_voffset_add_16
   ; PACKED: bb.1 (%ir-block.0):
@@ -323,7 +323,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse
   ; PACKED:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; PACKED:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-  ; PACKED:   BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource" + 16, align 1, addrspace 4)
+  ; PACKED:   BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 16, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource" + 16, align 1, addrspace 4)
   ; PACKED:   S_ENDPGM 0
   %voffset.add = add i32 %voffset, 16
   call void @llvm.amdgcn.raw.buffer.store.format.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0)
@@ -346,7 +346,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse
   ; UNPACKED:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
   ; UNPACKED:   [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY7]], [[COPY4]], implicit $exec
   ; UNPACKED:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1
-  ; UNPACKED:   BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource" + 4095, align 1, addrspace 4)
+  ; UNPACKED:   BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource" + 4095, align 1, addrspace 4)
   ; UNPACKED:   S_ENDPGM 0
   ; PACKED-LABEL: name: raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f16_voffset_add_4095
   ; PACKED: bb.1 (%ir-block.0):
@@ -359,7 +359,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse
   ; PACKED:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; PACKED:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-  ; PACKED:   BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource" + 4095, align 1, addrspace 4)
+  ; PACKED:   BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource" + 4095, align 1, addrspace 4)
   ; PACKED:   S_ENDPGM 0
   %voffset.add = add i32 %voffset, 4095
   call void @llvm.amdgcn.raw.buffer.store.format.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0)
@@ -385,7 +385,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse
   ; UNPACKED:   [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
   ; UNPACKED:   [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY8]], [[COPY4]], implicit $exec
   ; UNPACKED:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1
-  ; UNPACKED:   BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], %11, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource" + 4096, align 1, addrspace 4)
+  ; UNPACKED:   BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], %11, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource" + 4096, align 1, addrspace 4)
   ; UNPACKED:   S_ENDPGM 0
   ; PACKED-LABEL: name: raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f16_voffset_add_4096
   ; PACKED: bb.1 (%ir-block.0):
@@ -401,7 +401,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse
   ; PACKED:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
   ; PACKED:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
   ; PACKED:   %11:vgpr_32, dead %15:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY5]], [[COPY7]], 0, implicit $exec
-  ; PACKED:   BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], %11, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource" + 4096, align 1, addrspace 4)
+  ; PACKED:   BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], %11, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource" + 4096, align 1, addrspace 4)
   ; PACKED:   S_ENDPGM 0
   %voffset.add = add i32 %voffset, 4096
   call void @llvm.amdgcn.raw.buffer.store.format.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0)
@@ -448,7 +448,7 @@ define amdgpu_ps void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffse
   ; UNPACKED:   [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[COPY12]], implicit $exec
   ; UNPACKED:   [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc
   ; UNPACKED:   [[REG_SEQUENCE4:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
-  ; UNPACKED:   BUFFER_STORE_FORMAT_D16_XYZW_gfx80_OFFEN_exact [[REG_SEQUENCE1]], %13, [[REG_SEQUENCE4]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "BufferResource" + 4096, align 1, addrspace 4)
+  ; UNPACKED:   BUFFER_STORE_FORMAT_D16_XYZW_gfx80_OFFEN_exact [[REG_SEQUENCE1]], %13, [[REG_SEQUENCE4]], [[COPY7]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "BufferResource" + 4096, align 1, addrspace 4)
   ; UNPACKED:   [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
   ; UNPACKED:   $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
   ; UNPACKED:   S_CBRANCH_EXECNZ %bb.2, implicit $exec
@@ -489,7 +489,7 @@ define amdgpu_ps void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffse
   ; PACKED:   [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[COPY10]], implicit $exec
   ; PACKED:   [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc
   ; PACKED:   [[REG_SEQUENCE4:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
-  ; PACKED:   BUFFER_STORE_FORMAT_D16_XYZW_OFFEN_exact [[REG_SEQUENCE1]], %13, [[REG_SEQUENCE4]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "BufferResource" + 4096, align 1, addrspace 4)
+  ; PACKED:   BUFFER_STORE_FORMAT_D16_XYZW_OFFEN_exact [[REG_SEQUENCE1]], %13, [[REG_SEQUENCE4]], [[COPY7]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "BufferResource" + 4096, align 1, addrspace 4)
   ; PACKED:   [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
   ; PACKED:   $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
   ; PACKED:   S_CBRANCH_EXECNZ %bb.2, implicit $exec

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f32.ll
index eed3f119983c..762708fbac94 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f32.ll
@@ -14,7 +14,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-  ; CHECK:   BUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:   BUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   S_ENDPGM 0
   call void @llvm.amdgcn.raw.buffer.store.format.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
   ret void
@@ -31,7 +31,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__voffset_409
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-  ; CHECK:   BUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource" + 4095, align 1, addrspace 4)
+  ; CHECK:   BUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource" + 4095, align 1, addrspace 4)
   ; CHECK:   S_ENDPGM 0
   call void @llvm.amdgcn.raw.buffer.store.format.f32(float %val, <4 x i32> %rsrc, i32 4095, i32 %soffset, i32 0)
   ret void
@@ -51,7 +51,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse
   ; CHECK:   [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
-  ; CHECK:   BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:   BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   S_ENDPGM 0
   call void @llvm.amdgcn.raw.buffer.store.format.v2f32(<2 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
   ret void
@@ -72,7 +72,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse
   ; CHECK:   [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2
-  ; CHECK:   BUFFER_STORE_FORMAT_XYZ_OFFEN_exact [[REG_SEQUENCE1]], [[COPY7]], [[REG_SEQUENCE]], [[COPY8]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12 into custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:   BUFFER_STORE_FORMAT_XYZ_OFFEN_exact [[REG_SEQUENCE1]], [[COPY7]], [[REG_SEQUENCE]], [[COPY8]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12 into custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   S_ENDPGM 0
   call void @llvm.amdgcn.raw.buffer.store.format.v3f32(<3 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
   ret void
@@ -94,7 +94,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse
   ; CHECK:   [[COPY9:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3
-  ; CHECK:   BUFFER_STORE_FORMAT_XYZW_OFFEN_exact [[REG_SEQUENCE1]], [[COPY8]], [[REG_SEQUENCE]], [[COPY9]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:   BUFFER_STORE_FORMAT_XYZW_OFFEN_exact [[REG_SEQUENCE1]], [[COPY8]], [[REG_SEQUENCE]], [[COPY9]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   S_ENDPGM 0
   call void @llvm.amdgcn.raw.buffer.store.format.v4f32(<4 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
   ret void
@@ -132,7 +132,7 @@ define amdgpu_ps void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffse
   ; CHECK:   [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[COPY11]], implicit $exec
   ; CHECK:   [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc
   ; CHECK:   [[REG_SEQUENCE4:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
-  ; CHECK:   BUFFER_STORE_FORMAT_XYZW_OFFEN_exact [[REG_SEQUENCE1]], [[COPY8]], [[REG_SEQUENCE4]], [[COPY9]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:   BUFFER_STORE_FORMAT_XYZW_OFFEN_exact [[REG_SEQUENCE1]], [[COPY8]], [[REG_SEQUENCE4]], [[COPY9]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
   ; CHECK:   $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
   ; CHECK:   S_CBRANCH_EXECNZ %bb.2, implicit $exec
@@ -159,7 +159,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse
   ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
   ; CHECK:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4095
-  ; CHECK:   BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:   BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   S_ENDPGM 0
   call void @llvm.amdgcn.raw.buffer.store.format.v2f32(<2 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 4095, i32 0)
   ret void
@@ -179,7 +179,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse
   ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
   ; CHECK:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
-  ; CHECK:   BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:   BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   S_ENDPGM 0
   call void @llvm.amdgcn.raw.buffer.store.format.v2f32(<2 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 4096, i32 0)
   ret void
@@ -199,7 +199,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse
   ; CHECK:   [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
-  ; CHECK:   BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "BufferResource" + 16, align 1, addrspace 4)
+  ; CHECK:   BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 16, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "BufferResource" + 16, align 1, addrspace 4)
   ; CHECK:   S_ENDPGM 0
   %voffset.add = add i32 %voffset, 16
   call void @llvm.amdgcn.raw.buffer.store.format.v2f32(<2 x float> %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0)
@@ -220,7 +220,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse
   ; CHECK:   [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
-  ; CHECK:   BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "BufferResource" + 4095, align 1, addrspace 4)
+  ; CHECK:   BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 4095, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "BufferResource" + 4095, align 1, addrspace 4)
   ; CHECK:   S_ENDPGM 0
   %voffset.add = add i32 %voffset, 4095
   call void @llvm.amdgcn.raw.buffer.store.format.v2f32(<2 x float> %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0)
@@ -244,7 +244,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse
   ; CHECK:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
   ; CHECK:   [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
   ; CHECK:   %13:vgpr_32, dead %17:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY6]], [[COPY8]], 0, implicit $exec
-  ; CHECK:   BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], %13, [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "BufferResource" + 4096, align 1, addrspace 4)
+  ; CHECK:   BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], %13, [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "BufferResource" + 4096, align 1, addrspace 4)
   ; CHECK:   S_ENDPGM 0
   %voffset.add = add i32 %voffset, 4096
   call void @llvm.amdgcn.raw.buffer.store.format.v2f32(<2 x float> %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0)
@@ -288,7 +288,7 @@ define amdgpu_ps void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffse
   ; CHECK:   [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[COPY12]], implicit $exec
   ; CHECK:   [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc
   ; CHECK:   [[REG_SEQUENCE4:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
-  ; CHECK:   BUFFER_STORE_FORMAT_XYZW_OFFEN_exact [[REG_SEQUENCE1]], %15, [[REG_SEQUENCE4]], [[COPY9]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource" + 4096, align 1, addrspace 4)
+  ; CHECK:   BUFFER_STORE_FORMAT_XYZW_OFFEN_exact [[REG_SEQUENCE1]], %15, [[REG_SEQUENCE4]], [[COPY9]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource" + 4096, align 1, addrspace 4)
   ; CHECK:   [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
   ; CHECK:   $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
   ; CHECK:   S_CBRANCH_EXECNZ %bb.2, implicit $exec

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.ll
index 36ac0b295238..c6160e342036 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.ll
@@ -15,7 +15,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-  ; CHECK:   BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:   BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   S_ENDPGM 0
   call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
   ret void
@@ -36,7 +36,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__sgpr_val__sgpr_voffset__sgpr
   ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]]
   ; CHECK:   [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
-  ; CHECK:   BUFFER_STORE_DWORD_OFFEN_exact [[COPY7]], [[COPY8]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:   BUFFER_STORE_DWORD_OFFEN_exact [[COPY7]], [[COPY8]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   S_ENDPGM 0
   call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
   ret void
@@ -71,7 +71,7 @@ define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr
   ; CHECK:   [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY8]], implicit $exec
   ; CHECK:   [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc
   ; CHECK:   [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
-  ; CHECK:   BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE3]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:   BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE3]], [[COPY6]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
   ; CHECK:   $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
   ; CHECK:   S_CBRANCH_EXECNZ %bb.2, implicit $exec
@@ -103,7 +103,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__vgpr
   ; CHECK:   successors: %bb.3(0x40000000), %bb.2(0x40000000)
   ; CHECK:   [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec
   ; CHECK:   [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]], [[COPY6]], implicit $exec
-  ; CHECK:   BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[V_READFIRSTLANE_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:   BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[V_READFIRSTLANE_B32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec
   ; CHECK:   $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
   ; CHECK:   S_CBRANCH_EXECNZ %bb.2, implicit $exec
@@ -148,7 +148,7 @@ define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__vgpr
   ; CHECK:   [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec
   ; CHECK:   [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec
   ; CHECK:   [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc
-  ; CHECK:   BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:   BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec
   ; CHECK:   $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
   ; CHECK:   S_CBRANCH_EXECNZ %bb.2, implicit $exec
@@ -173,7 +173,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-  ; CHECK:   BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:   BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   S_ENDPGM 0
   call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 1)
   ret void
@@ -191,7 +191,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-  ; CHECK:   BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 1, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:   BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 1, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   S_ENDPGM 0
   call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 2)
   ret void
@@ -209,7 +209,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-  ; CHECK:   BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, 1, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:   BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, 1, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   S_ENDPGM 0
   call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 3)
   ret void
@@ -227,7 +227,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-  ; CHECK:   BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 1, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:   BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 1, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   S_ENDPGM 0
   call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 4)
   ret void
@@ -245,7 +245,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-  ; CHECK:   BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 1, 0, 1, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:   BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 1, 0, 1, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   S_ENDPGM 0
   call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 6)
   ret void
@@ -263,7 +263,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-  ; CHECK:   BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, 0, 0, 1, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:   BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, 0, 0, 1, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   S_ENDPGM 0
   call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 5)
   ret void
@@ -281,7 +281,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-  ; CHECK:   BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, 1, 0, 1, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:   BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, 1, 0, 1, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   S_ENDPGM 0
   call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 7)
   ret void
@@ -301,7 +301,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr
   ; CHECK:   [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
-  ; CHECK:   BUFFER_STORE_DWORDX2_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:   BUFFER_STORE_DWORDX2_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   S_ENDPGM 0
   call void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
   ret void
@@ -322,7 +322,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr
   ; CHECK:   [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2
-  ; CHECK:   BUFFER_STORE_DWORDX3_OFFEN_exact [[REG_SEQUENCE1]], [[COPY7]], [[REG_SEQUENCE]], [[COPY8]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12 into custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:   BUFFER_STORE_DWORDX3_OFFEN_exact [[REG_SEQUENCE1]], [[COPY7]], [[REG_SEQUENCE]], [[COPY8]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12 into custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   S_ENDPGM 0
   call void @llvm.amdgcn.raw.buffer.store.v3f32(<3 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
   ret void
@@ -344,7 +344,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr
   ; CHECK:   [[COPY9:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3
-  ; CHECK:   BUFFER_STORE_DWORDX4_OFFEN_exact [[REG_SEQUENCE1]], [[COPY8]], [[REG_SEQUENCE]], [[COPY9]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:   BUFFER_STORE_DWORDX4_OFFEN_exact [[REG_SEQUENCE1]], [[COPY8]], [[REG_SEQUENCE]], [[COPY9]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   S_ENDPGM 0
   call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
   ret void
@@ -362,7 +362,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-  ; CHECK:   BUFFER_STORE_BYTE_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 1 into custom "BufferResource", addrspace 4)
+  ; CHECK:   BUFFER_STORE_BYTE_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 1 into custom "BufferResource", addrspace 4)
   ; CHECK:   S_ENDPGM 0
   %val.trunc = trunc i32 %val to i8
   call void @llvm.amdgcn.raw.buffer.store.i8(i8 %val.trunc, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
@@ -381,7 +381,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-  ; CHECK:   BUFFER_STORE_SHORT_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:   BUFFER_STORE_SHORT_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   S_ENDPGM 0
   %val.trunc = trunc i32 %val to i16
   call void @llvm.amdgcn.raw.buffer.store.i16(i16 %val.trunc, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
@@ -400,7 +400,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-  ; CHECK:   BUFFER_STORE_SHORT_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:   BUFFER_STORE_SHORT_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   S_ENDPGM 0
   call void @llvm.amdgcn.raw.buffer.store.f16(half %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
   ret void
@@ -418,7 +418,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-  ; CHECK:   BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:   BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   S_ENDPGM 0
   call void @llvm.amdgcn.raw.buffer.store.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
   ret void
@@ -438,7 +438,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr
   ; CHECK:   [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
-  ; CHECK:   BUFFER_STORE_DWORDX2_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:   BUFFER_STORE_DWORDX2_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   S_ENDPGM 0
   call void @llvm.amdgcn.raw.buffer.store.v4f16(<4 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
   ret void
@@ -474,7 +474,7 @@ define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr
   ; CHECK:   [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[COPY9]], implicit $exec
   ; CHECK:   [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc
   ; CHECK:   [[REG_SEQUENCE4:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
-  ; CHECK:   BUFFER_STORE_DWORDX2_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE4]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:   BUFFER_STORE_DWORDX2_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE4]], [[COPY7]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
   ; CHECK:   $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
   ; CHECK:   S_CBRANCH_EXECNZ %bb.2, implicit $exec
@@ -498,7 +498,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__sgpr_soffset_f32_v
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-  ; CHECK:   BUFFER_STORE_DWORD_OFFSET_exact [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource" + 4095, align 1, addrspace 4)
+  ; CHECK:   BUFFER_STORE_DWORD_OFFSET_exact [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource" + 4095, align 1, addrspace 4)
   ; CHECK:   S_ENDPGM 0
   call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 4095, i32 %soffset, i32 0)
   ret void
@@ -517,7 +517,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__sgpr_soffset_f32_v
   ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
   ; CHECK:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; CHECK:   BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY6]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource" + 4096, align 1, addrspace 4)
+  ; CHECK:   BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY6]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource" + 4096, align 1, addrspace 4)
   ; CHECK:   S_ENDPGM 0
   call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 4096, i32 %soffset, i32 0)
   ret void
@@ -535,7 +535,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-  ; CHECK:   BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource" + 16, align 1, addrspace 4)
+  ; CHECK:   BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 16, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource" + 16, align 1, addrspace 4)
   ; CHECK:   S_ENDPGM 0
   %voffset.add = add i32 %voffset, 16
   call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0)
@@ -554,7 +554,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-  ; CHECK:   BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource" + 4095, align 1, addrspace 4)
+  ; CHECK:   BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource" + 4095, align 1, addrspace 4)
   ; CHECK:   S_ENDPGM 0
   %voffset.add = add i32 %voffset, 4095
   call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0)
@@ -576,7 +576,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr
   ; CHECK:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
   ; CHECK:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
   ; CHECK:   %11:vgpr_32, dead %15:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY5]], [[COPY7]], 0, implicit $exec
-  ; CHECK:   BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], %11, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource" + 4096, align 1, addrspace 4)
+  ; CHECK:   BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], %11, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource" + 4096, align 1, addrspace 4)
   ; CHECK:   S_ENDPGM 0
   %voffset.add = add i32 %voffset, 4096
   call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0)
@@ -595,7 +595,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4095
-  ; CHECK:   BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:   BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   S_ENDPGM 0
   call void @llvm.amdgcn.raw.buffer.store.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 4095, i32 0)
   ret void
@@ -613,7 +613,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
-  ; CHECK:   BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:   BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   S_ENDPGM 0
   call void @llvm.amdgcn.raw.buffer.store.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 4096, i32 0)
   ret void
@@ -631,7 +631,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-  ; CHECK:   BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource" + 16, align 1, addrspace 4)
+  ; CHECK:   BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 16, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource" + 16, align 1, addrspace 4)
   ; CHECK:   S_ENDPGM 0
   %voffset.add = add i32 %voffset, 16
   call void @llvm.amdgcn.raw.buffer.store.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0)
@@ -650,7 +650,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-  ; CHECK:   BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource" + 4095, align 1, addrspace 4)
+  ; CHECK:   BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource" + 4095, align 1, addrspace 4)
   ; CHECK:   S_ENDPGM 0
   %voffset.add = add i32 %voffset, 4095
   call void @llvm.amdgcn.raw.buffer.store.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0)
@@ -672,7 +672,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr
   ; CHECK:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
   ; CHECK:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
   ; CHECK:   %11:vgpr_32, dead %15:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY5]], [[COPY7]], 0, implicit $exec
-  ; CHECK:   BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], %11, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource" + 4096, align 1, addrspace 4)
+  ; CHECK:   BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], %11, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource" + 4096, align 1, addrspace 4)
   ; CHECK:   S_ENDPGM 0
   %voffset.add = add i32 %voffset, 4096
   call void @llvm.amdgcn.raw.buffer.store.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0)
@@ -711,7 +711,7 @@ define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr
   ; CHECK:   [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY9]], implicit $exec
   ; CHECK:   [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc
   ; CHECK:   [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
-  ; CHECK:   BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], %13, [[REG_SEQUENCE3]], [[COPY6]], 904, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource" + 5000, align 1, addrspace 4)
+  ; CHECK:   BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], %13, [[REG_SEQUENCE3]], [[COPY6]], 904, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource" + 5000, align 1, addrspace 4)
   ; CHECK:   [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
   ; CHECK:   $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
   ; CHECK:   S_CBRANCH_EXECNZ %bb.2, implicit $exec
@@ -755,7 +755,7 @@ define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__5000_voffset__sgpr
   ; CHECK:   [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY8]], implicit $exec
   ; CHECK:   [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc
   ; CHECK:   [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
-  ; CHECK:   BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY6]], [[REG_SEQUENCE3]], [[COPY5]], 904, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource" + 5000, align 1, addrspace 4)
+  ; CHECK:   BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY6]], [[REG_SEQUENCE3]], [[COPY5]], 904, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource" + 5000, align 1, addrspace 4)
   ; CHECK:   [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
   ; CHECK:   $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
   ; CHECK:   S_CBRANCH_EXECNZ %bb.2, implicit $exec

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.load.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.load.f16.ll
index 135e14a8a219..e0effb3c9a16 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.load.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.load.f16.ll
@@ -13,7 +13,7 @@ define amdgpu_ps half @raw_tbuffer_load_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffs
   ; UNPACKED:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; UNPACKED:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; UNPACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-  ; UNPACKED:   [[TBUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource", align 1, addrspace 4)
+  ; UNPACKED:   [[TBUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource", align 1, addrspace 4)
   ; UNPACKED:   $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN]]
   ; UNPACKED:   SI_RETURN_TO_EPILOG implicit $vgpr0
   ; PACKED-LABEL: name: raw_tbuffer_load_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset
@@ -26,7 +26,7 @@ define amdgpu_ps half @raw_tbuffer_load_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffs
   ; PACKED:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; PACKED:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-  ; PACKED:   [[TBUFFER_LOAD_FORMAT_D16_X_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource", align 1, addrspace 4)
+  ; PACKED:   [[TBUFFER_LOAD_FORMAT_D16_X_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource", align 1, addrspace 4)
   ; PACKED:   $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_X_OFFEN]]
   ; PACKED:   SI_RETURN_TO_EPILOG implicit $vgpr0
   %val = call half @llvm.amdgcn.raw.tbuffer.load.f16(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0)
@@ -44,7 +44,7 @@ define amdgpu_ps <2 x half> @raw_tbuffer_load_v2f16__sgpr_rsrc__vgpr_voffset__sg
   ; UNPACKED:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; UNPACKED:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; UNPACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-  ; UNPACKED:   [[TBUFFER_LOAD_FORMAT_D16_XY_gfx80_OFFEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_D16_XY_gfx80_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
+  ; UNPACKED:   [[TBUFFER_LOAD_FORMAT_D16_XY_gfx80_OFFEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_D16_XY_gfx80_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
   ; UNPACKED:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XY_gfx80_OFFEN]].sub0
   ; UNPACKED:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XY_gfx80_OFFEN]].sub1
   ; UNPACKED:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
@@ -68,7 +68,7 @@ define amdgpu_ps <2 x half> @raw_tbuffer_load_v2f16__sgpr_rsrc__vgpr_voffset__sg
   ; PACKED:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; PACKED:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-  ; PACKED:   [[TBUFFER_LOAD_FORMAT_D16_XY_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_XY_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
+  ; PACKED:   [[TBUFFER_LOAD_FORMAT_D16_XY_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_XY_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
   ; PACKED:   $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_XY_OFFEN]]
   ; PACKED:   SI_RETURN_TO_EPILOG implicit $vgpr0
   %val = call <2 x half> @llvm.amdgcn.raw.tbuffer.load.v2f16(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0)
@@ -92,7 +92,7 @@ define amdgpu_ps <4 x half> @raw_tbuffer_load_v4f16__sgpr_rsrc__vgpr_voffset__sg
   ; UNPACKED:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; UNPACKED:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; UNPACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-  ; UNPACKED:   [[TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80_OFFEN:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8 from custom "BufferResource", align 1, addrspace 4)
+  ; UNPACKED:   [[TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80_OFFEN:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8 from custom "BufferResource", align 1, addrspace 4)
   ; UNPACKED:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80_OFFEN]].sub0
   ; UNPACKED:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80_OFFEN]].sub1
   ; UNPACKED:   [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80_OFFEN]].sub2
@@ -126,7 +126,7 @@ define amdgpu_ps <4 x half> @raw_tbuffer_load_v4f16__sgpr_rsrc__vgpr_voffset__sg
   ; PACKED:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; PACKED:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-  ; PACKED:   [[TBUFFER_LOAD_FORMAT_D16_XYZW_OFFEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_D16_XYZW_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8 from custom "BufferResource", align 1, addrspace 4)
+  ; PACKED:   [[TBUFFER_LOAD_FORMAT_D16_XYZW_OFFEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_D16_XYZW_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8 from custom "BufferResource", align 1, addrspace 4)
   ; PACKED:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XYZW_OFFEN]].sub0
   ; PACKED:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XYZW_OFFEN]].sub1
   ; PACKED:   $vgpr0 = COPY [[COPY6]]
@@ -167,7 +167,7 @@ define amdgpu_ps half @raw_tbuffer_load_f16__vgpr_rsrc__sgpr_voffset__vgpr_soffs
   ; UNPACKED:   [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
   ; UNPACKED:   [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY5]], implicit $exec
   ; UNPACKED:   [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc
-  ; UNPACKED:   [[TBUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN [[COPY6]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource", align 1, addrspace 4)
+  ; UNPACKED:   [[TBUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN [[COPY6]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource", align 1, addrspace 4)
   ; UNPACKED:   [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec
   ; UNPACKED:   $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
   ; UNPACKED:   S_CBRANCH_EXECNZ %bb.2, implicit $exec
@@ -207,7 +207,7 @@ define amdgpu_ps half @raw_tbuffer_load_f16__vgpr_rsrc__sgpr_voffset__vgpr_soffs
   ; PACKED:   [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
   ; PACKED:   [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY5]], implicit $exec
   ; PACKED:   [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc
-  ; PACKED:   [[TBUFFER_LOAD_FORMAT_D16_X_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_OFFEN [[COPY6]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource", align 1, addrspace 4)
+  ; PACKED:   [[TBUFFER_LOAD_FORMAT_D16_X_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_OFFEN [[COPY6]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource", align 1, addrspace 4)
   ; PACKED:   [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec
   ; PACKED:   $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
   ; PACKED:   S_CBRANCH_EXECNZ %bb.2, implicit $exec
@@ -232,7 +232,7 @@ define amdgpu_ps half @raw_tbuffer_load_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffs
   ; UNPACKED:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; UNPACKED:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; UNPACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-  ; UNPACKED:   [[TBUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 1, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource", align 1, addrspace 4)
+  ; UNPACKED:   [[TBUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 1, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource", align 1, addrspace 4)
   ; UNPACKED:   $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN]]
   ; UNPACKED:   SI_RETURN_TO_EPILOG implicit $vgpr0
   ; PACKED-LABEL: name: raw_tbuffer_load_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_glc
@@ -245,7 +245,7 @@ define amdgpu_ps half @raw_tbuffer_load_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffs
   ; PACKED:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; PACKED:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-  ; PACKED:   [[TBUFFER_LOAD_FORMAT_D16_X_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 1, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource", align 1, addrspace 4)
+  ; PACKED:   [[TBUFFER_LOAD_FORMAT_D16_X_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 1, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource", align 1, addrspace 4)
   ; PACKED:   $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_X_OFFEN]]
   ; PACKED:   SI_RETURN_TO_EPILOG implicit $vgpr0
   %val = call half @llvm.amdgcn.raw.tbuffer.load.f16(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 1)
@@ -263,7 +263,7 @@ define amdgpu_ps half @raw_tbuffer_load_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffs
   ; UNPACKED:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; UNPACKED:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; UNPACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-  ; UNPACKED:   [[TBUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 0, 1, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource", align 1, addrspace 4)
+  ; UNPACKED:   [[TBUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 0, 1, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource", align 1, addrspace 4)
   ; UNPACKED:   $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN]]
   ; UNPACKED:   SI_RETURN_TO_EPILOG implicit $vgpr0
   ; PACKED-LABEL: name: raw_tbuffer_load_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc
@@ -276,7 +276,7 @@ define amdgpu_ps half @raw_tbuffer_load_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffs
   ; PACKED:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; PACKED:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-  ; PACKED:   [[TBUFFER_LOAD_FORMAT_D16_X_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 0, 1, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource", align 1, addrspace 4)
+  ; PACKED:   [[TBUFFER_LOAD_FORMAT_D16_X_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 0, 1, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource", align 1, addrspace 4)
   ; PACKED:   $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_X_OFFEN]]
   ; PACKED:   SI_RETURN_TO_EPILOG implicit $vgpr0
   %val = call half @llvm.amdgcn.raw.tbuffer.load.f16(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 2)
@@ -294,7 +294,7 @@ define amdgpu_ps half @raw_tbuffer_load_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffs
   ; UNPACKED:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; UNPACKED:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; UNPACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-  ; UNPACKED:   [[TBUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 1, 1, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource", align 1, addrspace 4)
+  ; UNPACKED:   [[TBUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 1, 1, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource", align 1, addrspace 4)
   ; UNPACKED:   $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN]]
   ; UNPACKED:   SI_RETURN_TO_EPILOG implicit $vgpr0
   ; PACKED-LABEL: name: raw_tbuffer_load_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc_glc
@@ -307,7 +307,7 @@ define amdgpu_ps half @raw_tbuffer_load_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffs
   ; PACKED:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; PACKED:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-  ; PACKED:   [[TBUFFER_LOAD_FORMAT_D16_X_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 1, 1, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource", align 1, addrspace 4)
+  ; PACKED:   [[TBUFFER_LOAD_FORMAT_D16_X_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 1, 1, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource", align 1, addrspace 4)
   ; PACKED:   $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_X_OFFEN]]
   ; PACKED:   SI_RETURN_TO_EPILOG implicit $vgpr0
   %val = call half @llvm.amdgcn.raw.tbuffer.load.f16(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 3)
@@ -325,7 +325,7 @@ define amdgpu_ps half @raw_tbuffer_load_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffs
   ; UNPACKED:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; UNPACKED:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; UNPACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-  ; UNPACKED:   [[TBUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 0, 0, 0, 1, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource", align 1, addrspace 4)
+  ; UNPACKED:   [[TBUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 0, 0, 0, 1, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource", align 1, addrspace 4)
   ; UNPACKED:   $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN]]
   ; UNPACKED:   SI_RETURN_TO_EPILOG implicit $vgpr0
   ; PACKED-LABEL: name: raw_tbuffer_load_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_dlc
@@ -338,7 +338,7 @@ define amdgpu_ps half @raw_tbuffer_load_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffs
   ; PACKED:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; PACKED:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-  ; PACKED:   [[TBUFFER_LOAD_FORMAT_D16_X_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 0, 0, 0, 1, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource", align 1, addrspace 4)
+  ; PACKED:   [[TBUFFER_LOAD_FORMAT_D16_X_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 0, 0, 0, 1, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource", align 1, addrspace 4)
   ; PACKED:   $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_X_OFFEN]]
   ; PACKED:   SI_RETURN_TO_EPILOG implicit $vgpr0
   %val = call half @llvm.amdgcn.raw.tbuffer.load.f16(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 4)

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.load.ll
index 74575afd543f..9237be3f843e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.load.ll
@@ -12,7 +12,7 @@ define amdgpu_ps float @raw_tbuffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soff
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-  ; CHECK:   [[TBUFFER_LOAD_FORMAT_X_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:   [[TBUFFER_LOAD_FORMAT_X_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_X_OFFEN]]
   ; CHECK:   SI_RETURN_TO_EPILOG implicit $vgpr0
   %val = call float @llvm.amdgcn.raw.tbuffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0)
@@ -30,7 +30,7 @@ define amdgpu_ps <2 x float> @raw_tbuffer_load_v2f32__sgpr_rsrc__vgpr_voffset__s
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-  ; CHECK:   [[TBUFFER_LOAD_FORMAT_XY_OFFEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8 from custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:   [[TBUFFER_LOAD_FORMAT_XY_OFFEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8 from custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_OFFEN]].sub0
   ; CHECK:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_OFFEN]].sub1
   ; CHECK:   $vgpr0 = COPY [[COPY6]]
@@ -51,7 +51,7 @@ define amdgpu_ps <3 x float> @raw_tbuffer_load_v3f32__sgpr_rsrc__vgpr_voffset__s
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-  ; CHECK:   [[TBUFFER_LOAD_FORMAT_XYZ_OFFEN:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12 from custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:   [[TBUFFER_LOAD_FORMAT_XYZ_OFFEN:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12 from custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZ_OFFEN]].sub0
   ; CHECK:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZ_OFFEN]].sub1
   ; CHECK:   [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZ_OFFEN]].sub2
@@ -74,7 +74,7 @@ define amdgpu_ps <4 x float> @raw_tbuffer_load_v4f32__sgpr_rsrc__vgpr_voffset__s
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-  ; CHECK:   [[TBUFFER_LOAD_FORMAT_XYZW_OFFEN:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:   [[TBUFFER_LOAD_FORMAT_XYZW_OFFEN:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_OFFEN]].sub0
   ; CHECK:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_OFFEN]].sub1
   ; CHECK:   [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_OFFEN]].sub2
@@ -119,7 +119,7 @@ define amdgpu_ps float @raw_tbuffer_load_f32__vgpr_rsrc__sgpr_voffset__vgpr_soff
   ; CHECK:   [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
   ; CHECK:   [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY5]], implicit $exec
   ; CHECK:   [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U32_e64_]], [[S_AND_B32_]], implicit-def $scc
-  ; CHECK:   [[TBUFFER_LOAD_FORMAT_X_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFEN [[COPY6]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:   [[TBUFFER_LOAD_FORMAT_X_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFEN [[COPY6]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec
   ; CHECK:   $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
   ; CHECK:   S_CBRANCH_EXECNZ %bb.2, implicit $exec
@@ -144,7 +144,7 @@ define amdgpu_ps float @raw_tbuffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soff
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-  ; CHECK:   [[TBUFFER_LOAD_FORMAT_X_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 1, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:   [[TBUFFER_LOAD_FORMAT_X_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 1, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_X_OFFEN]]
   ; CHECK:   SI_RETURN_TO_EPILOG implicit $vgpr0
   %val = call float @llvm.amdgcn.raw.tbuffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 1)
@@ -162,7 +162,7 @@ define amdgpu_ps float @raw_tbuffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soff
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-  ; CHECK:   [[TBUFFER_LOAD_FORMAT_X_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 0, 1, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:   [[TBUFFER_LOAD_FORMAT_X_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 0, 1, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_X_OFFEN]]
   ; CHECK:   SI_RETURN_TO_EPILOG implicit $vgpr0
   %val = call float @llvm.amdgcn.raw.tbuffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 2)
@@ -180,7 +180,7 @@ define amdgpu_ps float @raw_tbuffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soff
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-  ; CHECK:   [[TBUFFER_LOAD_FORMAT_X_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 1, 1, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:   [[TBUFFER_LOAD_FORMAT_X_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 1, 1, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_X_OFFEN]]
   ; CHECK:   SI_RETURN_TO_EPILOG implicit $vgpr0
   %val = call float @llvm.amdgcn.raw.tbuffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 3)
@@ -198,7 +198,7 @@ define amdgpu_ps float @raw_tbuffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soff
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-  ; CHECK:   [[TBUFFER_LOAD_FORMAT_X_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 0, 0, 0, 1, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:   [[TBUFFER_LOAD_FORMAT_X_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 0, 0, 0, 1, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_X_OFFEN]]
   ; CHECK:   SI_RETURN_TO_EPILOG implicit $vgpr0
   %val = call float @llvm.amdgcn.raw.tbuffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 4)

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.f16.ll
index b20ab8bfc4d8..1487ec3e0b28 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.f16.ll
@@ -14,7 +14,7 @@ define amdgpu_ps void @raw_tbuffer_store_f16__sgpr_rsrc__vgpr_voffset__sgpr_soff
   ; UNPACKED:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; UNPACKED:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; UNPACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
-  ; UNPACKED:   TBUFFER_STORE_FORMAT_D16_X_gfx80_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "BufferResource", align 1, addrspace 4)
+  ; UNPACKED:   TBUFFER_STORE_FORMAT_D16_X_gfx80_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "BufferResource", align 1, addrspace 4)
   ; UNPACKED:   S_ENDPGM 0
   ; PACKED-LABEL: name: raw_tbuffer_store_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset
   ; PACKED: bb.1 (%ir-block.0):
@@ -27,7 +27,7 @@ define amdgpu_ps void @raw_tbuffer_store_f16__sgpr_rsrc__vgpr_voffset__sgpr_soff
   ; PACKED:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; PACKED:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
-  ; PACKED:   TBUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "BufferResource", align 1, addrspace 4)
+  ; PACKED:   TBUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "BufferResource", align 1, addrspace 4)
   ; PACKED:   S_ENDPGM 0
   call void @llvm.amdgcn.raw.tbuffer.store.f16(half %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0)
   ret void
@@ -49,7 +49,7 @@ define amdgpu_ps void @raw_tbuffer_store_v2f16__sgpr_rsrc__vgpr_voffset__sgpr_so
   ; UNPACKED:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
   ; UNPACKED:   [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY7]], [[COPY]], implicit $exec
   ; UNPACKED:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1
-  ; UNPACKED:   TBUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4)
+  ; UNPACKED:   TBUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4)
   ; UNPACKED:   S_ENDPGM 0
   ; PACKED-LABEL: name: raw_tbuffer_store_v2f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset
   ; PACKED: bb.1 (%ir-block.0):
@@ -62,7 +62,7 @@ define amdgpu_ps void @raw_tbuffer_store_v2f16__sgpr_rsrc__vgpr_voffset__sgpr_so
   ; PACKED:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; PACKED:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
-  ; PACKED:   TBUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4)
+  ; PACKED:   TBUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4)
   ; PACKED:   S_ENDPGM 0
   call void @llvm.amdgcn.raw.tbuffer.store.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0)
   ret void
@@ -93,7 +93,7 @@ define amdgpu_ps void @raw_tbuffer_store_v4f16__sgpr_rsrc__vgpr_voffset__sgpr_so
   ; UNPACKED:   [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
   ; UNPACKED:   [[V_LSHRREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY9]], [[COPY1]], implicit $exec
   ; UNPACKED:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1, [[COPY1]], %subreg.sub2, [[V_LSHRREV_B32_e64_1]], %subreg.sub3
-  ; UNPACKED:   TBUFFER_STORE_FORMAT_D16_XYZW_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "BufferResource", align 1, addrspace 4)
+  ; UNPACKED:   TBUFFER_STORE_FORMAT_D16_XYZW_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 78, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "BufferResource", align 1, addrspace 4)
   ; UNPACKED:   S_ENDPGM 0
   ; PACKED-LABEL: name: raw_tbuffer_store_v4f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset
   ; PACKED: bb.1 (%ir-block.0):
@@ -108,7 +108,7 @@ define amdgpu_ps void @raw_tbuffer_store_v4f16__sgpr_rsrc__vgpr_voffset__sgpr_so
   ; PACKED:   [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
   ; PACKED:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
-  ; PACKED:   TBUFFER_STORE_FORMAT_D16_XYZW_OFFEN_exact [[REG_SEQUENCE]], [[COPY6]], [[REG_SEQUENCE1]], [[COPY7]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "BufferResource", align 1, addrspace 4)
+  ; PACKED:   TBUFFER_STORE_FORMAT_D16_XYZW_OFFEN_exact [[REG_SEQUENCE]], [[COPY6]], [[REG_SEQUENCE1]], [[COPY7]], 0, 78, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "BufferResource", align 1, addrspace 4)
   ; PACKED:   S_ENDPGM 0
   call void @llvm.amdgcn.raw.tbuffer.store.v4f16(<4 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0)
   ret void
@@ -143,7 +143,7 @@ define amdgpu_ps void @raw_tbuffer_store_f16__vgpr_rsrc__vgpr_voffset__sgpr_soff
   ; UNPACKED:   [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY8]], implicit $exec
   ; UNPACKED:   [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc
   ; UNPACKED:   [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
-  ; UNPACKED:   TBUFFER_STORE_FORMAT_D16_X_gfx80_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE3]], [[COPY6]], 0, 94, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "BufferResource", align 1, addrspace 4)
+  ; UNPACKED:   TBUFFER_STORE_FORMAT_D16_X_gfx80_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE3]], [[COPY6]], 0, 94, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "BufferResource", align 1, addrspace 4)
   ; UNPACKED:   [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
   ; UNPACKED:   $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
   ; UNPACKED:   S_CBRANCH_EXECNZ %bb.2, implicit $exec
@@ -179,7 +179,7 @@ define amdgpu_ps void @raw_tbuffer_store_f16__vgpr_rsrc__vgpr_voffset__sgpr_soff
   ; PACKED:   [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY8]], implicit $exec
   ; PACKED:   [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc
   ; PACKED:   [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
-  ; PACKED:   TBUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE3]], [[COPY6]], 0, 94, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "BufferResource", align 1, addrspace 4)
+  ; PACKED:   TBUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE3]], [[COPY6]], 0, 94, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "BufferResource", align 1, addrspace 4)
   ; PACKED:   [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
   ; PACKED:   $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
   ; PACKED:   S_CBRANCH_EXECNZ %bb.2, implicit $exec
@@ -224,7 +224,7 @@ define amdgpu_ps void @raw_tbuffer_store_f16__vgpr_rsrc__vgpr_voffset__vgpr_soff
   ; UNPACKED:   [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec
   ; UNPACKED:   [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec
   ; UNPACKED:   [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc
-  ; UNPACKED:   TBUFFER_STORE_FORMAT_D16_X_gfx80_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "BufferResource", align 1, addrspace 4)
+  ; UNPACKED:   TBUFFER_STORE_FORMAT_D16_X_gfx80_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "BufferResource", align 1, addrspace 4)
   ; UNPACKED:   [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec
   ; UNPACKED:   $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
   ; UNPACKED:   S_CBRANCH_EXECNZ %bb.2, implicit $exec
@@ -263,7 +263,7 @@ define amdgpu_ps void @raw_tbuffer_store_f16__vgpr_rsrc__vgpr_voffset__vgpr_soff
   ; PACKED:   [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec
   ; PACKED:   [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec
   ; PACKED:   [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc
-  ; PACKED:   TBUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "BufferResource", align 1, addrspace 4)
+  ; PACKED:   TBUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "BufferResource", align 1, addrspace 4)
   ; PACKED:   [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec
   ; PACKED:   $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
   ; PACKED:   S_CBRANCH_EXECNZ %bb.2, implicit $exec
@@ -309,7 +309,7 @@ define amdgpu_ps void @raw_tbuffer_store_f16__vgpr_rsrc__sgpr_voffset__vgpr_soff
   ; UNPACKED:   [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec
   ; UNPACKED:   [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec
   ; UNPACKED:   [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc
-  ; UNPACKED:   TBUFFER_STORE_FORMAT_D16_X_gfx80_OFFEN_exact [[COPY]], [[COPY7]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "BufferResource", align 1, addrspace 4)
+  ; UNPACKED:   TBUFFER_STORE_FORMAT_D16_X_gfx80_OFFEN_exact [[COPY]], [[COPY7]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "BufferResource", align 1, addrspace 4)
   ; UNPACKED:   [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec
   ; UNPACKED:   $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
   ; UNPACKED:   S_CBRANCH_EXECNZ %bb.2, implicit $exec
@@ -349,7 +349,7 @@ define amdgpu_ps void @raw_tbuffer_store_f16__vgpr_rsrc__sgpr_voffset__vgpr_soff
   ; PACKED:   [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec
   ; PACKED:   [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec
   ; PACKED:   [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc
-  ; PACKED:   TBUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY]], [[COPY7]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "BufferResource", align 1, addrspace 4)
+  ; PACKED:   TBUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY]], [[COPY7]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "BufferResource", align 1, addrspace 4)
   ; PACKED:   [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec
   ; PACKED:   $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
   ; PACKED:   S_CBRANCH_EXECNZ %bb.2, implicit $exec
@@ -374,7 +374,7 @@ define amdgpu_ps void @raw_tbuffer_store_f16__sgpr_rsrc__vgpr_voffset__sgpr_soff
   ; UNPACKED:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; UNPACKED:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; UNPACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
-  ; UNPACKED:   TBUFFER_STORE_FORMAT_D16_X_gfx80_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 1, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "BufferResource", align 1, addrspace 4)
+  ; UNPACKED:   TBUFFER_STORE_FORMAT_D16_X_gfx80_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 1, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "BufferResource", align 1, addrspace 4)
   ; UNPACKED:   S_ENDPGM 0
   ; PACKED-LABEL: name: raw_tbuffer_store_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_glc
   ; PACKED: bb.1 (%ir-block.0):
@@ -387,7 +387,7 @@ define amdgpu_ps void @raw_tbuffer_store_f16__sgpr_rsrc__vgpr_voffset__sgpr_soff
   ; PACKED:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; PACKED:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
-  ; PACKED:   TBUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 1, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "BufferResource", align 1, addrspace 4)
+  ; PACKED:   TBUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 1, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "BufferResource", align 1, addrspace 4)
   ; PACKED:   S_ENDPGM 0
   call void @llvm.amdgcn.raw.tbuffer.store.f16(half %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 1)
   ret void
@@ -405,7 +405,7 @@ define amdgpu_ps void @raw_tbuffer_store_f16__sgpr_rsrc__vgpr_voffset__sgpr_soff
   ; UNPACKED:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; UNPACKED:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; UNPACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
-  ; UNPACKED:   TBUFFER_STORE_FORMAT_D16_X_gfx80_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 1, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "BufferResource", align 1, addrspace 4)
+  ; UNPACKED:   TBUFFER_STORE_FORMAT_D16_X_gfx80_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 1, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "BufferResource", align 1, addrspace 4)
   ; UNPACKED:   S_ENDPGM 0
   ; PACKED-LABEL: name: raw_tbuffer_store_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc
   ; PACKED: bb.1 (%ir-block.0):
@@ -418,7 +418,7 @@ define amdgpu_ps void @raw_tbuffer_store_f16__sgpr_rsrc__vgpr_voffset__sgpr_soff
   ; PACKED:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; PACKED:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
-  ; PACKED:   TBUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 1, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "BufferResource", align 1, addrspace 4)
+  ; PACKED:   TBUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 1, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "BufferResource", align 1, addrspace 4)
   ; PACKED:   S_ENDPGM 0
   call void @llvm.amdgcn.raw.tbuffer.store.f16(half %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 2)
   ret void
@@ -436,7 +436,7 @@ define amdgpu_ps void @raw_tbuffer_store_f16__sgpr_rsrc__vgpr_voffset__sgpr_soff
   ; UNPACKED:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; UNPACKED:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; UNPACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
-  ; UNPACKED:   TBUFFER_STORE_FORMAT_D16_X_gfx80_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 1, 1, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "BufferResource", align 1, addrspace 4)
+  ; UNPACKED:   TBUFFER_STORE_FORMAT_D16_X_gfx80_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 1, 1, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "BufferResource", align 1, addrspace 4)
   ; UNPACKED:   S_ENDPGM 0
   ; PACKED-LABEL: name: raw_tbuffer_store_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc_glc
   ; PACKED: bb.1 (%ir-block.0):
@@ -449,7 +449,7 @@ define amdgpu_ps void @raw_tbuffer_store_f16__sgpr_rsrc__vgpr_voffset__sgpr_soff
   ; PACKED:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; PACKED:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
-  ; PACKED:   TBUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 1, 1, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "BufferResource", align 1, addrspace 4)
+  ; PACKED:   TBUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 1, 1, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "BufferResource", align 1, addrspace 4)
   ; PACKED:   S_ENDPGM 0
   call void @llvm.amdgcn.raw.tbuffer.store.f16(half %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 3)
   ret void
@@ -467,7 +467,7 @@ define amdgpu_ps void @raw_tbuffer_store_f16__sgpr_rsrc__vgpr_voffset__sgpr_soff
   ; UNPACKED:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; UNPACKED:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; UNPACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
-  ; UNPACKED:   TBUFFER_STORE_FORMAT_D16_X_gfx80_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, 0, 1, 0, implicit $exec :: (dereferenceable store 2 into custom "BufferResource", align 1, addrspace 4)
+  ; UNPACKED:   TBUFFER_STORE_FORMAT_D16_X_gfx80_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, 0, 1, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "BufferResource", align 1, addrspace 4)
   ; UNPACKED:   S_ENDPGM 0
   ; PACKED-LABEL: name: raw_tbuffer_store_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_dlc
   ; PACKED: bb.1 (%ir-block.0):
@@ -480,7 +480,7 @@ define amdgpu_ps void @raw_tbuffer_store_f16__sgpr_rsrc__vgpr_voffset__sgpr_soff
   ; PACKED:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; PACKED:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
-  ; PACKED:   TBUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, 0, 1, 0, implicit $exec :: (dereferenceable store 2 into custom "BufferResource", align 1, addrspace 4)
+  ; PACKED:   TBUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, 0, 1, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "BufferResource", align 1, addrspace 4)
   ; PACKED:   S_ENDPGM 0
   call void @llvm.amdgcn.raw.tbuffer.store.f16(half %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 4)
   ret void

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.i8.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.i8.ll
index b499a190acf0..ab83fb3387a5 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.i8.ll
@@ -14,7 +14,7 @@ define amdgpu_ps void @raw_tbuffer_store_i8__sgpr_rsrc__vgpr_voffset__sgpr_soffs
   ; UNPACKED:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; UNPACKED:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; UNPACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
-  ; UNPACKED:   TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 94, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 1 into custom "BufferResource", addrspace 4)
+  ; UNPACKED:   TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 94, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 1 into custom "BufferResource", addrspace 4)
   ; UNPACKED:   S_ENDPGM 0
   ; PACKED-LABEL: name: raw_tbuffer_store_i8__sgpr_rsrc__vgpr_voffset__sgpr_soffset
   ; PACKED: bb.1 (%ir-block.0):
@@ -27,7 +27,7 @@ define amdgpu_ps void @raw_tbuffer_store_i8__sgpr_rsrc__vgpr_voffset__sgpr_soffs
   ; PACKED:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; PACKED:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
-  ; PACKED:   TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 94, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 1 into custom "BufferResource", addrspace 4)
+  ; PACKED:   TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 94, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 1 into custom "BufferResource", addrspace 4)
   ; PACKED:   S_ENDPGM 0
   call void @llvm.amdgcn.raw.tbuffer.store.i8(i8 %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 94, i32 0)
   ret void
@@ -62,7 +62,7 @@ define amdgpu_ps void @raw_tbuffer_store_i8__vgpr_rsrc__vgpr_voffset__sgpr_soffs
   ; UNPACKED:   [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY8]], implicit $exec
   ; UNPACKED:   [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc
   ; UNPACKED:   [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
-  ; UNPACKED:   TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE3]], [[COPY6]], 0, 94, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 1 into custom "BufferResource", addrspace 4)
+  ; UNPACKED:   TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE3]], [[COPY6]], 0, 94, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 1 into custom "BufferResource", addrspace 4)
   ; UNPACKED:   [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
   ; UNPACKED:   $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
   ; UNPACKED:   S_CBRANCH_EXECNZ %bb.2, implicit $exec
@@ -98,7 +98,7 @@ define amdgpu_ps void @raw_tbuffer_store_i8__vgpr_rsrc__vgpr_voffset__sgpr_soffs
   ; PACKED:   [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY8]], implicit $exec
   ; PACKED:   [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc
   ; PACKED:   [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
-  ; PACKED:   TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE3]], [[COPY6]], 0, 94, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 1 into custom "BufferResource", addrspace 4)
+  ; PACKED:   TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE3]], [[COPY6]], 0, 94, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 1 into custom "BufferResource", addrspace 4)
   ; PACKED:   [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
   ; PACKED:   $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
   ; PACKED:   S_CBRANCH_EXECNZ %bb.2, implicit $exec
@@ -143,7 +143,7 @@ define amdgpu_ps void @raw_tbuffer_store_i8__vgpr_rsrc__vgpr_voffset__vgpr_soffs
   ; UNPACKED:   [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec
   ; UNPACKED:   [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec
   ; UNPACKED:   [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc
-  ; UNPACKED:   TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 94, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 1 into custom "BufferResource", addrspace 4)
+  ; UNPACKED:   TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 94, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 1 into custom "BufferResource", addrspace 4)
   ; UNPACKED:   [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec
   ; UNPACKED:   $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
   ; UNPACKED:   S_CBRANCH_EXECNZ %bb.2, implicit $exec
@@ -182,7 +182,7 @@ define amdgpu_ps void @raw_tbuffer_store_i8__vgpr_rsrc__vgpr_voffset__vgpr_soffs
   ; PACKED:   [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec
   ; PACKED:   [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec
   ; PACKED:   [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc
-  ; PACKED:   TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 94, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 1 into custom "BufferResource", addrspace 4)
+  ; PACKED:   TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 94, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 1 into custom "BufferResource", addrspace 4)
   ; PACKED:   [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec
   ; PACKED:   $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
   ; PACKED:   S_CBRANCH_EXECNZ %bb.2, implicit $exec
@@ -228,7 +228,7 @@ define amdgpu_ps void @raw_tbuffer_store_i8__vgpr_rsrc__sgpr_voffset__vgpr_soffs
   ; UNPACKED:   [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec
   ; UNPACKED:   [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec
   ; UNPACKED:   [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc
-  ; UNPACKED:   TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY7]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 94, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 1 into custom "BufferResource", addrspace 4)
+  ; UNPACKED:   TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY7]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 94, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 1 into custom "BufferResource", addrspace 4)
   ; UNPACKED:   [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec
   ; UNPACKED:   $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
   ; UNPACKED:   S_CBRANCH_EXECNZ %bb.2, implicit $exec
@@ -268,7 +268,7 @@ define amdgpu_ps void @raw_tbuffer_store_i8__vgpr_rsrc__sgpr_voffset__vgpr_soffs
   ; PACKED:   [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec
   ; PACKED:   [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec
   ; PACKED:   [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc
-  ; PACKED:   TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY7]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 94, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 1 into custom "BufferResource", addrspace 4)
+  ; PACKED:   TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY7]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 94, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 1 into custom "BufferResource", addrspace 4)
   ; PACKED:   [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec
   ; PACKED:   $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
   ; PACKED:   S_CBRANCH_EXECNZ %bb.2, implicit $exec

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.ll
index 8e31ac0eb24a..d0cf50a9062c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.ll
@@ -14,7 +14,7 @@ define amdgpu_ps void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soff
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
-  ; CHECK:   TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:   TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   S_ENDPGM 0
   call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0)
   ret void
@@ -35,7 +35,7 @@ define amdgpu_ps void @raw_tbuffer_store_v2f32__sgpr_rsrc__vgpr_voffset__sgpr_so
   ; CHECK:   [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
   ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
-  ; CHECK:   TBUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE]], [[COPY6]], [[REG_SEQUENCE1]], [[COPY7]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:   TBUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE]], [[COPY6]], [[REG_SEQUENCE1]], [[COPY7]], 0, 78, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   S_ENDPGM 0
   call void @llvm.amdgcn.raw.tbuffer.store.v2f32(<2 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0)
   ret void
@@ -57,7 +57,7 @@ define amdgpu_ps void @raw_tbuffer_store_v3f32__sgpr_rsrc__vgpr_voffset__sgpr_so
   ; CHECK:   [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2
   ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY6]], %subreg.sub3
-  ; CHECK:   TBUFFER_STORE_FORMAT_XYZ_OFFEN_exact [[REG_SEQUENCE]], [[COPY7]], [[REG_SEQUENCE1]], [[COPY8]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12 into custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:   TBUFFER_STORE_FORMAT_XYZ_OFFEN_exact [[REG_SEQUENCE]], [[COPY7]], [[REG_SEQUENCE1]], [[COPY8]], 0, 78, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12 into custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   S_ENDPGM 0
   call void @llvm.amdgcn.raw.tbuffer.store.v3f32(<3 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0)
   ret void
@@ -80,7 +80,7 @@ define amdgpu_ps void @raw_tbuffer_store_v4f32__sgpr_rsrc__vgpr_voffset__sgpr_so
   ; CHECK:   [[COPY9:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3
-  ; CHECK:   TBUFFER_STORE_FORMAT_XYZW_OFFEN_exact [[REG_SEQUENCE]], [[COPY8]], [[REG_SEQUENCE1]], [[COPY9]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:   TBUFFER_STORE_FORMAT_XYZW_OFFEN_exact [[REG_SEQUENCE]], [[COPY8]], [[REG_SEQUENCE1]], [[COPY9]], 0, 78, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   S_ENDPGM 0
   call void @llvm.amdgcn.raw.tbuffer.store.v4f32(<4 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0)
   ret void
@@ -100,7 +100,7 @@ define amdgpu_ps void @raw_tbuffer_store_f32__sgpr_rsrc__sgpr_voffset__sgpr_soff
   ; CHECK:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr7
   ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; CHECK:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
-  ; CHECK:   TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY7]], [[REG_SEQUENCE]], [[COPY6]], 0, 94, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:   TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY7]], [[REG_SEQUENCE]], [[COPY6]], 0, 94, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   S_ENDPGM 0
   call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 94, i32 0)
   ret void
@@ -135,7 +135,7 @@ define amdgpu_ps void @raw_tbuffer_store_f32__vgpr_rsrc__vgpr_voffset__sgpr_soff
   ; CHECK:   [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY8]], implicit $exec
   ; CHECK:   [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc
   ; CHECK:   [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
-  ; CHECK:   TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE3]], [[COPY6]], 0, 94, 1, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:   TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE3]], [[COPY6]], 0, 94, 1, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_]], implicit-def $exec, implicit-def $scc, implicit $exec
   ; CHECK:   $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
   ; CHECK:   S_CBRANCH_EXECNZ %bb.2, implicit $exec
@@ -180,7 +180,7 @@ define amdgpu_ps void @raw_tbuffer_store_f32__vgpr_rsrc__vgpr_voffset__vgpr_soff
   ; CHECK:   [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec
   ; CHECK:   [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec
   ; CHECK:   [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U32_e64_]], [[S_AND_B32_]], implicit-def $scc
-  ; CHECK:   TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 94, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:   TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 94, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec
   ; CHECK:   $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
   ; CHECK:   S_CBRANCH_EXECNZ %bb.2, implicit $exec
@@ -226,7 +226,7 @@ define amdgpu_ps void @raw_tbuffer_store_f32__vgpr_rsrc__sgpr_voffset__vgpr_soff
   ; CHECK:   [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec
   ; CHECK:   [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec
   ; CHECK:   [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U32_e64_]], [[S_AND_B32_]], implicit-def $scc
-  ; CHECK:   TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY7]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:   TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY7]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec
   ; CHECK:   $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
   ; CHECK:   S_CBRANCH_EXECNZ %bb.2, implicit $exec
@@ -252,7 +252,7 @@ define amdgpu_ps void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soff
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
-  ; CHECK:   TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 1, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:   TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 1, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   S_ENDPGM 0
   call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 1)
   ret void
@@ -271,7 +271,7 @@ define amdgpu_ps void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soff
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
-  ; CHECK:   TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 1, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:   TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 1, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   S_ENDPGM 0
   call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 2)
   ret void
@@ -290,7 +290,7 @@ define amdgpu_ps void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soff
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
-  ; CHECK:   TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 1, 1, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:   TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 1, 1, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   S_ENDPGM 0
   call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 3)
   ret void
@@ -309,7 +309,7 @@ define amdgpu_ps void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soff
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
-  ; CHECK:   TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, 0, 1, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:   TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, 0, 1, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   S_ENDPGM 0
   call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 4)
   ret void
@@ -328,7 +328,7 @@ define amdgpu_ps void @raw_tbuffer_store_f32__sgpr_rsrc__vdpr_voffset__sgpr_soff
   ; CHECK:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
   ; CHECK:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
-  ; CHECK:   TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], [[COPY5]], 0, 94, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:   TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], [[COPY5]], 0, 94, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   S_ENDPGM 0
   call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> %rsrc, i32 0, i32 %soffset, i32 94, i32 0)
   ret void
@@ -345,7 +345,7 @@ define amdgpu_ps void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soff
   ; CHECK:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
   ; CHECK:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
-  ; CHECK:   TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], [[COPY5]], 4095, 94, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource" + 4095, align 1, addrspace 4)
+  ; CHECK:   TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], [[COPY5]], 4095, 94, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource" + 4095, align 1, addrspace 4)
   ; CHECK:   S_ENDPGM 0
   call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> %rsrc, i32 4095, i32 %soffset, i32 94, i32 0)
   ret void
@@ -364,7 +364,7 @@ define amdgpu_ps void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soff
   ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; CHECK:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
   ; CHECK:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; CHECK:   TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY6]], [[REG_SEQUENCE]], [[COPY5]], 0, 94, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource" + 4096, align 1, addrspace 4)
+  ; CHECK:   TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY6]], [[REG_SEQUENCE]], [[COPY5]], 0, 94, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource" + 4096, align 1, addrspace 4)
   ; CHECK:   S_ENDPGM 0
   call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> %rsrc, i32 4096, i32 %soffset, i32 94, i32 0)
   ret void
@@ -382,7 +382,7 @@ define amdgpu_ps void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soff
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
-  ; CHECK:   TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 16, 94, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource" + 16, align 1, addrspace 4)
+  ; CHECK:   TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 16, 94, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource" + 16, align 1, addrspace 4)
   ; CHECK:   S_ENDPGM 0
   %voffset = add i32 %voffset.base, 16
   call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 94, i32 0)
@@ -401,7 +401,7 @@ define amdgpu_ps void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soff
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
-  ; CHECK:   TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 94, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource" + 4095, align 1, addrspace 4)
+  ; CHECK:   TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 94, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource" + 4095, align 1, addrspace 4)
   ; CHECK:   S_ENDPGM 0
   %voffset = add i32 %voffset.base, 4095
   call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 94, i32 0)
@@ -423,7 +423,7 @@ define amdgpu_ps void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soff
   ; CHECK:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
   ; CHECK:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
   ; CHECK:   [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY5]], [[COPY7]], 0, implicit $exec
-  ; CHECK:   TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[V_ADD_U32_e64_]], [[REG_SEQUENCE]], [[COPY6]], 0, 94, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource" + 4096, align 1, addrspace 4)
+  ; CHECK:   TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[V_ADD_U32_e64_]], [[REG_SEQUENCE]], [[COPY6]], 0, 94, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource" + 4096, align 1, addrspace 4)
   ; CHECK:   S_ENDPGM 0
   %voffset = add i32 %voffset.base, 4096
   call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 94, i32 0)
@@ -442,7 +442,7 @@ define amdgpu_ps void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soff
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; CHECK:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4095
-  ; CHECK:   TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 94, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:   TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 94, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   S_ENDPGM 0
   call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 4095, i32 94, i32 0)
   ret void
@@ -460,7 +460,7 @@ define amdgpu_ps void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soff
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; CHECK:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
-  ; CHECK:   TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 94, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:   TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 94, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   S_ENDPGM 0
   call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 4096, i32 94, i32 0)
   ret void
@@ -480,7 +480,7 @@ define amdgpu_ps void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soff
   ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; CHECK:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 16
   ; CHECK:   [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY6]], [[S_MOV_B32_]], implicit-def $scc
-  ; CHECK:   TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[S_ADD_I32_]], 0, 94, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:   TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[S_ADD_I32_]], 0, 94, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   S_ENDPGM 0
   %soffset = add i32 %soffset.base, 16
   call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 94, i32 0)
@@ -501,7 +501,7 @@ define amdgpu_ps void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soff
   ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; CHECK:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4095
   ; CHECK:   [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY6]], [[S_MOV_B32_]], implicit-def $scc
-  ; CHECK:   TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[S_ADD_I32_]], 0, 94, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:   TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[S_ADD_I32_]], 0, 94, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   S_ENDPGM 0
   %soffset = add i32 %soffset.base, 4095
   call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 94, i32 0)
@@ -522,7 +522,7 @@ define amdgpu_ps void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soff
   ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; CHECK:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
   ; CHECK:   [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY6]], [[S_MOV_B32_]], implicit-def $scc
-  ; CHECK:   TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[S_ADD_I32_]], 0, 94, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:   TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[S_ADD_I32_]], 0, 94, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   S_ENDPGM 0
   %soffset = add i32 %soffset.base, 4096
   call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 94, i32 0)
@@ -560,7 +560,7 @@ define amdgpu_ps void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soff
   ; CHECK:   [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY8]], implicit $exec
   ; CHECK:   [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc
   ; CHECK:   [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
-  ; CHECK:   TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE3]], [[S_ADD_I32_]], 0, 94, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:   TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE3]], [[S_ADD_I32_]], 0, 94, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_]], implicit-def $exec, implicit-def $scc, implicit $exec
   ; CHECK:   $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
   ; CHECK:   S_CBRANCH_EXECNZ %bb.2, implicit $exec
@@ -606,7 +606,7 @@ define amdgpu_ps void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soff
   ; CHECK:   [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY9]], implicit $exec
   ; CHECK:   [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc
   ; CHECK:   [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
-  ; CHECK:   TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[V_ADD_U32_e64_]], [[REG_SEQUENCE3]], [[COPY6]], 904, 94, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource" + 5000, align 1, addrspace 4)
+  ; CHECK:   TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[V_ADD_U32_e64_]], [[REG_SEQUENCE3]], [[COPY6]], 904, 94, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource" + 5000, align 1, addrspace 4)
   ; CHECK:   [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_]], implicit-def $exec, implicit-def $scc, implicit $exec
   ; CHECK:   $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
   ; CHECK:   S_CBRANCH_EXECNZ %bb.2, implicit $exec

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.s.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.s.buffer.load.ll
index 43d796883233..ea73e6255245 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.s.buffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.s.buffer.load.ll
@@ -1491,7 +1491,7 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_offset(<4 x i32> inreg %rsrc, i32
   ; GFX6:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX6:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; GFX6:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
-  ; GFX6:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4)
+  ; GFX6:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4)
   ; GFX6:   $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
   ; GFX6:   SI_RETURN_TO_EPILOG implicit $vgpr0
   ; GFX7-LABEL: name: s_buffer_load_f32_vgpr_offset
@@ -1504,7 +1504,7 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_offset(<4 x i32> inreg %rsrc, i32
   ; GFX7:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX7:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; GFX7:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
-  ; GFX7:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4)
+  ; GFX7:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4)
   ; GFX7:   $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
   ; GFX7:   SI_RETURN_TO_EPILOG implicit $vgpr0
   ; GFX8-LABEL: name: s_buffer_load_f32_vgpr_offset
@@ -1517,7 +1517,7 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_offset(<4 x i32> inreg %rsrc, i32
   ; GFX8:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX8:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; GFX8:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
-  ; GFX8:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4)
+  ; GFX8:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4)
   ; GFX8:   $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
   ; GFX8:   SI_RETURN_TO_EPILOG implicit $vgpr0
   %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %soffset, i32 0)
@@ -1535,7 +1535,7 @@ define amdgpu_ps <2 x float> @s_buffer_load_v2f32_vgpr_offset(<4 x i32> inreg %r
   ; GFX6:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX6:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; GFX6:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
-  ; GFX6:   [[BUFFER_LOAD_DWORDX2_OFFEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 8, align 4)
+  ; GFX6:   [[BUFFER_LOAD_DWORDX2_OFFEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 8, align 4)
   ; GFX6:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_OFFEN]].sub0
   ; GFX6:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_OFFEN]].sub1
   ; GFX6:   $vgpr0 = COPY [[COPY5]]
@@ -1551,7 +1551,7 @@ define amdgpu_ps <2 x float> @s_buffer_load_v2f32_vgpr_offset(<4 x i32> inreg %r
   ; GFX7:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX7:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; GFX7:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
-  ; GFX7:   [[BUFFER_LOAD_DWORDX2_OFFEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 8, align 4)
+  ; GFX7:   [[BUFFER_LOAD_DWORDX2_OFFEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 8, align 4)
   ; GFX7:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_OFFEN]].sub0
   ; GFX7:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_OFFEN]].sub1
   ; GFX7:   $vgpr0 = COPY [[COPY5]]
@@ -1567,7 +1567,7 @@ define amdgpu_ps <2 x float> @s_buffer_load_v2f32_vgpr_offset(<4 x i32> inreg %r
   ; GFX8:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX8:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; GFX8:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
-  ; GFX8:   [[BUFFER_LOAD_DWORDX2_OFFEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 8, align 4)
+  ; GFX8:   [[BUFFER_LOAD_DWORDX2_OFFEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 8, align 4)
   ; GFX8:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_OFFEN]].sub0
   ; GFX8:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_OFFEN]].sub1
   ; GFX8:   $vgpr0 = COPY [[COPY5]]
@@ -1588,7 +1588,7 @@ define amdgpu_ps <3 x float> @s_buffer_load_v3f32_vgpr_offset(<4 x i32> inreg %r
   ; GFX6:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX6:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; GFX6:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
-  ; GFX6:   [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
+  ; GFX6:   [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
   ; GFX6:   [[DEF:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF
   ; GFX6:   [[COPY5:%[0-9]+]]:vreg_128 = COPY [[DEF]]
   ; GFX6:   [[COPY6:%[0-9]+]]:vreg_128 = COPY [[DEF]]
@@ -1611,7 +1611,7 @@ define amdgpu_ps <3 x float> @s_buffer_load_v3f32_vgpr_offset(<4 x i32> inreg %r
   ; GFX7:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX7:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; GFX7:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
-  ; GFX7:   [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
+  ; GFX7:   [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
   ; GFX7:   [[DEF:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF
   ; GFX7:   [[COPY5:%[0-9]+]]:vreg_128 = COPY [[DEF]]
   ; GFX7:   [[COPY6:%[0-9]+]]:vreg_128 = COPY [[DEF]]
@@ -1634,7 +1634,7 @@ define amdgpu_ps <3 x float> @s_buffer_load_v3f32_vgpr_offset(<4 x i32> inreg %r
   ; GFX8:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX8:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; GFX8:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
-  ; GFX8:   [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
+  ; GFX8:   [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
   ; GFX8:   [[DEF:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF
   ; GFX8:   [[COPY5:%[0-9]+]]:vreg_128 = COPY [[DEF]]
   ; GFX8:   [[COPY6:%[0-9]+]]:vreg_128 = COPY [[DEF]]
@@ -1662,7 +1662,7 @@ define amdgpu_ps <4 x float> @s_buffer_load_v4f32_vgpr_offset(<4 x i32> inreg %r
   ; GFX6:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX6:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; GFX6:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
-  ; GFX6:   [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
+  ; GFX6:   [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
   ; GFX6:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub0
   ; GFX6:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub1
   ; GFX6:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub2
@@ -1682,7 +1682,7 @@ define amdgpu_ps <4 x float> @s_buffer_load_v4f32_vgpr_offset(<4 x i32> inreg %r
   ; GFX7:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX7:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; GFX7:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
-  ; GFX7:   [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
+  ; GFX7:   [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
   ; GFX7:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub0
   ; GFX7:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub1
   ; GFX7:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub2
@@ -1702,7 +1702,7 @@ define amdgpu_ps <4 x float> @s_buffer_load_v4f32_vgpr_offset(<4 x i32> inreg %r
   ; GFX8:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX8:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; GFX8:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
-  ; GFX8:   [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
+  ; GFX8:   [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
   ; GFX8:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub0
   ; GFX8:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub1
   ; GFX8:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub2
@@ -1727,8 +1727,8 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset(<4 x i32> inreg %r
   ; GFX6:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX6:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; GFX6:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
-  ; GFX6:   [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
-  ; GFX6:   [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
+  ; GFX6:   [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
+  ; GFX6:   [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 16, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
   ; GFX6:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_256 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN1]], %subreg.sub4_sub5_sub6_sub7
   ; GFX6:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
   ; GFX6:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
@@ -1757,8 +1757,8 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset(<4 x i32> inreg %r
   ; GFX7:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX7:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; GFX7:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
-  ; GFX7:   [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
-  ; GFX7:   [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
+  ; GFX7:   [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
+  ; GFX7:   [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 16, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
   ; GFX7:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_256 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN1]], %subreg.sub4_sub5_sub6_sub7
   ; GFX7:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
   ; GFX7:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
@@ -1787,8 +1787,8 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset(<4 x i32> inreg %r
   ; GFX8:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX8:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; GFX8:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
-  ; GFX8:   [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
-  ; GFX8:   [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
+  ; GFX8:   [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
+  ; GFX8:   [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 16, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
   ; GFX8:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_256 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN1]], %subreg.sub4_sub5_sub6_sub7
   ; GFX8:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
   ; GFX8:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
@@ -1822,10 +1822,10 @@ define amdgpu_ps <16 x float> @s_buffer_load_v16f32_vgpr_offset(<4 x i32> inreg
   ; GFX6:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX6:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; GFX6:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
-  ; GFX6:   [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
-  ; GFX6:   [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
-  ; GFX6:   [[BUFFER_LOAD_DWORDX4_OFFEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 32, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16 + 16, align 4)
-  ; GFX6:   [[BUFFER_LOAD_DWORDX4_OFFEN3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 48, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16 + 48, align 4)
+  ; GFX6:   [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
+  ; GFX6:   [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 16, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
+  ; GFX6:   [[BUFFER_LOAD_DWORDX4_OFFEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 32, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16 + 16, align 4)
+  ; GFX6:   [[BUFFER_LOAD_DWORDX4_OFFEN3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 48, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16 + 48, align 4)
   ; GFX6:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_512 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN1]], %subreg.sub4_sub5_sub6_sub7, [[BUFFER_LOAD_DWORDX4_OFFEN2]], %subreg.sub8_sub9_sub10_sub11, [[BUFFER_LOAD_DWORDX4_OFFEN3]], %subreg.sub12_sub13_sub14_sub15
   ; GFX6:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
   ; GFX6:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
@@ -1870,10 +1870,10 @@ define amdgpu_ps <16 x float> @s_buffer_load_v16f32_vgpr_offset(<4 x i32> inreg
   ; GFX7:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX7:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; GFX7:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
-  ; GFX7:   [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
-  ; GFX7:   [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
-  ; GFX7:   [[BUFFER_LOAD_DWORDX4_OFFEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 32, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16 + 16, align 4)
-  ; GFX7:   [[BUFFER_LOAD_DWORDX4_OFFEN3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 48, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16 + 48, align 4)
+  ; GFX7:   [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
+  ; GFX7:   [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 16, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
+  ; GFX7:   [[BUFFER_LOAD_DWORDX4_OFFEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 32, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16 + 16, align 4)
+  ; GFX7:   [[BUFFER_LOAD_DWORDX4_OFFEN3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 48, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16 + 48, align 4)
   ; GFX7:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_512 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN1]], %subreg.sub4_sub5_sub6_sub7, [[BUFFER_LOAD_DWORDX4_OFFEN2]], %subreg.sub8_sub9_sub10_sub11, [[BUFFER_LOAD_DWORDX4_OFFEN3]], %subreg.sub12_sub13_sub14_sub15
   ; GFX7:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
   ; GFX7:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
@@ -1918,10 +1918,10 @@ define amdgpu_ps <16 x float> @s_buffer_load_v16f32_vgpr_offset(<4 x i32> inreg
   ; GFX8:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX8:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; GFX8:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
-  ; GFX8:   [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
-  ; GFX8:   [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
-  ; GFX8:   [[BUFFER_LOAD_DWORDX4_OFFEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 32, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16 + 16, align 4)
-  ; GFX8:   [[BUFFER_LOAD_DWORDX4_OFFEN3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 48, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16 + 48, align 4)
+  ; GFX8:   [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
+  ; GFX8:   [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 16, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
+  ; GFX8:   [[BUFFER_LOAD_DWORDX4_OFFEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 32, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16 + 16, align 4)
+  ; GFX8:   [[BUFFER_LOAD_DWORDX4_OFFEN3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 48, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16 + 48, align 4)
   ; GFX8:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_512 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN1]], %subreg.sub4_sub5_sub6_sub7, [[BUFFER_LOAD_DWORDX4_OFFEN2]], %subreg.sub8_sub9_sub10_sub11, [[BUFFER_LOAD_DWORDX4_OFFEN3]], %subreg.sub12_sub13_sub14_sub15
   ; GFX8:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
   ; GFX8:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
@@ -1971,7 +1971,7 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_offset_add_4092(<4 x i32> inreg %
   ; GFX6:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX6:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; GFX6:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
-  ; GFX6:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4092, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4)
+  ; GFX6:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4092, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4)
   ; GFX6:   $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
   ; GFX6:   SI_RETURN_TO_EPILOG implicit $vgpr0
   ; GFX7-LABEL: name: s_buffer_load_f32_vgpr_offset_add_4092
@@ -1984,7 +1984,7 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_offset_add_4092(<4 x i32> inreg %
   ; GFX7:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX7:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; GFX7:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
-  ; GFX7:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4092, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4)
+  ; GFX7:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4092, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4)
   ; GFX7:   $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
   ; GFX7:   SI_RETURN_TO_EPILOG implicit $vgpr0
   ; GFX8-LABEL: name: s_buffer_load_f32_vgpr_offset_add_4092
@@ -1997,7 +1997,7 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_offset_add_4092(<4 x i32> inreg %
   ; GFX8:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX8:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; GFX8:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
-  ; GFX8:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4092, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4)
+  ; GFX8:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4092, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4)
   ; GFX8:   $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
   ; GFX8:   SI_RETURN_TO_EPILOG implicit $vgpr0
   %soffset = add i32 %soffset.base, 4092
@@ -2016,7 +2016,7 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_offset_add_4095(<4 x i32> inreg %
   ; GFX6:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX6:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; GFX6:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
-  ; GFX6:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4)
+  ; GFX6:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4095, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4)
   ; GFX6:   $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
   ; GFX6:   SI_RETURN_TO_EPILOG implicit $vgpr0
   ; GFX7-LABEL: name: s_buffer_load_f32_vgpr_offset_add_4095
@@ -2029,7 +2029,7 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_offset_add_4095(<4 x i32> inreg %
   ; GFX7:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX7:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; GFX7:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
-  ; GFX7:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4)
+  ; GFX7:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4095, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4)
   ; GFX7:   $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
   ; GFX7:   SI_RETURN_TO_EPILOG implicit $vgpr0
   ; GFX8-LABEL: name: s_buffer_load_f32_vgpr_offset_add_4095
@@ -2042,7 +2042,7 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_offset_add_4095(<4 x i32> inreg %
   ; GFX8:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX8:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; GFX8:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
-  ; GFX8:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4)
+  ; GFX8:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4095, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4)
   ; GFX8:   $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
   ; GFX8:   SI_RETURN_TO_EPILOG implicit $vgpr0
   %soffset = add i32 %soffset.base, 4095
@@ -2061,7 +2061,7 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_offset_add_4096(<4 x i32> inreg %
   ; GFX6:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX6:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; GFX6:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
-  ; GFX6:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4)
+  ; GFX6:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4)
   ; GFX6:   $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
   ; GFX6:   SI_RETURN_TO_EPILOG implicit $vgpr0
   ; GFX7-LABEL: name: s_buffer_load_f32_vgpr_offset_add_4096
@@ -2074,7 +2074,7 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_offset_add_4096(<4 x i32> inreg %
   ; GFX7:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX7:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; GFX7:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
-  ; GFX7:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4)
+  ; GFX7:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4)
   ; GFX7:   $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
   ; GFX7:   SI_RETURN_TO_EPILOG implicit $vgpr0
   ; GFX8-LABEL: name: s_buffer_load_f32_vgpr_offset_add_4096
@@ -2087,7 +2087,7 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_offset_add_4096(<4 x i32> inreg %
   ; GFX8:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX8:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; GFX8:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1
-  ; GFX8:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4)
+  ; GFX8:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4095, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4)
   ; GFX8:   $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
   ; GFX8:   SI_RETURN_TO_EPILOG implicit $vgpr0
   %soffset = add i32 %soffset.base, 4096
@@ -2107,8 +2107,8 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_add_4064(<4 x i32>
   ; GFX6:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX6:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; GFX6:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
-  ; GFX6:   [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4064, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
-  ; GFX6:   [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4080, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
+  ; GFX6:   [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4064, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
+  ; GFX6:   [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4080, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
   ; GFX6:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_256 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN1]], %subreg.sub4_sub5_sub6_sub7
   ; GFX6:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
   ; GFX6:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
@@ -2137,8 +2137,8 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_add_4064(<4 x i32>
   ; GFX7:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX7:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; GFX7:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
-  ; GFX7:   [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4064, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
-  ; GFX7:   [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4080, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
+  ; GFX7:   [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4064, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
+  ; GFX7:   [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4080, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
   ; GFX7:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_256 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN1]], %subreg.sub4_sub5_sub6_sub7
   ; GFX7:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
   ; GFX7:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
@@ -2167,8 +2167,8 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_add_4064(<4 x i32>
   ; GFX8:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX8:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; GFX8:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
-  ; GFX8:   [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4064, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
-  ; GFX8:   [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4080, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
+  ; GFX8:   [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4064, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
+  ; GFX8:   [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4080, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
   ; GFX8:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_256 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN1]], %subreg.sub4_sub5_sub6_sub7
   ; GFX8:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
   ; GFX8:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
@@ -2204,8 +2204,8 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_add_4068(<4 x i32>
   ; GFX6:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX6:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; GFX6:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4068
-  ; GFX6:   [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
-  ; GFX6:   [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
+  ; GFX6:   [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
+  ; GFX6:   [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 16, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
   ; GFX6:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_256 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN1]], %subreg.sub4_sub5_sub6_sub7
   ; GFX6:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
   ; GFX6:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
@@ -2234,8 +2234,8 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_add_4068(<4 x i32>
   ; GFX7:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX7:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; GFX7:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4068
-  ; GFX7:   [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
-  ; GFX7:   [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
+  ; GFX7:   [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
+  ; GFX7:   [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 16, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
   ; GFX7:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_256 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN1]], %subreg.sub4_sub5_sub6_sub7
   ; GFX7:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
   ; GFX7:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
@@ -2264,8 +2264,8 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_add_4068(<4 x i32>
   ; GFX8:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX8:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; GFX8:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4
-  ; GFX8:   [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4064, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
-  ; GFX8:   [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4080, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
+  ; GFX8:   [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4064, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
+  ; GFX8:   [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4080, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
   ; GFX8:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_256 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN1]], %subreg.sub4_sub5_sub6_sub7
   ; GFX8:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
   ; GFX8:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
@@ -2300,10 +2300,10 @@ define amdgpu_ps <16 x float> @s_buffer_load_v16f32_vgpr_offset_add_4032(<4 x i3
   ; GFX6:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX6:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; GFX6:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
-  ; GFX6:   [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4032, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
-  ; GFX6:   [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4048, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
-  ; GFX6:   [[BUFFER_LOAD_DWORDX4_OFFEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4064, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16 + 16, align 4)
-  ; GFX6:   [[BUFFER_LOAD_DWORDX4_OFFEN3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4080, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16 + 48, align 4)
+  ; GFX6:   [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4032, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
+  ; GFX6:   [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4048, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
+  ; GFX6:   [[BUFFER_LOAD_DWORDX4_OFFEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4064, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16 + 16, align 4)
+  ; GFX6:   [[BUFFER_LOAD_DWORDX4_OFFEN3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4080, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16 + 48, align 4)
   ; GFX6:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_512 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN1]], %subreg.sub4_sub5_sub6_sub7, [[BUFFER_LOAD_DWORDX4_OFFEN2]], %subreg.sub8_sub9_sub10_sub11, [[BUFFER_LOAD_DWORDX4_OFFEN3]], %subreg.sub12_sub13_sub14_sub15
   ; GFX6:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
   ; GFX6:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
@@ -2348,10 +2348,10 @@ define amdgpu_ps <16 x float> @s_buffer_load_v16f32_vgpr_offset_add_4032(<4 x i3
   ; GFX7:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX7:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; GFX7:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
-  ; GFX7:   [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4032, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
-  ; GFX7:   [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4048, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
-  ; GFX7:   [[BUFFER_LOAD_DWORDX4_OFFEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4064, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16 + 16, align 4)
-  ; GFX7:   [[BUFFER_LOAD_DWORDX4_OFFEN3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4080, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16 + 48, align 4)
+  ; GFX7:   [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4032, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
+  ; GFX7:   [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4048, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
+  ; GFX7:   [[BUFFER_LOAD_DWORDX4_OFFEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4064, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16 + 16, align 4)
+  ; GFX7:   [[BUFFER_LOAD_DWORDX4_OFFEN3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4080, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16 + 48, align 4)
   ; GFX7:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_512 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN1]], %subreg.sub4_sub5_sub6_sub7, [[BUFFER_LOAD_DWORDX4_OFFEN2]], %subreg.sub8_sub9_sub10_sub11, [[BUFFER_LOAD_DWORDX4_OFFEN3]], %subreg.sub12_sub13_sub14_sub15
   ; GFX7:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
   ; GFX7:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
@@ -2396,10 +2396,10 @@ define amdgpu_ps <16 x float> @s_buffer_load_v16f32_vgpr_offset_add_4032(<4 x i3
   ; GFX8:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX8:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; GFX8:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
-  ; GFX8:   [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4032, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
-  ; GFX8:   [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4048, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
-  ; GFX8:   [[BUFFER_LOAD_DWORDX4_OFFEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4064, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16 + 16, align 4)
-  ; GFX8:   [[BUFFER_LOAD_DWORDX4_OFFEN3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4080, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16 + 48, align 4)
+  ; GFX8:   [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4032, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
+  ; GFX8:   [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4048, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
+  ; GFX8:   [[BUFFER_LOAD_DWORDX4_OFFEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4064, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16 + 16, align 4)
+  ; GFX8:   [[BUFFER_LOAD_DWORDX4_OFFEN3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4080, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16 + 48, align 4)
   ; GFX8:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_512 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN1]], %subreg.sub4_sub5_sub6_sub7, [[BUFFER_LOAD_DWORDX4_OFFEN2]], %subreg.sub8_sub9_sub10_sub11, [[BUFFER_LOAD_DWORDX4_OFFEN3]], %subreg.sub12_sub13_sub14_sub15
   ; GFX8:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
   ; GFX8:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
@@ -2450,10 +2450,10 @@ define amdgpu_ps <16 x float> @s_buffer_load_v16f32_vgpr_offset_add_4036(<4 x i3
   ; GFX6:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX6:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; GFX6:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4036
-  ; GFX6:   [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
-  ; GFX6:   [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
-  ; GFX6:   [[BUFFER_LOAD_DWORDX4_OFFEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 32, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16 + 16, align 4)
-  ; GFX6:   [[BUFFER_LOAD_DWORDX4_OFFEN3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 48, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16 + 48, align 4)
+  ; GFX6:   [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
+  ; GFX6:   [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 16, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
+  ; GFX6:   [[BUFFER_LOAD_DWORDX4_OFFEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 32, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16 + 16, align 4)
+  ; GFX6:   [[BUFFER_LOAD_DWORDX4_OFFEN3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 48, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16 + 48, align 4)
   ; GFX6:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_512 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN1]], %subreg.sub4_sub5_sub6_sub7, [[BUFFER_LOAD_DWORDX4_OFFEN2]], %subreg.sub8_sub9_sub10_sub11, [[BUFFER_LOAD_DWORDX4_OFFEN3]], %subreg.sub12_sub13_sub14_sub15
   ; GFX6:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
   ; GFX6:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
@@ -2498,10 +2498,10 @@ define amdgpu_ps <16 x float> @s_buffer_load_v16f32_vgpr_offset_add_4036(<4 x i3
   ; GFX7:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX7:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; GFX7:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4036
-  ; GFX7:   [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
-  ; GFX7:   [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
-  ; GFX7:   [[BUFFER_LOAD_DWORDX4_OFFEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 32, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16 + 16, align 4)
-  ; GFX7:   [[BUFFER_LOAD_DWORDX4_OFFEN3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 48, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16 + 48, align 4)
+  ; GFX7:   [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
+  ; GFX7:   [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 16, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
+  ; GFX7:   [[BUFFER_LOAD_DWORDX4_OFFEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 32, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16 + 16, align 4)
+  ; GFX7:   [[BUFFER_LOAD_DWORDX4_OFFEN3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 48, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16 + 48, align 4)
   ; GFX7:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_512 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN1]], %subreg.sub4_sub5_sub6_sub7, [[BUFFER_LOAD_DWORDX4_OFFEN2]], %subreg.sub8_sub9_sub10_sub11, [[BUFFER_LOAD_DWORDX4_OFFEN3]], %subreg.sub12_sub13_sub14_sub15
   ; GFX7:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
   ; GFX7:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
@@ -2546,10 +2546,10 @@ define amdgpu_ps <16 x float> @s_buffer_load_v16f32_vgpr_offset_add_4036(<4 x i3
   ; GFX8:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX8:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; GFX8:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4
-  ; GFX8:   [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4032, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
-  ; GFX8:   [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4048, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
-  ; GFX8:   [[BUFFER_LOAD_DWORDX4_OFFEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4064, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16 + 16, align 4)
-  ; GFX8:   [[BUFFER_LOAD_DWORDX4_OFFEN3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4080, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16 + 48, align 4)
+  ; GFX8:   [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4032, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
+  ; GFX8:   [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4048, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
+  ; GFX8:   [[BUFFER_LOAD_DWORDX4_OFFEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4064, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16 + 16, align 4)
+  ; GFX8:   [[BUFFER_LOAD_DWORDX4_OFFEN3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4080, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16 + 48, align 4)
   ; GFX8:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_512 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN1]], %subreg.sub4_sub5_sub6_sub7, [[BUFFER_LOAD_DWORDX4_OFFEN2]], %subreg.sub8_sub9_sub10_sub11, [[BUFFER_LOAD_DWORDX4_OFFEN3]], %subreg.sub12_sub13_sub14_sub15
   ; GFX8:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
   ; GFX8:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
@@ -2617,7 +2617,7 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc(<4 x i32> %rsrc, i32 inreg %
   ; GFX6:   [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY7]], implicit $exec
   ; GFX6:   [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc
   ; GFX6:   [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
-  ; GFX6:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY5]], [[REG_SEQUENCE3]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4)
+  ; GFX6:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY5]], [[REG_SEQUENCE3]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4)
   ; GFX6:   [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX6:   $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
   ; GFX6:   S_CBRANCH_EXECNZ %bb.2, implicit $exec
@@ -2652,7 +2652,7 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc(<4 x i32> %rsrc, i32 inreg %
   ; GFX7:   [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY7]], implicit $exec
   ; GFX7:   [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc
   ; GFX7:   [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
-  ; GFX7:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY5]], [[REG_SEQUENCE3]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4)
+  ; GFX7:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY5]], [[REG_SEQUENCE3]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4)
   ; GFX7:   [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX7:   $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
   ; GFX7:   S_CBRANCH_EXECNZ %bb.2, implicit $exec
@@ -2687,7 +2687,7 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc(<4 x i32> %rsrc, i32 inreg %
   ; GFX8:   [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY7]], implicit $exec
   ; GFX8:   [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc
   ; GFX8:   [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
-  ; GFX8:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY5]], [[REG_SEQUENCE3]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4)
+  ; GFX8:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY5]], [[REG_SEQUENCE3]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4)
   ; GFX8:   [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX8:   $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
   ; GFX8:   S_CBRANCH_EXECNZ %bb.2, implicit $exec
@@ -2726,7 +2726,7 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_soffset_add_4092(<4 x i32> %
   ; GFX6:   [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY6]], implicit $exec
   ; GFX6:   [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc
   ; GFX6:   [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
-  ; GFX6:   [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[REG_SEQUENCE3]], [[COPY4]], 4092, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4)
+  ; GFX6:   [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[REG_SEQUENCE3]], [[COPY4]], 4092, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4)
   ; GFX6:   [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX6:   $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
   ; GFX6:   S_CBRANCH_EXECNZ %bb.2, implicit $exec
@@ -2759,7 +2759,7 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_soffset_add_4092(<4 x i32> %
   ; GFX7:   [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY6]], implicit $exec
   ; GFX7:   [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc
   ; GFX7:   [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
-  ; GFX7:   [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[REG_SEQUENCE3]], [[COPY4]], 4092, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4)
+  ; GFX7:   [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[REG_SEQUENCE3]], [[COPY4]], 4092, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4)
   ; GFX7:   [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX7:   $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
   ; GFX7:   S_CBRANCH_EXECNZ %bb.2, implicit $exec
@@ -2792,7 +2792,7 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_soffset_add_4092(<4 x i32> %
   ; GFX8:   [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY6]], implicit $exec
   ; GFX8:   [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc
   ; GFX8:   [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
-  ; GFX8:   [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[REG_SEQUENCE3]], [[COPY4]], 4092, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4)
+  ; GFX8:   [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[REG_SEQUENCE3]], [[COPY4]], 4092, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4)
   ; GFX8:   [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX8:   $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
   ; GFX8:   S_CBRANCH_EXECNZ %bb.2, implicit $exec
@@ -2836,7 +2836,7 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_soffset_add_4096(<4 x i32> %
   ; GFX6:   [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY7]], implicit $exec
   ; GFX6:   [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc
   ; GFX6:   [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
-  ; GFX6:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY5]], [[REG_SEQUENCE3]], [[S_MOV_B32_1]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4)
+  ; GFX6:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY5]], [[REG_SEQUENCE3]], [[S_MOV_B32_1]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4)
   ; GFX6:   [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX6:   $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
   ; GFX6:   S_CBRANCH_EXECNZ %bb.2, implicit $exec
@@ -2873,7 +2873,7 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_soffset_add_4096(<4 x i32> %
   ; GFX7:   [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY7]], implicit $exec
   ; GFX7:   [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc
   ; GFX7:   [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
-  ; GFX7:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY5]], [[REG_SEQUENCE3]], [[S_MOV_B32_1]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4)
+  ; GFX7:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY5]], [[REG_SEQUENCE3]], [[S_MOV_B32_1]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4)
   ; GFX7:   [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX7:   $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
   ; GFX7:   S_CBRANCH_EXECNZ %bb.2, implicit $exec
@@ -2910,7 +2910,7 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_soffset_add_4096(<4 x i32> %
   ; GFX8:   [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY7]], implicit $exec
   ; GFX8:   [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc
   ; GFX8:   [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
-  ; GFX8:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY5]], [[REG_SEQUENCE3]], [[S_MOV_B32_1]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4)
+  ; GFX8:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY5]], [[REG_SEQUENCE3]], [[S_MOV_B32_1]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4)
   ; GFX8:   [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX8:   $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
   ; GFX8:   S_CBRANCH_EXECNZ %bb.2, implicit $exec
@@ -2950,7 +2950,7 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_offset_4095(<4 x i32> %rsrc)
   ; GFX6:   [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY5]], implicit $exec
   ; GFX6:   [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc
   ; GFX6:   [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
-  ; GFX6:   [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[REG_SEQUENCE3]], [[S_MOV_B32_]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4 + 4095, align 1)
+  ; GFX6:   [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[REG_SEQUENCE3]], [[S_MOV_B32_]], 4095, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4 + 4095, align 1)
   ; GFX6:   [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX6:   $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
   ; GFX6:   S_CBRANCH_EXECNZ %bb.2, implicit $exec
@@ -2983,7 +2983,7 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_offset_4095(<4 x i32> %rsrc)
   ; GFX7:   [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY5]], implicit $exec
   ; GFX7:   [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc
   ; GFX7:   [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
-  ; GFX7:   [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[REG_SEQUENCE3]], [[S_MOV_B32_]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4 + 4095, align 1)
+  ; GFX7:   [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[REG_SEQUENCE3]], [[S_MOV_B32_]], 4095, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4 + 4095, align 1)
   ; GFX7:   [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX7:   $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
   ; GFX7:   S_CBRANCH_EXECNZ %bb.2, implicit $exec
@@ -3016,7 +3016,7 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_offset_4095(<4 x i32> %rsrc)
   ; GFX8:   [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY5]], implicit $exec
   ; GFX8:   [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc
   ; GFX8:   [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
-  ; GFX8:   [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[REG_SEQUENCE3]], [[S_MOV_B32_]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4 + 4095, align 1)
+  ; GFX8:   [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[REG_SEQUENCE3]], [[S_MOV_B32_]], 4095, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4 + 4095, align 1)
   ; GFX8:   [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX8:   $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
   ; GFX8:   S_CBRANCH_EXECNZ %bb.2, implicit $exec
@@ -3057,7 +3057,7 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_offset_4096(<4 x i32> %rsrc)
   ; GFX6:   [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY6]], implicit $exec
   ; GFX6:   [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc
   ; GFX6:   [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
-  ; GFX6:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE3]], [[S_MOV_B32_1]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4)
+  ; GFX6:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE3]], [[S_MOV_B32_1]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4)
   ; GFX6:   [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX6:   $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
   ; GFX6:   S_CBRANCH_EXECNZ %bb.2, implicit $exec
@@ -3092,7 +3092,7 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_offset_4096(<4 x i32> %rsrc)
   ; GFX7:   [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY6]], implicit $exec
   ; GFX7:   [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc
   ; GFX7:   [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
-  ; GFX7:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE3]], [[S_MOV_B32_1]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4)
+  ; GFX7:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE3]], [[S_MOV_B32_1]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4)
   ; GFX7:   [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX7:   $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
   ; GFX7:   S_CBRANCH_EXECNZ %bb.2, implicit $exec
@@ -3125,7 +3125,7 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_offset_4096(<4 x i32> %rsrc)
   ; GFX8:   [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY5]], implicit $exec
   ; GFX8:   [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc
   ; GFX8:   [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
-  ; GFX8:   [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[REG_SEQUENCE3]], [[S_MOV_B32_]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4 + 4096)
+  ; GFX8:   [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[REG_SEQUENCE3]], [[S_MOV_B32_]], 4095, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4 + 4096)
   ; GFX8:   [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX8:   $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
   ; GFX8:   S_CBRANCH_EXECNZ %bb.2, implicit $exec
@@ -3165,8 +3165,8 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4064(<4 x i32> %
   ; GFX6:   [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY6]], implicit $exec
   ; GFX6:   [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc
   ; GFX6:   [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
-  ; GFX6:   [[BUFFER_LOAD_DWORDX4_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE3]], [[COPY4]], 4064, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
-  ; GFX6:   [[BUFFER_LOAD_DWORDX4_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE3]], [[COPY4]], 4080, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
+  ; GFX6:   [[BUFFER_LOAD_DWORDX4_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE3]], [[COPY4]], 4064, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
+  ; GFX6:   [[BUFFER_LOAD_DWORDX4_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE3]], [[COPY4]], 4080, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
   ; GFX6:   [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX6:   $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
   ; GFX6:   S_CBRANCH_EXECNZ %bb.2, implicit $exec
@@ -3215,8 +3215,8 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4064(<4 x i32> %
   ; GFX7:   [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY6]], implicit $exec
   ; GFX7:   [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc
   ; GFX7:   [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
-  ; GFX7:   [[BUFFER_LOAD_DWORDX4_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE3]], [[COPY4]], 4064, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
-  ; GFX7:   [[BUFFER_LOAD_DWORDX4_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE3]], [[COPY4]], 4080, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
+  ; GFX7:   [[BUFFER_LOAD_DWORDX4_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE3]], [[COPY4]], 4064, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
+  ; GFX7:   [[BUFFER_LOAD_DWORDX4_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE3]], [[COPY4]], 4080, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
   ; GFX7:   [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX7:   $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
   ; GFX7:   S_CBRANCH_EXECNZ %bb.2, implicit $exec
@@ -3265,8 +3265,8 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4064(<4 x i32> %
   ; GFX8:   [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY6]], implicit $exec
   ; GFX8:   [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc
   ; GFX8:   [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
-  ; GFX8:   [[BUFFER_LOAD_DWORDX4_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE3]], [[COPY4]], 4064, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
-  ; GFX8:   [[BUFFER_LOAD_DWORDX4_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE3]], [[COPY4]], 4080, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
+  ; GFX8:   [[BUFFER_LOAD_DWORDX4_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE3]], [[COPY4]], 4064, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
+  ; GFX8:   [[BUFFER_LOAD_DWORDX4_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE3]], [[COPY4]], 4080, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
   ; GFX8:   [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX8:   $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
   ; GFX8:   S_CBRANCH_EXECNZ %bb.2, implicit $exec
@@ -3327,8 +3327,8 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4068(<4 x i32> %
   ; GFX6:   [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY7]], implicit $exec
   ; GFX6:   [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc
   ; GFX6:   [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
-  ; GFX6:   [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY5]], [[REG_SEQUENCE3]], [[S_MOV_B32_1]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
-  ; GFX6:   [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY5]], [[REG_SEQUENCE3]], [[S_MOV_B32_1]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
+  ; GFX6:   [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY5]], [[REG_SEQUENCE3]], [[S_MOV_B32_1]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
+  ; GFX6:   [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY5]], [[REG_SEQUENCE3]], [[S_MOV_B32_1]], 16, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
   ; GFX6:   [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX6:   $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
   ; GFX6:   S_CBRANCH_EXECNZ %bb.2, implicit $exec
@@ -3381,8 +3381,8 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4068(<4 x i32> %
   ; GFX7:   [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY7]], implicit $exec
   ; GFX7:   [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc
   ; GFX7:   [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
-  ; GFX7:   [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY5]], [[REG_SEQUENCE3]], [[S_MOV_B32_1]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
-  ; GFX7:   [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY5]], [[REG_SEQUENCE3]], [[S_MOV_B32_1]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
+  ; GFX7:   [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY5]], [[REG_SEQUENCE3]], [[S_MOV_B32_1]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
+  ; GFX7:   [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY5]], [[REG_SEQUENCE3]], [[S_MOV_B32_1]], 16, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
   ; GFX7:   [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX7:   $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
   ; GFX7:   S_CBRANCH_EXECNZ %bb.2, implicit $exec
@@ -3435,8 +3435,8 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4068(<4 x i32> %
   ; GFX8:   [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY7]], implicit $exec
   ; GFX8:   [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc
   ; GFX8:   [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
-  ; GFX8:   [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY5]], [[REG_SEQUENCE3]], [[S_MOV_B32_1]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
-  ; GFX8:   [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY5]], [[REG_SEQUENCE3]], [[S_MOV_B32_1]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
+  ; GFX8:   [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY5]], [[REG_SEQUENCE3]], [[S_MOV_B32_1]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
+  ; GFX8:   [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY5]], [[REG_SEQUENCE3]], [[S_MOV_B32_1]], 16, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
   ; GFX8:   [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX8:   $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
   ; GFX8:   S_CBRANCH_EXECNZ %bb.2, implicit $exec
@@ -3495,8 +3495,8 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4096(<4 x i32> %
   ; GFX6:   [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY7]], implicit $exec
   ; GFX6:   [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc
   ; GFX6:   [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
-  ; GFX6:   [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY5]], [[REG_SEQUENCE3]], [[S_MOV_B32_1]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
-  ; GFX6:   [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY5]], [[REG_SEQUENCE3]], [[S_MOV_B32_1]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
+  ; GFX6:   [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY5]], [[REG_SEQUENCE3]], [[S_MOV_B32_1]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
+  ; GFX6:   [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY5]], [[REG_SEQUENCE3]], [[S_MOV_B32_1]], 16, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
   ; GFX6:   [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX6:   $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
   ; GFX6:   S_CBRANCH_EXECNZ %bb.2, implicit $exec
@@ -3549,8 +3549,8 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4096(<4 x i32> %
   ; GFX7:   [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY7]], implicit $exec
   ; GFX7:   [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc
   ; GFX7:   [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
-  ; GFX7:   [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY5]], [[REG_SEQUENCE3]], [[S_MOV_B32_1]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
-  ; GFX7:   [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY5]], [[REG_SEQUENCE3]], [[S_MOV_B32_1]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
+  ; GFX7:   [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY5]], [[REG_SEQUENCE3]], [[S_MOV_B32_1]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
+  ; GFX7:   [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY5]], [[REG_SEQUENCE3]], [[S_MOV_B32_1]], 16, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
   ; GFX7:   [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX7:   $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
   ; GFX7:   S_CBRANCH_EXECNZ %bb.2, implicit $exec
@@ -3603,8 +3603,8 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4096(<4 x i32> %
   ; GFX8:   [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY7]], implicit $exec
   ; GFX8:   [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc
   ; GFX8:   [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
-  ; GFX8:   [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY5]], [[REG_SEQUENCE3]], [[S_MOV_B32_1]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
-  ; GFX8:   [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY5]], [[REG_SEQUENCE3]], [[S_MOV_B32_1]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
+  ; GFX8:   [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY5]], [[REG_SEQUENCE3]], [[S_MOV_B32_1]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
+  ; GFX8:   [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY5]], [[REG_SEQUENCE3]], [[S_MOV_B32_1]], 16, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
   ; GFX8:   [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX8:   $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
   ; GFX8:   S_CBRANCH_EXECNZ %bb.2, implicit $exec
@@ -3660,8 +3660,8 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_5000
   ; GFX6:   [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY6]], implicit $exec
   ; GFX6:   [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc
   ; GFX6:   [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
-  ; GFX6:   [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE3]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
-  ; GFX6:   [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE3]], [[S_MOV_B32_]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
+  ; GFX6:   [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE3]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
+  ; GFX6:   [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE3]], [[S_MOV_B32_]], 16, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
   ; GFX6:   [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX6:   $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
   ; GFX6:   S_CBRANCH_EXECNZ %bb.2, implicit $exec
@@ -3711,8 +3711,8 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_5000
   ; GFX7:   [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY6]], implicit $exec
   ; GFX7:   [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc
   ; GFX7:   [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
-  ; GFX7:   [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE3]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
-  ; GFX7:   [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE3]], [[S_MOV_B32_]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
+  ; GFX7:   [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE3]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
+  ; GFX7:   [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE3]], [[S_MOV_B32_]], 16, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
   ; GFX7:   [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX7:   $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
   ; GFX7:   S_CBRANCH_EXECNZ %bb.2, implicit $exec
@@ -3762,8 +3762,8 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_5000
   ; GFX8:   [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY6]], implicit $exec
   ; GFX8:   [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc
   ; GFX8:   [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
-  ; GFX8:   [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE3]], [[S_MOV_B32_]], 936, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
-  ; GFX8:   [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE3]], [[S_MOV_B32_]], 952, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
+  ; GFX8:   [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE3]], [[S_MOV_B32_]], 936, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
+  ; GFX8:   [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE3]], [[S_MOV_B32_]], 952, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
   ; GFX8:   [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX8:   $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
   ; GFX8:   S_CBRANCH_EXECNZ %bb.2, implicit $exec
@@ -3819,8 +3819,8 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4076
   ; GFX6:   [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY6]], implicit $exec
   ; GFX6:   [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc
   ; GFX6:   [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
-  ; GFX6:   [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE3]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
-  ; GFX6:   [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE3]], [[S_MOV_B32_]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
+  ; GFX6:   [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE3]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
+  ; GFX6:   [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE3]], [[S_MOV_B32_]], 16, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
   ; GFX6:   [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX6:   $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
   ; GFX6:   S_CBRANCH_EXECNZ %bb.2, implicit $exec
@@ -3870,8 +3870,8 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4076
   ; GFX7:   [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY6]], implicit $exec
   ; GFX7:   [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc
   ; GFX7:   [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
-  ; GFX7:   [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE3]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
-  ; GFX7:   [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE3]], [[S_MOV_B32_]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
+  ; GFX7:   [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE3]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
+  ; GFX7:   [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE3]], [[S_MOV_B32_]], 16, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
   ; GFX7:   [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX7:   $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
   ; GFX7:   S_CBRANCH_EXECNZ %bb.2, implicit $exec
@@ -3921,8 +3921,8 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4076
   ; GFX8:   [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY6]], implicit $exec
   ; GFX8:   [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc
   ; GFX8:   [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
-  ; GFX8:   [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE3]], [[S_MOV_B32_]], 4064, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
-  ; GFX8:   [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE3]], [[S_MOV_B32_]], 4080, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
+  ; GFX8:   [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE3]], [[S_MOV_B32_]], 4064, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
+  ; GFX8:   [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE3]], [[S_MOV_B32_]], 4080, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
   ; GFX8:   [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX8:   $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
   ; GFX8:   S_CBRANCH_EXECNZ %bb.2, implicit $exec
@@ -3978,8 +3978,8 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4080
   ; GFX6:   [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY6]], implicit $exec
   ; GFX6:   [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc
   ; GFX6:   [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
-  ; GFX6:   [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE3]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
-  ; GFX6:   [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE3]], [[S_MOV_B32_]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
+  ; GFX6:   [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE3]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
+  ; GFX6:   [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE3]], [[S_MOV_B32_]], 16, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
   ; GFX6:   [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX6:   $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
   ; GFX6:   S_CBRANCH_EXECNZ %bb.2, implicit $exec
@@ -4029,8 +4029,8 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4080
   ; GFX7:   [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY6]], implicit $exec
   ; GFX7:   [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc
   ; GFX7:   [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
-  ; GFX7:   [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE3]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
-  ; GFX7:   [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE3]], [[S_MOV_B32_]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
+  ; GFX7:   [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE3]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
+  ; GFX7:   [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE3]], [[S_MOV_B32_]], 16, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
   ; GFX7:   [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX7:   $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
   ; GFX7:   S_CBRANCH_EXECNZ %bb.2, implicit $exec
@@ -4080,8 +4080,8 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4080
   ; GFX8:   [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY6]], implicit $exec
   ; GFX8:   [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc
   ; GFX8:   [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
-  ; GFX8:   [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE3]], [[S_MOV_B32_]], 4064, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
-  ; GFX8:   [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE3]], [[S_MOV_B32_]], 4080, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
+  ; GFX8:   [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE3]], [[S_MOV_B32_]], 4064, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
+  ; GFX8:   [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE3]], [[S_MOV_B32_]], 4080, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
   ; GFX8:   [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX8:   $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
   ; GFX8:   S_CBRANCH_EXECNZ %bb.2, implicit $exec
@@ -4136,8 +4136,8 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_offset_4
   ; GFX6:   [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY5]], implicit $exec
   ; GFX6:   [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc
   ; GFX6:   [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
-  ; GFX6:   [[BUFFER_LOAD_DWORDX4_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE3]], [[S_MOV_B32_]], 4064, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16 + 4064, align 4)
-  ; GFX6:   [[BUFFER_LOAD_DWORDX4_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE3]], [[S_MOV_B32_]], 4080, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16 + 4064, align 4)
+  ; GFX6:   [[BUFFER_LOAD_DWORDX4_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE3]], [[S_MOV_B32_]], 4064, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16 + 4064, align 4)
+  ; GFX6:   [[BUFFER_LOAD_DWORDX4_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE3]], [[S_MOV_B32_]], 4080, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16 + 4064, align 4)
   ; GFX6:   [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX6:   $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
   ; GFX6:   S_CBRANCH_EXECNZ %bb.2, implicit $exec
@@ -4186,8 +4186,8 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_offset_4
   ; GFX7:   [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY5]], implicit $exec
   ; GFX7:   [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc
   ; GFX7:   [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
-  ; GFX7:   [[BUFFER_LOAD_DWORDX4_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE3]], [[S_MOV_B32_]], 4064, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16 + 4064, align 4)
-  ; GFX7:   [[BUFFER_LOAD_DWORDX4_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE3]], [[S_MOV_B32_]], 4080, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16 + 4064, align 4)
+  ; GFX7:   [[BUFFER_LOAD_DWORDX4_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE3]], [[S_MOV_B32_]], 4064, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16 + 4064, align 4)
+  ; GFX7:   [[BUFFER_LOAD_DWORDX4_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE3]], [[S_MOV_B32_]], 4080, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16 + 4064, align 4)
   ; GFX7:   [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX7:   $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
   ; GFX7:   S_CBRANCH_EXECNZ %bb.2, implicit $exec
@@ -4236,8 +4236,8 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_offset_4
   ; GFX8:   [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY5]], implicit $exec
   ; GFX8:   [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc
   ; GFX8:   [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
-  ; GFX8:   [[BUFFER_LOAD_DWORDX4_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE3]], [[S_MOV_B32_]], 4064, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16 + 4064, align 4)
-  ; GFX8:   [[BUFFER_LOAD_DWORDX4_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE3]], [[S_MOV_B32_]], 4080, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16 + 4064, align 4)
+  ; GFX8:   [[BUFFER_LOAD_DWORDX4_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE3]], [[S_MOV_B32_]], 4064, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16 + 4064, align 4)
+  ; GFX8:   [[BUFFER_LOAD_DWORDX4_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE3]], [[S_MOV_B32_]], 4080, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16 + 4064, align 4)
   ; GFX8:   [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX8:   $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
   ; GFX8:   S_CBRANCH_EXECNZ %bb.2, implicit $exec
@@ -4277,7 +4277,7 @@ define amdgpu_ps float @s_buffer_load_f32_offset_add_vgpr_sgpr(<4 x i32> inreg %
   ; GFX6:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX6:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; GFX6:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-  ; GFX6:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4)
+  ; GFX6:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4)
   ; GFX6:   $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
   ; GFX6:   SI_RETURN_TO_EPILOG implicit $vgpr0
   ; GFX7-LABEL: name: s_buffer_load_f32_offset_add_vgpr_sgpr
@@ -4290,7 +4290,7 @@ define amdgpu_ps float @s_buffer_load_f32_offset_add_vgpr_sgpr(<4 x i32> inreg %
   ; GFX7:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX7:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; GFX7:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-  ; GFX7:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4)
+  ; GFX7:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4)
   ; GFX7:   $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
   ; GFX7:   SI_RETURN_TO_EPILOG implicit $vgpr0
   ; GFX8-LABEL: name: s_buffer_load_f32_offset_add_vgpr_sgpr
@@ -4303,7 +4303,7 @@ define amdgpu_ps float @s_buffer_load_f32_offset_add_vgpr_sgpr(<4 x i32> inreg %
   ; GFX8:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX8:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; GFX8:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-  ; GFX8:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4)
+  ; GFX8:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4)
   ; GFX8:   $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
   ; GFX8:   SI_RETURN_TO_EPILOG implicit $vgpr0
   %offset = add i32 %offset.v, %offset.s
@@ -4322,7 +4322,7 @@ define amdgpu_ps float @s_buffer_load_f32_offset_add_sgpr_vgpr(<4 x i32> inreg %
   ; GFX6:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX6:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; GFX6:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-  ; GFX6:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4)
+  ; GFX6:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4)
   ; GFX6:   $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
   ; GFX6:   SI_RETURN_TO_EPILOG implicit $vgpr0
   ; GFX7-LABEL: name: s_buffer_load_f32_offset_add_sgpr_vgpr
@@ -4335,7 +4335,7 @@ define amdgpu_ps float @s_buffer_load_f32_offset_add_sgpr_vgpr(<4 x i32> inreg %
   ; GFX7:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX7:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; GFX7:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-  ; GFX7:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4)
+  ; GFX7:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4)
   ; GFX7:   $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
   ; GFX7:   SI_RETURN_TO_EPILOG implicit $vgpr0
   ; GFX8-LABEL: name: s_buffer_load_f32_offset_add_sgpr_vgpr
@@ -4348,7 +4348,7 @@ define amdgpu_ps float @s_buffer_load_f32_offset_add_sgpr_vgpr(<4 x i32> inreg %
   ; GFX8:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX8:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; GFX8:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-  ; GFX8:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4)
+  ; GFX8:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4)
   ; GFX8:   $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
   ; GFX8:   SI_RETURN_TO_EPILOG implicit $vgpr0
   %offset = add i32 %offset.s, %offset.v
@@ -4370,7 +4370,7 @@ define amdgpu_ps float @s_buffer_load_f32_offset_add_vgpr_sgpr_imm(<4 x i32> inr
   ; GFX6:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
   ; GFX6:   %9:vgpr_32, dead %17:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY4]], [[COPY6]], 0, implicit $exec
   ; GFX6:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
-  ; GFX6:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %9, [[REG_SEQUENCE]], [[S_MOV_B32_]], 1024, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4)
+  ; GFX6:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %9, [[REG_SEQUENCE]], [[S_MOV_B32_]], 1024, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4)
   ; GFX6:   $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
   ; GFX6:   SI_RETURN_TO_EPILOG implicit $vgpr0
   ; GFX7-LABEL: name: s_buffer_load_f32_offset_add_vgpr_sgpr_imm
@@ -4386,7 +4386,7 @@ define amdgpu_ps float @s_buffer_load_f32_offset_add_vgpr_sgpr_imm(<4 x i32> inr
   ; GFX7:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
   ; GFX7:   %9:vgpr_32, dead %17:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY4]], [[COPY6]], 0, implicit $exec
   ; GFX7:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
-  ; GFX7:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %9, [[REG_SEQUENCE]], [[S_MOV_B32_]], 1024, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4)
+  ; GFX7:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %9, [[REG_SEQUENCE]], [[S_MOV_B32_]], 1024, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4)
   ; GFX7:   $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
   ; GFX7:   SI_RETURN_TO_EPILOG implicit $vgpr0
   ; GFX8-LABEL: name: s_buffer_load_f32_offset_add_vgpr_sgpr_imm
@@ -4402,7 +4402,7 @@ define amdgpu_ps float @s_buffer_load_f32_offset_add_vgpr_sgpr_imm(<4 x i32> inr
   ; GFX8:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
   ; GFX8:   %9:vgpr_32, dead %17:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY4]], [[COPY6]], 0, implicit $exec
   ; GFX8:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
-  ; GFX8:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %9, [[REG_SEQUENCE]], [[S_MOV_B32_]], 1024, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4)
+  ; GFX8:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %9, [[REG_SEQUENCE]], [[S_MOV_B32_]], 1024, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4)
   ; GFX8:   $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
   ; GFX8:   SI_RETURN_TO_EPILOG implicit $vgpr0
   %offset.base = add i32 %offset.v, %offset.s
@@ -4425,7 +4425,7 @@ define amdgpu_ps float @s_buffer_load_f32_offset_add_sgpr_vgpr_imm(<4 x i32> inr
   ; GFX6:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
   ; GFX6:   %9:vgpr_32, dead %17:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY6]], [[COPY4]], 0, implicit $exec
   ; GFX6:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
-  ; GFX6:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %9, [[REG_SEQUENCE]], [[S_MOV_B32_]], 1024, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4)
+  ; GFX6:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %9, [[REG_SEQUENCE]], [[S_MOV_B32_]], 1024, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4)
   ; GFX6:   $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
   ; GFX6:   SI_RETURN_TO_EPILOG implicit $vgpr0
   ; GFX7-LABEL: name: s_buffer_load_f32_offset_add_sgpr_vgpr_imm
@@ -4441,7 +4441,7 @@ define amdgpu_ps float @s_buffer_load_f32_offset_add_sgpr_vgpr_imm(<4 x i32> inr
   ; GFX7:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
   ; GFX7:   %9:vgpr_32, dead %17:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY6]], [[COPY4]], 0, implicit $exec
   ; GFX7:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
-  ; GFX7:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %9, [[REG_SEQUENCE]], [[S_MOV_B32_]], 1024, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4)
+  ; GFX7:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %9, [[REG_SEQUENCE]], [[S_MOV_B32_]], 1024, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4)
   ; GFX7:   $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
   ; GFX7:   SI_RETURN_TO_EPILOG implicit $vgpr0
   ; GFX8-LABEL: name: s_buffer_load_f32_offset_add_sgpr_vgpr_imm
@@ -4457,7 +4457,7 @@ define amdgpu_ps float @s_buffer_load_f32_offset_add_sgpr_vgpr_imm(<4 x i32> inr
   ; GFX8:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
   ; GFX8:   %9:vgpr_32, dead %17:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY6]], [[COPY4]], 0, implicit $exec
   ; GFX8:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
-  ; GFX8:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %9, [[REG_SEQUENCE]], [[S_MOV_B32_]], 1024, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4)
+  ; GFX8:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %9, [[REG_SEQUENCE]], [[S_MOV_B32_]], 1024, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4)
   ; GFX8:   $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
   ; GFX8:   SI_RETURN_TO_EPILOG implicit $vgpr0
   %offset.base = add i32 %offset.s, %offset.v
@@ -4480,7 +4480,7 @@ define amdgpu_ps float @s_buffer_load_f32_offset_add_imm_sgpr_vgpr(<4 x i32> inr
   ; GFX6:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; GFX6:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1024
   ; GFX6:   [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY5]], [[S_MOV_B32_]], implicit-def $scc
-  ; GFX6:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_ADD_I32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4)
+  ; GFX6:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_ADD_I32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4)
   ; GFX6:   $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
   ; GFX6:   SI_RETURN_TO_EPILOG implicit $vgpr0
   ; GFX7-LABEL: name: s_buffer_load_f32_offset_add_imm_sgpr_vgpr
@@ -4495,7 +4495,7 @@ define amdgpu_ps float @s_buffer_load_f32_offset_add_imm_sgpr_vgpr(<4 x i32> inr
   ; GFX7:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; GFX7:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1024
   ; GFX7:   [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY5]], [[S_MOV_B32_]], implicit-def $scc
-  ; GFX7:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_ADD_I32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4)
+  ; GFX7:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_ADD_I32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4)
   ; GFX7:   $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
   ; GFX7:   SI_RETURN_TO_EPILOG implicit $vgpr0
   ; GFX8-LABEL: name: s_buffer_load_f32_offset_add_imm_sgpr_vgpr
@@ -4510,7 +4510,7 @@ define amdgpu_ps float @s_buffer_load_f32_offset_add_imm_sgpr_vgpr(<4 x i32> inr
   ; GFX8:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; GFX8:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1024
   ; GFX8:   [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY5]], [[S_MOV_B32_]], implicit-def $scc
-  ; GFX8:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_ADD_I32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4)
+  ; GFX8:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_ADD_I32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4)
   ; GFX8:   $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
   ; GFX8:   SI_RETURN_TO_EPILOG implicit $vgpr0
   %offset.base = add i32 %offset.s, 1024
@@ -4533,7 +4533,7 @@ define amdgpu_ps float @s_buffer_load_f32_offset_add_imm_vgpr_sgpr(<4 x i32> inr
   ; GFX6:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1024
   ; GFX6:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
   ; GFX6:   %10:vgpr_32, dead %16:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY4]], [[COPY6]], 0, implicit $exec
-  ; GFX6:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %10, [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4)
+  ; GFX6:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %10, [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4)
   ; GFX6:   $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
   ; GFX6:   SI_RETURN_TO_EPILOG implicit $vgpr0
   ; GFX7-LABEL: name: s_buffer_load_f32_offset_add_imm_vgpr_sgpr
@@ -4549,7 +4549,7 @@ define amdgpu_ps float @s_buffer_load_f32_offset_add_imm_vgpr_sgpr(<4 x i32> inr
   ; GFX7:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1024
   ; GFX7:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
   ; GFX7:   %10:vgpr_32, dead %16:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY4]], [[COPY6]], 0, implicit $exec
-  ; GFX7:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %10, [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4)
+  ; GFX7:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %10, [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4)
   ; GFX7:   $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
   ; GFX7:   SI_RETURN_TO_EPILOG implicit $vgpr0
   ; GFX8-LABEL: name: s_buffer_load_f32_offset_add_imm_vgpr_sgpr
@@ -4565,7 +4565,7 @@ define amdgpu_ps float @s_buffer_load_f32_offset_add_imm_vgpr_sgpr(<4 x i32> inr
   ; GFX8:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1024
   ; GFX8:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
   ; GFX8:   %10:vgpr_32, dead %16:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY4]], [[COPY6]], 0, implicit $exec
-  ; GFX8:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %10, [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4)
+  ; GFX8:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %10, [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4)
   ; GFX8:   $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
   ; GFX8:   SI_RETURN_TO_EPILOG implicit $vgpr0
   %offset.base = add i32 %offset.v, 1024

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.fadd-with-ret.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.fadd-with-ret.ll
index 99dde6c4d583..3a31fad29010 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.fadd-with-ret.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.fadd-with-ret.ll
@@ -1,11 +1,25 @@
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX90A %s
 ; RUN: not --crash llc -global-isel < %s -march=amdgcn -mcpu=gfx908 -verify-machineinstrs 2>&1 | FileCheck %s -check-prefix=GFX908
 
-declare float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i32, i32 immarg) #0
+declare float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i32, i32 immarg)
+declare <2 x half> @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half>, <4 x i32>, i32, i32, i32, i32 immarg)
 
 ; GFX908: error: {{.*}} return versions of fp atomics not supported
 
-define amdgpu_ps float @buffer_atomic_add_f32_rtn(float %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
+; GFX90A-LABEL: {{^}}buffer_atomic_add_f32_rtn:
+; GFX90A: buffer_atomic_add_f32 v{{[0-9]+}}, v[{{[0-9:]+}}], s[{{[0-9:]+}}], s{{[0-9]+}} idxen offen glc
+define amdgpu_kernel void @buffer_atomic_add_f32_rtn(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset) {
 main_body:
   %ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
-  ret float %ret
+  store float %ret, float* undef
+  ret void
+}
+
+; GFX90A-LABEL: {{^}}buffer_atomic_add_v2f16_rtn:
+; GFX90A: buffer_atomic_pk_add_f16 v{{[0-9]+}}, v[{{[0-9:]+}}], s[{{[0-9:]+}}], s{{[0-9]+}} idxen offen glc
+define amdgpu_kernel void @buffer_atomic_add_v2f16_rtn(<2 x half> %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset) {
+main_body:
+  %ret = call <2 x half> @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
+  store <2 x half> %ret, <2 x half>* undef
+  ret void
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.fadd.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.fadd.ll
index 936af0f5a618..dbea49cb876e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.fadd.ll
@@ -1,255 +1,453 @@
 ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx908 -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx908 -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck %s -check-prefix=GFX908
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx90a -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck %s -check-prefix=GFX90A
 
 ; Natural mapping
 define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset(float %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
-  ; CHECK-LABEL: name: struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset
-  ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK:   liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2
-  ; CHECK:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-  ; CHECK:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2
-  ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
-  ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
-  ; CHECK:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
-  ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-  ; CHECK:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-  ; CHECK:   [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
-  ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1
-  ; CHECK:   BUFFER_ATOMIC_ADD_F32_BOTHEN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4)
-  ; CHECK:   S_ENDPGM 0
+  ; GFX908-LABEL: name: struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset
+  ; GFX908: bb.1 (%ir-block.0):
+  ; GFX908:   liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2
+  ; GFX908:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX908:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2
+  ; GFX908:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
+  ; GFX908:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
+  ; GFX908:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; GFX908:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX908:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+  ; GFX908:   [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
+  ; GFX908:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
+  ; GFX908:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1
+  ; GFX908:   BUFFER_ATOMIC_ADD_F32_BOTHEN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4)
+  ; GFX908:   S_ENDPGM 0
+  ; GFX90A-LABEL: name: struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset
+  ; GFX90A: bb.1 (%ir-block.0):
+  ; GFX90A:   liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2
+  ; GFX90A:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX90A:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2
+  ; GFX90A:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
+  ; GFX90A:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
+  ; GFX90A:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; GFX90A:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX90A:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+  ; GFX90A:   [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
+  ; GFX90A:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
+  ; GFX90A:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1
+  ; GFX90A:   [[BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 1, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4)
+  ; GFX90A:   S_ENDPGM 0
   %ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
   ret void
 }
 
 define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_plus4095__sgpr_soffset(float %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
-  ; CHECK-LABEL: name: struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_plus4095__sgpr_soffset
-  ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK:   liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2
-  ; CHECK:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-  ; CHECK:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2
-  ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
-  ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
-  ; CHECK:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
-  ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-  ; CHECK:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-  ; CHECK:   [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
-  ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1
-  ; CHECK:   BUFFER_ATOMIC_ADD_F32_BOTHEN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 4095, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource" + 4095, align 1, addrspace 4)
-  ; CHECK:   S_ENDPGM 0
+  ; GFX908-LABEL: name: struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_plus4095__sgpr_soffset
+  ; GFX908: bb.1 (%ir-block.0):
+  ; GFX908:   liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2
+  ; GFX908:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX908:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2
+  ; GFX908:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
+  ; GFX908:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
+  ; GFX908:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; GFX908:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX908:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+  ; GFX908:   [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
+  ; GFX908:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
+  ; GFX908:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1
+  ; GFX908:   BUFFER_ATOMIC_ADD_F32_BOTHEN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 4095, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource" + 4095, align 1, addrspace 4)
+  ; GFX908:   S_ENDPGM 0
+  ; GFX90A-LABEL: name: struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_plus4095__sgpr_soffset
+  ; GFX90A: bb.1 (%ir-block.0):
+  ; GFX90A:   liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2
+  ; GFX90A:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX90A:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2
+  ; GFX90A:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
+  ; GFX90A:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
+  ; GFX90A:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; GFX90A:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX90A:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+  ; GFX90A:   [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
+  ; GFX90A:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
+  ; GFX90A:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1
+  ; GFX90A:   [[BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 4095, 1, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource" + 4095, align 1, addrspace 4)
+  ; GFX90A:   S_ENDPGM 0
   %voffset.add = add i32 %voffset, 4095
   %ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset.add, i32 %soffset, i32 0)
   ret void
 }
 
 define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__4095_voffset__sgpr_soffset(float %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 inreg %soffset) {
-  ; CHECK-LABEL: name: struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__4095_voffset__sgpr_soffset
-  ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK:   liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
-  ; CHECK:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-  ; CHECK:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2
-  ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
-  ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
-  ; CHECK:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
-  ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-  ; CHECK:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
-  ; CHECK:   BUFFER_ATOMIC_ADD_F32_IDXEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource" + 4095, align 1, addrspace 4)
-  ; CHECK:   S_ENDPGM 0
+  ; GFX908-LABEL: name: struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__4095_voffset__sgpr_soffset
+  ; GFX908: bb.1 (%ir-block.0):
+  ; GFX908:   liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
+  ; GFX908:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX908:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2
+  ; GFX908:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
+  ; GFX908:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
+  ; GFX908:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; GFX908:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX908:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
+  ; GFX908:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
+  ; GFX908:   BUFFER_ATOMIC_ADD_F32_IDXEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource" + 4095, align 1, addrspace 4)
+  ; GFX908:   S_ENDPGM 0
+  ; GFX90A-LABEL: name: struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__4095_voffset__sgpr_soffset
+  ; GFX90A: bb.1 (%ir-block.0):
+  ; GFX90A:   liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
+  ; GFX90A:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX90A:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2
+  ; GFX90A:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
+  ; GFX90A:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
+  ; GFX90A:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; GFX90A:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX90A:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
+  ; GFX90A:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
+  ; GFX90A:   [[BUFFER_ATOMIC_ADD_F32_IDXEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_IDXEN_RTN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 1, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource" + 4095, align 1, addrspace 4)
+  ; GFX90A:   S_ENDPGM 0
   %ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 4095, i32 %soffset, i32 0)
   ret void
 }
 
 ; Natural mapping, no voffset
 define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset(float %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 inreg %soffset) {
-  ; CHECK-LABEL: name: struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset
-  ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK:   liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
-  ; CHECK:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-  ; CHECK:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2
-  ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
-  ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
-  ; CHECK:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
-  ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-  ; CHECK:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
-  ; CHECK:   BUFFER_ATOMIC_ADD_F32_IDXEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4)
-  ; CHECK:   S_ENDPGM 0
+  ; GFX908-LABEL: name: struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset
+  ; GFX908: bb.1 (%ir-block.0):
+  ; GFX908:   liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
+  ; GFX908:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX908:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2
+  ; GFX908:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
+  ; GFX908:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
+  ; GFX908:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; GFX908:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX908:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
+  ; GFX908:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
+  ; GFX908:   BUFFER_ATOMIC_ADD_F32_IDXEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4)
+  ; GFX908:   S_ENDPGM 0
+  ; GFX90A-LABEL: name: struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset
+  ; GFX90A: bb.1 (%ir-block.0):
+  ; GFX90A:   liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
+  ; GFX90A:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX90A:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2
+  ; GFX90A:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
+  ; GFX90A:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
+  ; GFX90A:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; GFX90A:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX90A:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
+  ; GFX90A:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
+  ; GFX90A:   [[BUFFER_ATOMIC_ADD_F32_IDXEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_IDXEN_RTN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4)
+  ; GFX90A:   S_ENDPGM 0
   %ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0)
   ret void
 }
 
 ; All register operands need legalization
 define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__sgpr_voffset__vgpr_soffset(float inreg %val, <4 x i32> %rsrc, i32 inreg %vindex, i32 inreg %voffset, i32 %soffset) {
-  ; CHECK-LABEL: name: struct_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__sgpr_voffset__vgpr_soffset
-  ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK:   successors: %bb.2(0x80000000)
-  ; CHECK:   liveins: $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
-  ; CHECK:   [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
-  ; CHECK:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-  ; CHECK:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-  ; CHECK:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-  ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr3
-  ; CHECK:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3
-  ; CHECK:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4
-  ; CHECK:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr4
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
-  ; CHECK:   [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
-  ; CHECK:   [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
-  ; CHECK:   [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY6]]
-  ; CHECK:   [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
-  ; CHECK:   [[COPY12:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
-  ; CHECK:   [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec
-  ; CHECK: bb.2:
-  ; CHECK:   successors: %bb.3(0x40000000), %bb.2(0x40000000)
-  ; CHECK:   [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY11]].sub0, implicit $exec
-  ; CHECK:   [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY11]].sub1, implicit $exec
-  ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1
-  ; CHECK:   [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[COPY11]], implicit $exec
-  ; CHECK:   [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY12]].sub0, implicit $exec
-  ; CHECK:   [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY12]].sub1, implicit $exec
-  ; CHECK:   [[REG_SEQUENCE2:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1
-  ; CHECK:   [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY12]], implicit $exec
-  ; CHECK:   [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc
-  ; CHECK:   [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
-  ; CHECK:   [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec
-  ; CHECK:   [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY7]], implicit $exec
-  ; CHECK:   [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc
-  ; CHECK:   [[REG_SEQUENCE4:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1
-  ; CHECK:   BUFFER_ATOMIC_ADD_F32_BOTHEN [[COPY8]], [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4)
-  ; CHECK:   [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec
-  ; CHECK:   $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
-  ; CHECK:   S_CBRANCH_EXECNZ %bb.2, implicit $exec
-  ; CHECK: bb.3:
-  ; CHECK:   successors: %bb.4(0x80000000)
-  ; CHECK:   $exec = S_MOV_B64_term [[S_MOV_B64_term]]
-  ; CHECK: bb.4:
-  ; CHECK:   S_ENDPGM 0
+  ; GFX908-LABEL: name: struct_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__sgpr_voffset__vgpr_soffset
+  ; GFX908: bb.1 (%ir-block.0):
+  ; GFX908:   successors: %bb.2(0x80000000)
+  ; GFX908:   liveins: $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+  ; GFX908:   [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+  ; GFX908:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX908:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX908:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+  ; GFX908:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+  ; GFX908:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3
+  ; GFX908:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4
+  ; GFX908:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+  ; GFX908:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
+  ; GFX908:   [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
+  ; GFX908:   [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
+  ; GFX908:   [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY6]]
+  ; GFX908:   [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
+  ; GFX908:   [[COPY12:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
+  ; GFX908:   [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec
+  ; GFX908: bb.2:
+  ; GFX908:   successors: %bb.3(0x40000000), %bb.2(0x40000000)
+  ; GFX908:   [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY11]].sub0, implicit $exec
+  ; GFX908:   [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY11]].sub1, implicit $exec
+  ; GFX908:   [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1
+  ; GFX908:   [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[COPY11]], implicit $exec
+  ; GFX908:   [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY12]].sub0, implicit $exec
+  ; GFX908:   [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY12]].sub1, implicit $exec
+  ; GFX908:   [[REG_SEQUENCE2:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1
+  ; GFX908:   [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY12]], implicit $exec
+  ; GFX908:   [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc
+  ; GFX908:   [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
+  ; GFX908:   [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec
+  ; GFX908:   [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY7]], implicit $exec
+  ; GFX908:   [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc
+  ; GFX908:   [[REG_SEQUENCE4:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1
+  ; GFX908:   BUFFER_ATOMIC_ADD_F32_BOTHEN [[COPY8]], [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4)
+  ; GFX908:   [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX908:   $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
+  ; GFX908:   S_CBRANCH_EXECNZ %bb.2, implicit $exec
+  ; GFX908: bb.3:
+  ; GFX908:   successors: %bb.4(0x80000000)
+  ; GFX908:   $exec = S_MOV_B64_term [[S_MOV_B64_term]]
+  ; GFX908: bb.4:
+  ; GFX908:   S_ENDPGM 0
+  ; GFX90A-LABEL: name: struct_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__sgpr_voffset__vgpr_soffset
+  ; GFX90A: bb.1 (%ir-block.0):
+  ; GFX90A:   successors: %bb.2(0x80000000)
+  ; GFX90A:   liveins: $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+  ; GFX90A:   [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+  ; GFX90A:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX90A:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX90A:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+  ; GFX90A:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+  ; GFX90A:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3
+  ; GFX90A:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4
+  ; GFX90A:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+  ; GFX90A:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
+  ; GFX90A:   [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
+  ; GFX90A:   [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
+  ; GFX90A:   [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY6]]
+  ; GFX90A:   [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
+  ; GFX90A:   [[COPY12:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
+  ; GFX90A:   [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec
+  ; GFX90A: bb.2:
+  ; GFX90A:   successors: %bb.3(0x40000000), %bb.2(0x40000000)
+  ; GFX90A:   [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY11]].sub0, implicit $exec
+  ; GFX90A:   [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY11]].sub1, implicit $exec
+  ; GFX90A:   [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1
+  ; GFX90A:   [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[COPY11]], implicit $exec
+  ; GFX90A:   [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY12]].sub0, implicit $exec
+  ; GFX90A:   [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY12]].sub1, implicit $exec
+  ; GFX90A:   [[REG_SEQUENCE2:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1
+  ; GFX90A:   [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY12]], implicit $exec
+  ; GFX90A:   [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc
+  ; GFX90A:   [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
+  ; GFX90A:   [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec
+  ; GFX90A:   [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY7]], implicit $exec
+  ; GFX90A:   [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc
+  ; GFX90A:   [[REG_SEQUENCE4:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1
+  ; GFX90A:   [[BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN [[COPY8]], [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 1, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4)
+  ; GFX90A:   [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX90A:   $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
+  ; GFX90A:   S_CBRANCH_EXECNZ %bb.2, implicit $exec
+  ; GFX90A: bb.3:
+  ; GFX90A:   successors: %bb.4(0x80000000)
+  ; GFX90A:   $exec = S_MOV_B64_term [[S_MOV_B64_term]]
+  ; GFX90A: bb.4:
+  ; GFX90A:   S_ENDPGM 0
   %ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
   ret void
 }
 
 ; All register operands need legalization, no voffset
 define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__0_voffset__vgpr_soffset(float inreg %val, <4 x i32> %rsrc, i32 inreg %vindex, i32 %soffset) {
-  ; CHECK-LABEL: name: struct_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__0_voffset__vgpr_soffset
-  ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK:   successors: %bb.2(0x80000000)
-  ; CHECK:   liveins: $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
-  ; CHECK:   [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
-  ; CHECK:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-  ; CHECK:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-  ; CHECK:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-  ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr3
-  ; CHECK:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3
-  ; CHECK:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr4
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
-  ; CHECK:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
-  ; CHECK:   [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
-  ; CHECK:   [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
-  ; CHECK:   [[COPY10:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
-  ; CHECK:   [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec
-  ; CHECK: bb.2:
-  ; CHECK:   successors: %bb.3(0x40000000), %bb.2(0x40000000)
-  ; CHECK:   [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]].sub0, implicit $exec
-  ; CHECK:   [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]].sub1, implicit $exec
-  ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1
-  ; CHECK:   [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[COPY9]], implicit $exec
-  ; CHECK:   [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY10]].sub0, implicit $exec
-  ; CHECK:   [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY10]].sub1, implicit $exec
-  ; CHECK:   [[REG_SEQUENCE2:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1
-  ; CHECK:   [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY10]], implicit $exec
-  ; CHECK:   [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc
-  ; CHECK:   [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
-  ; CHECK:   [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec
-  ; CHECK:   [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec
-  ; CHECK:   [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc
-  ; CHECK:   BUFFER_ATOMIC_ADD_F32_IDXEN [[COPY7]], [[COPY8]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4)
-  ; CHECK:   [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec
-  ; CHECK:   $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
-  ; CHECK:   S_CBRANCH_EXECNZ %bb.2, implicit $exec
-  ; CHECK: bb.3:
-  ; CHECK:   successors: %bb.4(0x80000000)
-  ; CHECK:   $exec = S_MOV_B64_term [[S_MOV_B64_term]]
-  ; CHECK: bb.4:
-  ; CHECK:   S_ENDPGM 0
+  ; GFX908-LABEL: name: struct_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__0_voffset__vgpr_soffset
+  ; GFX908: bb.1 (%ir-block.0):
+  ; GFX908:   successors: %bb.2(0x80000000)
+  ; GFX908:   liveins: $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+  ; GFX908:   [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+  ; GFX908:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX908:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX908:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+  ; GFX908:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+  ; GFX908:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3
+  ; GFX908:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+  ; GFX908:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
+  ; GFX908:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
+  ; GFX908:   [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
+  ; GFX908:   [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
+  ; GFX908:   [[COPY10:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
+  ; GFX908:   [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec
+  ; GFX908: bb.2:
+  ; GFX908:   successors: %bb.3(0x40000000), %bb.2(0x40000000)
+  ; GFX908:   [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]].sub0, implicit $exec
+  ; GFX908:   [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]].sub1, implicit $exec
+  ; GFX908:   [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1
+  ; GFX908:   [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[COPY9]], implicit $exec
+  ; GFX908:   [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY10]].sub0, implicit $exec
+  ; GFX908:   [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY10]].sub1, implicit $exec
+  ; GFX908:   [[REG_SEQUENCE2:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1
+  ; GFX908:   [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY10]], implicit $exec
+  ; GFX908:   [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc
+  ; GFX908:   [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
+  ; GFX908:   [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec
+  ; GFX908:   [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec
+  ; GFX908:   [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc
+  ; GFX908:   BUFFER_ATOMIC_ADD_F32_IDXEN [[COPY7]], [[COPY8]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4)
+  ; GFX908:   [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX908:   $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
+  ; GFX908:   S_CBRANCH_EXECNZ %bb.2, implicit $exec
+  ; GFX908: bb.3:
+  ; GFX908:   successors: %bb.4(0x80000000)
+  ; GFX908:   $exec = S_MOV_B64_term [[S_MOV_B64_term]]
+  ; GFX908: bb.4:
+  ; GFX908:   S_ENDPGM 0
+  ; GFX90A-LABEL: name: struct_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__0_voffset__vgpr_soffset
+  ; GFX90A: bb.1 (%ir-block.0):
+  ; GFX90A:   successors: %bb.2(0x80000000)
+  ; GFX90A:   liveins: $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+  ; GFX90A:   [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+  ; GFX90A:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX90A:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX90A:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+  ; GFX90A:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+  ; GFX90A:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3
+  ; GFX90A:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+  ; GFX90A:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
+  ; GFX90A:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
+  ; GFX90A:   [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
+  ; GFX90A:   [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
+  ; GFX90A:   [[COPY10:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
+  ; GFX90A:   [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec
+  ; GFX90A: bb.2:
+  ; GFX90A:   successors: %bb.3(0x40000000), %bb.2(0x40000000)
+  ; GFX90A:   [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]].sub0, implicit $exec
+  ; GFX90A:   [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]].sub1, implicit $exec
+  ; GFX90A:   [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1
+  ; GFX90A:   [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[COPY9]], implicit $exec
+  ; GFX90A:   [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY10]].sub0, implicit $exec
+  ; GFX90A:   [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY10]].sub1, implicit $exec
+  ; GFX90A:   [[REG_SEQUENCE2:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1
+  ; GFX90A:   [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY10]], implicit $exec
+  ; GFX90A:   [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc
+  ; GFX90A:   [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
+  ; GFX90A:   [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec
+  ; GFX90A:   [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec
+  ; GFX90A:   [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc
+  ; GFX90A:   [[BUFFER_ATOMIC_ADD_F32_IDXEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_IDXEN_RTN [[COPY7]], [[COPY8]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 1, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4)
+  ; GFX90A:   [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX90A:   $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
+  ; GFX90A:   S_CBRANCH_EXECNZ %bb.2, implicit $exec
+  ; GFX90A: bb.3:
+  ; GFX90A:   successors: %bb.4(0x80000000)
+  ; GFX90A:   $exec = S_MOV_B64_term [[S_MOV_B64_term]]
+  ; GFX90A: bb.4:
+  ; GFX90A:   S_ENDPGM 0
   %ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0)
   ret void
 }
 
 ; Natural mapping + slc
 define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc(float %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
-  ; CHECK-LABEL: name: struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc
-  ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK:   liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2
-  ; CHECK:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-  ; CHECK:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2
-  ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
-  ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
-  ; CHECK:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
-  ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-  ; CHECK:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-  ; CHECK:   [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
-  ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1
-  ; CHECK:   BUFFER_ATOMIC_ADD_F32_BOTHEN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 1, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4)
-  ; CHECK:   S_ENDPGM 0
+  ; GFX908-LABEL: name: struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc
+  ; GFX908: bb.1 (%ir-block.0):
+  ; GFX908:   liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2
+  ; GFX908:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX908:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2
+  ; GFX908:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
+  ; GFX908:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
+  ; GFX908:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; GFX908:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX908:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+  ; GFX908:   [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
+  ; GFX908:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
+  ; GFX908:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1
+  ; GFX908:   BUFFER_ATOMIC_ADD_F32_BOTHEN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 1, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4)
+  ; GFX908:   S_ENDPGM 0
+  ; GFX90A-LABEL: name: struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc
+  ; GFX90A: bb.1 (%ir-block.0):
+  ; GFX90A:   liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2
+  ; GFX90A:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX90A:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2
+  ; GFX90A:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
+  ; GFX90A:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
+  ; GFX90A:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; GFX90A:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX90A:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+  ; GFX90A:   [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
+  ; GFX90A:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
+  ; GFX90A:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1
+  ; GFX90A:   [[BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 1, 1, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4)
+  ; GFX90A:   S_ENDPGM 0
   %ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2)
   ret void
 }
 
 define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset_slc(float %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 inreg %soffset) {
-  ; CHECK-LABEL: name: struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset_slc
-  ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK:   liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
-  ; CHECK:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-  ; CHECK:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2
-  ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
-  ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
-  ; CHECK:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
-  ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-  ; CHECK:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
-  ; CHECK:   BUFFER_ATOMIC_ADD_F32_IDXEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4)
-  ; CHECK:   S_ENDPGM 0
+  ; GFX908-LABEL: name: struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset_slc
+  ; GFX908: bb.1 (%ir-block.0):
+  ; GFX908:   liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
+  ; GFX908:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX908:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2
+  ; GFX908:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
+  ; GFX908:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
+  ; GFX908:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; GFX908:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX908:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
+  ; GFX908:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
+  ; GFX908:   BUFFER_ATOMIC_ADD_F32_IDXEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4)
+  ; GFX908:   S_ENDPGM 0
+  ; GFX90A-LABEL: name: struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset_slc
+  ; GFX90A: bb.1 (%ir-block.0):
+  ; GFX90A:   liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
+  ; GFX90A:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX90A:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2
+  ; GFX90A:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
+  ; GFX90A:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
+  ; GFX90A:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; GFX90A:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX90A:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
+  ; GFX90A:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
+  ; GFX90A:   [[BUFFER_ATOMIC_ADD_F32_IDXEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_IDXEN_RTN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, 1, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4)
+  ; GFX90A:   S_ENDPGM 0
   %ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 2)
   ret void
 }
 
 define amdgpu_ps void @struct_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<2 x half> %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
-  ; CHECK-LABEL: name: struct_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset
-  ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK:   liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2
-  ; CHECK:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-  ; CHECK:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2
-  ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
-  ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
-  ; CHECK:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
-  ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-  ; CHECK:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-  ; CHECK:   [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
-  ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1
-  ; CHECK:   BUFFER_ATOMIC_PK_ADD_F16_BOTHEN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4)
-  ; CHECK:   S_ENDPGM 0
+  ; GFX908-LABEL: name: struct_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset
+  ; GFX908: bb.1 (%ir-block.0):
+  ; GFX908:   liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2
+  ; GFX908:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX908:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2
+  ; GFX908:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
+  ; GFX908:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
+  ; GFX908:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; GFX908:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX908:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+  ; GFX908:   [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
+  ; GFX908:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
+  ; GFX908:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1
+  ; GFX908:   BUFFER_ATOMIC_PK_ADD_F16_BOTHEN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4)
+  ; GFX908:   S_ENDPGM 0
+  ; GFX90A-LABEL: name: struct_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset
+  ; GFX90A: bb.1 (%ir-block.0):
+  ; GFX90A:   liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2
+  ; GFX90A:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX90A:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2
+  ; GFX90A:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
+  ; GFX90A:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
+  ; GFX90A:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; GFX90A:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX90A:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+  ; GFX90A:   [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
+  ; GFX90A:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
+  ; GFX90A:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1
+  ; GFX90A:   [[BUFFER_ATOMIC_PK_ADD_F16_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_PK_ADD_F16_BOTHEN_RTN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 1, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4)
+  ; GFX90A:   S_ENDPGM 0
   %ret = call <2 x half> @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
   ret void
 }
 
 define amdgpu_ps void @struct_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset(<2 x half> %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 inreg %soffset) {
-  ; CHECK-LABEL: name: struct_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset
-  ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK:   liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
-  ; CHECK:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-  ; CHECK:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2
-  ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
-  ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
-  ; CHECK:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
-  ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-  ; CHECK:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
-  ; CHECK:   BUFFER_ATOMIC_PK_ADD_F16_IDXEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4)
-  ; CHECK:   S_ENDPGM 0
+  ; GFX908-LABEL: name: struct_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset
+  ; GFX908: bb.1 (%ir-block.0):
+  ; GFX908:   liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
+  ; GFX908:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX908:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2
+  ; GFX908:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
+  ; GFX908:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
+  ; GFX908:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; GFX908:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX908:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
+  ; GFX908:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
+  ; GFX908:   BUFFER_ATOMIC_PK_ADD_F16_IDXEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4)
+  ; GFX908:   S_ENDPGM 0
+  ; GFX90A-LABEL: name: struct_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset
+  ; GFX90A: bb.1 (%ir-block.0):
+  ; GFX90A:   liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
+  ; GFX90A:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX90A:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2
+  ; GFX90A:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
+  ; GFX90A:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
+  ; GFX90A:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; GFX90A:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX90A:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
+  ; GFX90A:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
+  ; GFX90A:   [[BUFFER_ATOMIC_PK_ADD_F16_IDXEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_PK_ADD_F16_IDXEN_RTN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4)
+  ; GFX90A:   S_ENDPGM 0
   %ret = call <2 x half> @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0)
   ret void
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.format.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.format.f16.ll
index 746829b9da28..88d6711ddcd1 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.format.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.format.f16.ll
@@ -15,7 +15,7 @@ define amdgpu_ps half @struct_buffer_load_format_f16__sgpr_rsrc__vgpr_vindex__vg
   ; UNPACKED:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; UNPACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; UNPACKED:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
-  ; UNPACKED:   [[BUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource", align 1, addrspace 4)
+  ; UNPACKED:   [[BUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource", align 1, addrspace 4)
   ; UNPACKED:   $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN]]
   ; UNPACKED:   SI_RETURN_TO_EPILOG implicit $vgpr0
   ; PACKED-LABEL: name: struct_buffer_load_format_f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset
@@ -30,7 +30,7 @@ define amdgpu_ps half @struct_buffer_load_format_f16__sgpr_rsrc__vgpr_vindex__vg
   ; PACKED:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; PACKED:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
-  ; PACKED:   [[BUFFER_LOAD_FORMAT_D16_X_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_D16_X_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource", align 1, addrspace 4)
+  ; PACKED:   [[BUFFER_LOAD_FORMAT_D16_X_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_D16_X_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource", align 1, addrspace 4)
   ; PACKED:   $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_D16_X_BOTHEN]]
   ; PACKED:   SI_RETURN_TO_EPILOG implicit $vgpr0
   %val = call half @llvm.amdgcn.struct.buffer.load.format.f16(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
@@ -50,7 +50,7 @@ define amdgpu_ps <2 x half> @struct_buffer_load_format_v2f16__sgpr_rsrc__vgpr_vi
   ; UNPACKED:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; UNPACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; UNPACKED:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
-  ; UNPACKED:   [[BUFFER_LOAD_FORMAT_D16_XY_gfx80_BOTHEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_FORMAT_D16_XY_gfx80_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
+  ; UNPACKED:   [[BUFFER_LOAD_FORMAT_D16_XY_gfx80_BOTHEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_FORMAT_D16_XY_gfx80_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
   ; UNPACKED:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XY_gfx80_BOTHEN]].sub0
   ; UNPACKED:   [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XY_gfx80_BOTHEN]].sub1
   ; UNPACKED:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
@@ -76,7 +76,7 @@ define amdgpu_ps <2 x half> @struct_buffer_load_format_v2f16__sgpr_rsrc__vgpr_vi
   ; PACKED:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; PACKED:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
-  ; PACKED:   [[BUFFER_LOAD_FORMAT_D16_XY_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_D16_XY_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
+  ; PACKED:   [[BUFFER_LOAD_FORMAT_D16_XY_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_D16_XY_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
   ; PACKED:   $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_D16_XY_BOTHEN]]
   ; PACKED:   SI_RETURN_TO_EPILOG implicit $vgpr0
   %val = call <2 x half> @llvm.amdgcn.struct.buffer.load.format.v2f16(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
@@ -102,7 +102,7 @@ define amdgpu_ps <4 x half> @struct_buffer_load_format_v4f16__sgpr_rsrc__vgpr_vi
   ; UNPACKED:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; UNPACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; UNPACKED:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
-  ; UNPACKED:   [[BUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8 from custom "BufferResource", align 1, addrspace 4)
+  ; UNPACKED:   [[BUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8 from custom "BufferResource", align 1, addrspace 4)
   ; UNPACKED:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN]].sub0
   ; UNPACKED:   [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN]].sub1
   ; UNPACKED:   [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN]].sub2
@@ -138,7 +138,7 @@ define amdgpu_ps <4 x half> @struct_buffer_load_format_v4f16__sgpr_rsrc__vgpr_vi
   ; PACKED:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; PACKED:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
-  ; PACKED:   [[BUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8 from custom "BufferResource", align 1, addrspace 4)
+  ; PACKED:   [[BUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8 from custom "BufferResource", align 1, addrspace 4)
   ; PACKED:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN]].sub0
   ; PACKED:   [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN]].sub1
   ; PACKED:   $vgpr0 = COPY [[COPY7]]
@@ -183,7 +183,7 @@ define amdgpu_ps <4 x half> @struct_buffer_load_format_v4f16__vpr_rsrc__sgpr_vin
   ; UNPACKED:   [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec
   ; UNPACKED:   [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc
   ; UNPACKED:   [[REG_SEQUENCE4:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY8]], %subreg.sub1
-  ; UNPACKED:   [[BUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8 from custom "BufferResource", align 1, addrspace 4)
+  ; UNPACKED:   [[BUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8 from custom "BufferResource", align 1, addrspace 4)
   ; UNPACKED:   [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec
   ; UNPACKED:   $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
   ; UNPACKED:   S_CBRANCH_EXECNZ %bb.2, implicit $exec
@@ -247,7 +247,7 @@ define amdgpu_ps <4 x half> @struct_buffer_load_format_v4f16__vpr_rsrc__sgpr_vin
   ; PACKED:   [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec
   ; PACKED:   [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc
   ; PACKED:   [[REG_SEQUENCE4:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY8]], %subreg.sub1
-  ; PACKED:   [[BUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8 from custom "BufferResource", align 1, addrspace 4)
+  ; PACKED:   [[BUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8 from custom "BufferResource", align 1, addrspace 4)
   ; PACKED:   [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec
   ; PACKED:   $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
   ; PACKED:   S_CBRANCH_EXECNZ %bb.2, implicit $exec
@@ -277,7 +277,7 @@ define amdgpu_ps half @struct_buffer_load_format_f16__sgpr_rsrc__vgpr_vindex__vg
   ; UNPACKED:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; UNPACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; UNPACKED:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
-  ; UNPACKED:   [[BUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource" + 4095, align 1, addrspace 4)
+  ; UNPACKED:   [[BUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource" + 4095, align 1, addrspace 4)
   ; UNPACKED:   $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN]]
   ; UNPACKED:   SI_RETURN_TO_EPILOG implicit $vgpr0
   ; PACKED-LABEL: name: struct_buffer_load_format_f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset_voffsset_add_4095
@@ -292,7 +292,7 @@ define amdgpu_ps half @struct_buffer_load_format_f16__sgpr_rsrc__vgpr_vindex__vg
   ; PACKED:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; PACKED:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
-  ; PACKED:   [[BUFFER_LOAD_FORMAT_D16_X_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_D16_X_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource" + 4095, align 1, addrspace 4)
+  ; PACKED:   [[BUFFER_LOAD_FORMAT_D16_X_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_D16_X_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource" + 4095, align 1, addrspace 4)
   ; PACKED:   $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_D16_X_BOTHEN]]
   ; PACKED:   SI_RETURN_TO_EPILOG implicit $vgpr0
   %voffset = add i32 %voffset.base, 4095
@@ -313,7 +313,7 @@ define amdgpu_ps half @struct_buffer_load_format_i16__sgpr_rsrc__vgpr_vindex__vg
   ; UNPACKED:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; UNPACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; UNPACKED:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
-  ; UNPACKED:   [[BUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource", align 1, addrspace 4)
+  ; UNPACKED:   [[BUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource", align 1, addrspace 4)
   ; UNPACKED:   $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN]]
   ; UNPACKED:   SI_RETURN_TO_EPILOG implicit $vgpr0
   ; PACKED-LABEL: name: struct_buffer_load_format_i16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset
@@ -328,7 +328,7 @@ define amdgpu_ps half @struct_buffer_load_format_i16__sgpr_rsrc__vgpr_vindex__vg
   ; PACKED:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; PACKED:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
-  ; PACKED:   [[BUFFER_LOAD_FORMAT_D16_X_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_D16_X_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource", align 1, addrspace 4)
+  ; PACKED:   [[BUFFER_LOAD_FORMAT_D16_X_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_D16_X_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource", align 1, addrspace 4)
   ; PACKED:   $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_D16_X_BOTHEN]]
   ; PACKED:   SI_RETURN_TO_EPILOG implicit $vgpr0
   %val = call i16 @llvm.amdgcn.struct.buffer.load.format.i16(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.format.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.format.ll
index a4636b71d086..1c225364bbae 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.format.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.format.ll
@@ -15,7 +15,7 @@ define amdgpu_ps float @struct_buffer_load_format_f32__sgpr_rsrc__vgpr_vindex__v
   ; CHECK:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
-  ; CHECK:   [[BUFFER_LOAD_FORMAT_X_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:   [[BUFFER_LOAD_FORMAT_X_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_X_BOTHEN]]
   ; CHECK:   SI_RETURN_TO_EPILOG implicit $vgpr0
   %val = call float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
@@ -35,7 +35,7 @@ define amdgpu_ps <2 x float> @struct_buffer_load_format_v2f32__sgpr_rsrc__vgpr_v
   ; CHECK:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
-  ; CHECK:   [[BUFFER_LOAD_FORMAT_XY_BOTHEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_FORMAT_XY_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8 from custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:   [[BUFFER_LOAD_FORMAT_XY_BOTHEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_FORMAT_XY_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8 from custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XY_BOTHEN]].sub0
   ; CHECK:   [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XY_BOTHEN]].sub1
   ; CHECK:   $vgpr0 = COPY [[COPY7]]
@@ -58,7 +58,7 @@ define amdgpu_ps <3 x float> @struct_buffer_load_format_v3f32__sgpr_rsrc__vgpr_v
   ; CHECK:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
-  ; CHECK:   [[BUFFER_LOAD_FORMAT_XYZ_BOTHEN:%[0-9]+]]:vreg_96 = BUFFER_LOAD_FORMAT_XYZ_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12 from custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:   [[BUFFER_LOAD_FORMAT_XYZ_BOTHEN:%[0-9]+]]:vreg_96 = BUFFER_LOAD_FORMAT_XYZ_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12 from custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZ_BOTHEN]].sub0
   ; CHECK:   [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZ_BOTHEN]].sub1
   ; CHECK:   [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZ_BOTHEN]].sub2
@@ -83,7 +83,7 @@ define amdgpu_ps <4 x float> @struct_buffer_load_format_v4f32__sgpr_rsrc__vgpr_v
   ; CHECK:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
-  ; CHECK:   [[BUFFER_LOAD_FORMAT_XYZW_BOTHEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:   [[BUFFER_LOAD_FORMAT_XYZW_BOTHEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_BOTHEN]].sub0
   ; CHECK:   [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_BOTHEN]].sub1
   ; CHECK:   [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_BOTHEN]].sub2
@@ -132,7 +132,7 @@ define amdgpu_ps <4 x float> @struct_buffer_load_format_v4f32__vpr_rsrc__sgpr_vi
   ; CHECK:   [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec
   ; CHECK:   [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc
   ; CHECK:   [[REG_SEQUENCE4:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY8]], %subreg.sub1
-  ; CHECK:   [[BUFFER_LOAD_FORMAT_XYZW_BOTHEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_BOTHEN [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:   [[BUFFER_LOAD_FORMAT_XYZW_BOTHEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_BOTHEN [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec
   ; CHECK:   $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
   ; CHECK:   S_CBRANCH_EXECNZ %bb.2, implicit $exec
@@ -166,7 +166,7 @@ define amdgpu_ps float @struct_buffer_load_format_f32__sgpr_rsrc__vgpr_vindex__v
   ; CHECK:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
-  ; CHECK:   [[BUFFER_LOAD_FORMAT_X_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource" + 4095, align 1, addrspace 4)
+  ; CHECK:   [[BUFFER_LOAD_FORMAT_X_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource" + 4095, align 1, addrspace 4)
   ; CHECK:   $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_X_BOTHEN]]
   ; CHECK:   SI_RETURN_TO_EPILOG implicit $vgpr0
   %voffset = add i32 %voffset.base, 4095
@@ -187,7 +187,7 @@ define amdgpu_ps float @struct_buffer_load_format_i32__sgpr_rsrc__vgpr_vindex__v
   ; CHECK:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
-  ; CHECK:   [[BUFFER_LOAD_FORMAT_X_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:   [[BUFFER_LOAD_FORMAT_X_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_X_BOTHEN]]
   ; CHECK:   SI_RETURN_TO_EPILOG implicit $vgpr0
   %val = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.ll
index a455c3e19fd4..e736692d6086 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.ll
@@ -16,7 +16,7 @@ define amdgpu_ps float @struct_buffer_load_f32__sgpr_rsrc__vgpr_vindex__vgpr_vof
   ; CHECK:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
-  ; CHECK:   [[BUFFER_LOAD_DWORD_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:   [[BUFFER_LOAD_DWORD_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   $vgpr0 = COPY [[BUFFER_LOAD_DWORD_BOTHEN]]
   ; CHECK:   SI_RETURN_TO_EPILOG implicit $vgpr0
   %val = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
@@ -37,7 +37,7 @@ define amdgpu_ps <2 x float> @struct_buffer_load_v2f32__sgpr_rsrc__vgpr_vindex__
   ; CHECK:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
-  ; CHECK:   [[BUFFER_LOAD_DWORDX2_BOTHEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8 from custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:   [[BUFFER_LOAD_DWORDX2_BOTHEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8 from custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_BOTHEN]].sub0
   ; CHECK:   [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_BOTHEN]].sub1
   ; CHECK:   $vgpr0 = COPY [[COPY7]]
@@ -61,7 +61,7 @@ define amdgpu_ps <3 x float> @struct_buffer_load_v3f32__sgpr_rsrc__vgpr_vindex__
   ; CHECK:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
-  ; CHECK:   [[BUFFER_LOAD_DWORDX3_BOTHEN:%[0-9]+]]:vreg_96 = BUFFER_LOAD_DWORDX3_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12 from custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:   [[BUFFER_LOAD_DWORDX3_BOTHEN:%[0-9]+]]:vreg_96 = BUFFER_LOAD_DWORDX3_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12 from custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_BOTHEN]].sub0
   ; CHECK:   [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_BOTHEN]].sub1
   ; CHECK:   [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_BOTHEN]].sub2
@@ -87,7 +87,7 @@ define amdgpu_ps <4 x float> @struct_buffer_load_v4f32__sgpr_rsrc__vgpr_vindex__
   ; CHECK:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
-  ; CHECK:   [[BUFFER_LOAD_DWORDX4_BOTHEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:   [[BUFFER_LOAD_DWORDX4_BOTHEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_BOTHEN]].sub0
   ; CHECK:   [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_BOTHEN]].sub1
   ; CHECK:   [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_BOTHEN]].sub2
@@ -116,7 +116,7 @@ define amdgpu_ps float @struct_buffer_load_f32__sgpr_rsrc__vgpr_vindex__vgpr_vof
   ; CHECK:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
   ; CHECK:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
   ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY4]], %subreg.sub1
-  ; CHECK:   [[BUFFER_LOAD_DWORD_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:   [[BUFFER_LOAD_DWORD_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   $vgpr0 = COPY [[BUFFER_LOAD_DWORD_BOTHEN]]
   ; CHECK:   SI_RETURN_TO_EPILOG implicit $vgpr0
   %val = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 %voffset, i32 %soffset, i32 0)
@@ -137,7 +137,7 @@ define amdgpu_ps float @struct_buffer_load_f32__sgpr_rsrc__vgpr_vindex__vgpr_vof
   ; CHECK:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
-  ; CHECK:   [[BUFFER_LOAD_DWORD_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource" + 4095, align 1, addrspace 4)
+  ; CHECK:   [[BUFFER_LOAD_DWORD_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource" + 4095, align 1, addrspace 4)
   ; CHECK:   $vgpr0 = COPY [[BUFFER_LOAD_DWORD_BOTHEN]]
   ; CHECK:   SI_RETURN_TO_EPILOG implicit $vgpr0
   %voffset = add i32 %voffset.base, 4095
@@ -158,7 +158,7 @@ define amdgpu_ps float @struct_buffer_load_f32__sgpr_rsrc__vgpr_vindex__vgpr_vof
   ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 64
   ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
-  ; CHECK:   [[BUFFER_LOAD_DWORD_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:   [[BUFFER_LOAD_DWORD_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   $vgpr0 = COPY [[BUFFER_LOAD_DWORD_BOTHEN]]
   ; CHECK:   SI_RETURN_TO_EPILOG implicit $vgpr0
   %val = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 64, i32 0)
@@ -200,7 +200,7 @@ define amdgpu_ps float @struct_buffer_load_f32__vgpr_rsrc__sgpr_vindex__sgpr_vof
   ; CHECK:   [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec
   ; CHECK:   [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc
   ; CHECK:   [[REG_SEQUENCE4:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY8]], %subreg.sub1
-  ; CHECK:   [[BUFFER_LOAD_DWORD_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_BOTHEN [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:   [[BUFFER_LOAD_DWORD_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_BOTHEN [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec
   ; CHECK:   $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
   ; CHECK:   S_CBRANCH_EXECNZ %bb.2, implicit $exec
@@ -227,7 +227,7 @@ define amdgpu_ps float @struct_buffer_load_i8_zext__sgpr_rsrc__vgpr_vindex__vgpr
   ; CHECK:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
-  ; CHECK:   [[BUFFER_LOAD_UBYTE_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 1 from custom "BufferResource", addrspace 4)
+  ; CHECK:   [[BUFFER_LOAD_UBYTE_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 1 from custom "BufferResource", addrspace 4)
   ; CHECK:   $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_BOTHEN]]
   ; CHECK:   SI_RETURN_TO_EPILOG implicit $vgpr0
   %val = call i8 @llvm.amdgcn.struct.buffer.load.i8(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
@@ -249,7 +249,7 @@ define amdgpu_ps float @struct_buffer_load_i8_sext__sgpr_rsrc__vgpr_vindex__vgpr
   ; CHECK:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
-  ; CHECK:   [[BUFFER_LOAD_UBYTE_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 1 from custom "BufferResource", addrspace 4)
+  ; CHECK:   [[BUFFER_LOAD_UBYTE_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 1 from custom "BufferResource", addrspace 4)
   ; CHECK:   [[V_BFE_I32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_I32_e64 [[BUFFER_LOAD_UBYTE_BOTHEN]], 0, 8, implicit $exec
   ; CHECK:   $vgpr0 = COPY [[V_BFE_I32_e64_]]
   ; CHECK:   SI_RETURN_TO_EPILOG implicit $vgpr0
@@ -272,7 +272,7 @@ define amdgpu_ps float @struct_buffer_load_i16_zext__sgpr_rsrc__vgpr_vindex__vgp
   ; CHECK:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
-  ; CHECK:   [[BUFFER_LOAD_USHORT_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:   [[BUFFER_LOAD_USHORT_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   $vgpr0 = COPY [[BUFFER_LOAD_USHORT_BOTHEN]]
   ; CHECK:   SI_RETURN_TO_EPILOG implicit $vgpr0
   %val = call i16 @llvm.amdgcn.struct.buffer.load.i16(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
@@ -294,7 +294,7 @@ define amdgpu_ps float @struct_buffer_load_i16_sext__sgpr_rsrc__vgpr_vindex__vgp
   ; CHECK:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
-  ; CHECK:   [[BUFFER_LOAD_USHORT_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:   [[BUFFER_LOAD_USHORT_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   [[V_BFE_I32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_I32_e64 [[BUFFER_LOAD_USHORT_BOTHEN]], 0, 16, implicit $exec
   ; CHECK:   $vgpr0 = COPY [[V_BFE_I32_e64_]]
   ; CHECK:   SI_RETURN_TO_EPILOG implicit $vgpr0
@@ -318,7 +318,7 @@ define amdgpu_ps half @struct_buffer_load_f16__sgpr_rsrc__vgpr_vindex__vgpr_voff
   ; CHECK:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
-  ; CHECK:   [[BUFFER_LOAD_USHORT_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:   [[BUFFER_LOAD_USHORT_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   $vgpr0 = COPY [[BUFFER_LOAD_USHORT_BOTHEN]]
   ; CHECK:   SI_RETURN_TO_EPILOG implicit $vgpr0
   %val = call half @llvm.amdgcn.struct.buffer.load.f16(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
@@ -339,7 +339,7 @@ define amdgpu_ps <2 x half> @struct_buffer_load_v2f16__sgpr_rsrc__vgpr_vindex__v
   ; CHECK:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
-  ; CHECK:   [[BUFFER_LOAD_DWORD_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:   [[BUFFER_LOAD_DWORD_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   $vgpr0 = COPY [[BUFFER_LOAD_DWORD_BOTHEN]]
   ; CHECK:   SI_RETURN_TO_EPILOG implicit $vgpr0
   %val = call <2 x half> @llvm.amdgcn.struct.buffer.load.v2f16(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
@@ -366,7 +366,7 @@ define amdgpu_ps <4 x half> @struct_buffer_load_v4f16__sgpr_rsrc__vgpr_vindex__v
   ; CHECK:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
-  ; CHECK:   [[BUFFER_LOAD_DWORDX2_BOTHEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8 from custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:   [[BUFFER_LOAD_DWORDX2_BOTHEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8 from custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_BOTHEN]].sub0
   ; CHECK:   [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_BOTHEN]].sub1
   ; CHECK:   $vgpr0 = COPY [[COPY7]]
@@ -390,7 +390,7 @@ define amdgpu_ps float @struct_buffer_load_f32__sgpr_rsrc__vgpr_vindex__vgpr_vof
   ; CHECK:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
-  ; CHECK:   [[BUFFER_LOAD_DWORD_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:   [[BUFFER_LOAD_DWORD_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   $vgpr0 = COPY [[BUFFER_LOAD_DWORD_BOTHEN]]
   ; CHECK:   SI_RETURN_TO_EPILOG implicit $vgpr0
   %val = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 1)

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.format.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.format.f16.ll
index 3dceb920d81a..2276bdde843f 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.format.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.format.f16.ll
@@ -16,7 +16,7 @@ define amdgpu_ps void @struct_buffer_store_format_f16__vgpr_val__sgpr_rsrc__vgpr
   ; UNPACKED:   [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; UNPACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; UNPACKED:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1
-  ; UNPACKED:   BUFFER_STORE_FORMAT_D16_X_gfx80_BOTHEN_exact [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "BufferResource", align 1, addrspace 4)
+  ; UNPACKED:   BUFFER_STORE_FORMAT_D16_X_gfx80_BOTHEN_exact [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "BufferResource", align 1, addrspace 4)
   ; UNPACKED:   S_ENDPGM 0
   ; PACKED-LABEL: name: struct_buffer_store_format_f16__vgpr_val__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset
   ; PACKED: bb.1 (%ir-block.0):
@@ -31,7 +31,7 @@ define amdgpu_ps void @struct_buffer_store_format_f16__vgpr_val__sgpr_rsrc__vgpr
   ; PACKED:   [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; PACKED:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1
-  ; PACKED:   BUFFER_STORE_FORMAT_D16_X_BOTHEN_exact [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "BufferResource", align 1, addrspace 4)
+  ; PACKED:   BUFFER_STORE_FORMAT_D16_X_BOTHEN_exact [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "BufferResource", align 1, addrspace 4)
   ; PACKED:   S_ENDPGM 0
   call void @llvm.amdgcn.struct.buffer.store.format.f16(half %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
   ret void
@@ -55,7 +55,7 @@ define amdgpu_ps void @struct_buffer_store_format_v2f16__vgpr_val__sgpr_rsrc__vg
   ; UNPACKED:   [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY8]], [[COPY]], implicit $exec
   ; UNPACKED:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1
   ; UNPACKED:   [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1
-  ; UNPACKED:   BUFFER_STORE_FORMAT_D16_XY_gfx80_BOTHEN_exact [[REG_SEQUENCE1]], [[REG_SEQUENCE2]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4)
+  ; UNPACKED:   BUFFER_STORE_FORMAT_D16_XY_gfx80_BOTHEN_exact [[REG_SEQUENCE1]], [[REG_SEQUENCE2]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4)
   ; UNPACKED:   S_ENDPGM 0
   ; PACKED-LABEL: name: struct_buffer_store_format_v2f16__vgpr_val__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset
   ; PACKED: bb.1 (%ir-block.0):
@@ -70,7 +70,7 @@ define amdgpu_ps void @struct_buffer_store_format_v2f16__vgpr_val__sgpr_rsrc__vg
   ; PACKED:   [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; PACKED:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1
-  ; PACKED:   BUFFER_STORE_FORMAT_D16_XY_BOTHEN_exact [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4)
+  ; PACKED:   BUFFER_STORE_FORMAT_D16_XY_BOTHEN_exact [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4)
   ; PACKED:   S_ENDPGM 0
   call void @llvm.amdgcn.struct.buffer.store.format.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
   ret void
@@ -103,7 +103,7 @@ define amdgpu_ps void @struct_buffer_store_format_v4f16__vgpr_val__sgpr_rsrc__vg
   ; UNPACKED:   [[V_LSHRREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY10]], [[COPY1]], implicit $exec
   ; UNPACKED:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1, [[COPY1]], %subreg.sub2, [[V_LSHRREV_B32_e64_1]], %subreg.sub3
   ; UNPACKED:   [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
-  ; UNPACKED:   BUFFER_STORE_FORMAT_D16_XYZW_gfx80_BOTHEN_exact [[REG_SEQUENCE1]], [[REG_SEQUENCE2]], [[REG_SEQUENCE]], [[COPY8]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "BufferResource", align 1, addrspace 4)
+  ; UNPACKED:   BUFFER_STORE_FORMAT_D16_XYZW_gfx80_BOTHEN_exact [[REG_SEQUENCE1]], [[REG_SEQUENCE2]], [[REG_SEQUENCE]], [[COPY8]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "BufferResource", align 1, addrspace 4)
   ; UNPACKED:   S_ENDPGM 0
   ; PACKED-LABEL: name: struct_buffer_store_format_v4f16__vgpr_val__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset
   ; PACKED: bb.1 (%ir-block.0):
@@ -120,7 +120,7 @@ define amdgpu_ps void @struct_buffer_store_format_v4f16__vgpr_val__sgpr_rsrc__vg
   ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
   ; PACKED:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
   ; PACKED:   [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
-  ; PACKED:   BUFFER_STORE_FORMAT_D16_XYZW_BOTHEN_exact [[REG_SEQUENCE]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[COPY8]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "BufferResource", align 1, addrspace 4)
+  ; PACKED:   BUFFER_STORE_FORMAT_D16_XYZW_BOTHEN_exact [[REG_SEQUENCE]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[COPY8]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "BufferResource", align 1, addrspace 4)
   ; PACKED:   S_ENDPGM 0
   call void @llvm.amdgcn.struct.buffer.store.format.v4f16(<4 x half> %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
   ret void
@@ -162,7 +162,7 @@ define amdgpu_ps void @struct_buffer_store_format_f16__sgpr_val__vgpr_rsrc__sgpr
   ; UNPACKED:   [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY7]], implicit $exec
   ; UNPACKED:   [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc
   ; UNPACKED:   [[REG_SEQUENCE4:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1
-  ; UNPACKED:   BUFFER_STORE_FORMAT_D16_X_gfx80_BOTHEN_exact [[COPY8]], [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "BufferResource", align 1, addrspace 4)
+  ; UNPACKED:   BUFFER_STORE_FORMAT_D16_X_gfx80_BOTHEN_exact [[COPY8]], [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "BufferResource", align 1, addrspace 4)
   ; UNPACKED:   [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec
   ; UNPACKED:   $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
   ; UNPACKED:   S_CBRANCH_EXECNZ %bb.2, implicit $exec
@@ -206,7 +206,7 @@ define amdgpu_ps void @struct_buffer_store_format_f16__sgpr_val__vgpr_rsrc__sgpr
   ; PACKED:   [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY7]], implicit $exec
   ; PACKED:   [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc
   ; PACKED:   [[REG_SEQUENCE4:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1
-  ; PACKED:   BUFFER_STORE_FORMAT_D16_X_BOTHEN_exact [[COPY8]], [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "BufferResource", align 1, addrspace 4)
+  ; PACKED:   BUFFER_STORE_FORMAT_D16_X_BOTHEN_exact [[COPY8]], [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "BufferResource", align 1, addrspace 4)
   ; PACKED:   [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec
   ; PACKED:   $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
   ; PACKED:   S_CBRANCH_EXECNZ %bb.2, implicit $exec
@@ -233,7 +233,7 @@ define amdgpu_ps void @struct_buffer_store_format_i16__vgpr_val__sgpr_rsrc__vgpr
   ; UNPACKED:   [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; UNPACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; UNPACKED:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1
-  ; UNPACKED:   BUFFER_STORE_FORMAT_D16_X_gfx80_BOTHEN_exact [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "BufferResource", align 1, addrspace 4)
+  ; UNPACKED:   BUFFER_STORE_FORMAT_D16_X_gfx80_BOTHEN_exact [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "BufferResource", align 1, addrspace 4)
   ; UNPACKED:   S_ENDPGM 0
   ; PACKED-LABEL: name: struct_buffer_store_format_i16__vgpr_val__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset
   ; PACKED: bb.1 (%ir-block.0):
@@ -248,7 +248,7 @@ define amdgpu_ps void @struct_buffer_store_format_i16__vgpr_val__sgpr_rsrc__vgpr
   ; PACKED:   [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; PACKED:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1
-  ; PACKED:   BUFFER_STORE_FORMAT_D16_X_BOTHEN_exact [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "BufferResource", align 1, addrspace 4)
+  ; PACKED:   BUFFER_STORE_FORMAT_D16_X_BOTHEN_exact [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "BufferResource", align 1, addrspace 4)
   ; PACKED:   S_ENDPGM 0
   call void @llvm.amdgcn.struct.buffer.store.format.i16(i16 %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
   ret void

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.format.f32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.format.f32.ll
index 128b87306651..03a37e4ed88e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.format.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.format.f32.ll
@@ -15,7 +15,7 @@ define amdgpu_ps void @struct_buffer_store_format_f32__vgpr_val__sgpr_rsrc__vgpr
   ; CHECK:   [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1
-  ; CHECK:   BUFFER_STORE_FORMAT_X_BOTHEN_exact [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:   BUFFER_STORE_FORMAT_X_BOTHEN_exact [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   S_ENDPGM 0
   call void @llvm.amdgcn.struct.buffer.store.format.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
   ret void
@@ -37,7 +37,7 @@ define amdgpu_ps void @struct_buffer_store_format_v2f32__vgpr_val__sgpr_rsrc__vg
   ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
   ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
   ; CHECK:   [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
-  ; CHECK:   BUFFER_STORE_FORMAT_XY_BOTHEN_exact [[REG_SEQUENCE]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[COPY8]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:   BUFFER_STORE_FORMAT_XY_BOTHEN_exact [[REG_SEQUENCE]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[COPY8]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   S_ENDPGM 0
   call void @llvm.amdgcn.struct.buffer.store.format.v2f32(<2 x float> %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
   ret void
@@ -60,7 +60,7 @@ define amdgpu_ps void @struct_buffer_store_format_v3f32__vgpr_val__sgpr_rsrc__vg
   ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2
   ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY6]], %subreg.sub3
   ; CHECK:   [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY8]], %subreg.sub1
-  ; CHECK:   BUFFER_STORE_FORMAT_XYZ_BOTHEN_exact [[REG_SEQUENCE]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[COPY9]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12 into custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:   BUFFER_STORE_FORMAT_XYZ_BOTHEN_exact [[REG_SEQUENCE]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[COPY9]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12 into custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   S_ENDPGM 0
   call void @llvm.amdgcn.struct.buffer.store.format.v3f32(<3 x float> %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
   ret void
@@ -84,7 +84,7 @@ define amdgpu_ps void @struct_buffer_store_format_v4f32__vgpr_val__sgpr_rsrc__vg
   ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3
   ; CHECK:   [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY9]], %subreg.sub1
-  ; CHECK:   BUFFER_STORE_FORMAT_XYZW_BOTHEN_exact [[REG_SEQUENCE]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[COPY10]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:   BUFFER_STORE_FORMAT_XYZW_BOTHEN_exact [[REG_SEQUENCE]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[COPY10]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   S_ENDPGM 0
   call void @llvm.amdgcn.struct.buffer.store.format.v4f32(<4 x float> %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
   ret void
@@ -126,7 +126,7 @@ define amdgpu_ps void @struct_buffer_store_format_f32__sgpr_val__vgpr_rsrc__sgpr
   ; CHECK:   [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY7]], implicit $exec
   ; CHECK:   [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc
   ; CHECK:   [[REG_SEQUENCE4:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1
-  ; CHECK:   BUFFER_STORE_FORMAT_X_BOTHEN_exact [[COPY8]], [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:   BUFFER_STORE_FORMAT_X_BOTHEN_exact [[COPY8]], [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec
   ; CHECK:   $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
   ; CHECK:   S_CBRANCH_EXECNZ %bb.2, implicit $exec
@@ -153,7 +153,7 @@ define amdgpu_ps void @struct_buffer_store_format_i32__vgpr_val__sgpr_rsrc__vgpr
   ; CHECK:   [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1
-  ; CHECK:   BUFFER_STORE_FORMAT_X_BOTHEN_exact [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:   BUFFER_STORE_FORMAT_X_BOTHEN_exact [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   S_ENDPGM 0
   call void @llvm.amdgcn.struct.buffer.store.format.i32(i32 %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
   ret void

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.ll
index ae7dabf23de8..de0de1a3aa93 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.ll
@@ -17,7 +17,7 @@ define amdgpu_ps void @struct_buffer_store_f32_sgpr_rsrc__vgpr_val__vgpr_vindex_
   ; CHECK:   [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1
-  ; CHECK:   BUFFER_STORE_DWORD_BOTHEN_exact [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:   BUFFER_STORE_DWORD_BOTHEN_exact [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   S_ENDPGM 0
   call void @llvm.amdgcn.struct.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
   ret void
@@ -39,7 +39,7 @@ define amdgpu_ps void @struct_buffer_store_v2f32_sgpr_rsrc__vgpr_val__vgpr_vinde
   ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
   ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
   ; CHECK:   [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
-  ; CHECK:   BUFFER_STORE_DWORDX2_BOTHEN_exact [[REG_SEQUENCE]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[COPY8]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:   BUFFER_STORE_DWORDX2_BOTHEN_exact [[REG_SEQUENCE]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[COPY8]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   S_ENDPGM 0
   call void @llvm.amdgcn.struct.buffer.store.v2f32(<2 x float> %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
   ret void
@@ -62,7 +62,7 @@ define amdgpu_ps void @struct_buffer_store_v3f32_sgpr_rsrc__vgpr_val__vgpr_vinde
   ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2
   ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY6]], %subreg.sub3
   ; CHECK:   [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY8]], %subreg.sub1
-  ; CHECK:   BUFFER_STORE_DWORDX3_BOTHEN_exact [[REG_SEQUENCE]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[COPY9]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12 into custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:   BUFFER_STORE_DWORDX3_BOTHEN_exact [[REG_SEQUENCE]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[COPY9]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12 into custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   S_ENDPGM 0
   call void @llvm.amdgcn.struct.buffer.store.v3f32(<3 x float> %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
   ret void
@@ -86,7 +86,7 @@ define amdgpu_ps void @struct_buffer_store_v4f32_sgpr_rsrc__vgpr_val__vgpr_vinde
   ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3
   ; CHECK:   [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY9]], %subreg.sub1
-  ; CHECK:   BUFFER_STORE_DWORDX4_BOTHEN_exact [[REG_SEQUENCE]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[COPY10]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:   BUFFER_STORE_DWORDX4_BOTHEN_exact [[REG_SEQUENCE]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[COPY10]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   S_ENDPGM 0
   call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
   ret void
@@ -132,7 +132,7 @@ define amdgpu_ps void @struct_buffer_store_v4f32_vgpr_rsrc__sgpr_val__sgpr_vinde
   ; CHECK:   [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY10]], implicit $exec
   ; CHECK:   [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc
   ; CHECK:   [[REG_SEQUENCE5:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY12]], %subreg.sub0, [[COPY13]], %subreg.sub1
-  ; CHECK:   BUFFER_STORE_DWORDX4_BOTHEN_exact [[COPY11]], [[REG_SEQUENCE5]], [[REG_SEQUENCE4]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:   BUFFER_STORE_DWORDX4_BOTHEN_exact [[COPY11]], [[REG_SEQUENCE5]], [[REG_SEQUENCE4]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec
   ; CHECK:   $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
   ; CHECK:   S_CBRANCH_EXECNZ %bb.2, implicit $exec
@@ -159,7 +159,7 @@ define amdgpu_ps void @struct_buffer_store_i8_sgpr_rsrc__vgpr_val__vgpr_vindex__
   ; CHECK:   [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1
-  ; CHECK:   BUFFER_STORE_BYTE_BOTHEN_exact [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 1 into custom "BufferResource", addrspace 4)
+  ; CHECK:   BUFFER_STORE_BYTE_BOTHEN_exact [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 1 into custom "BufferResource", addrspace 4)
   ; CHECK:   S_ENDPGM 0
   %val.trunc = trunc i32 %val to i8
   call void @llvm.amdgcn.struct.buffer.store.i8(i8 %val.trunc, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
@@ -180,7 +180,7 @@ define amdgpu_ps void @struct_buffer_store_i16_sgpr_rsrc__vgpr_val__vgpr_vindex_
   ; CHECK:   [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1
-  ; CHECK:   BUFFER_STORE_SHORT_BOTHEN_exact [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:   BUFFER_STORE_SHORT_BOTHEN_exact [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   S_ENDPGM 0
   %val.trunc = trunc i32 %val to i16
   call void @llvm.amdgcn.struct.buffer.store.i16(i16 %val.trunc, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
@@ -201,7 +201,7 @@ define amdgpu_ps void @struct_buffer_store_f32_sgpr_rsrc__vgpr_val__vgpr_vindex_
   ; CHECK:   [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1
-  ; CHECK:   BUFFER_STORE_DWORD_BOTHEN_exact [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 1, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:   BUFFER_STORE_DWORD_BOTHEN_exact [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 1, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   S_ENDPGM 0
   call void @llvm.amdgcn.struct.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 1)
   ret void
@@ -221,7 +221,7 @@ define amdgpu_ps void @struct_buffer_store_v2f16_sgpr_rsrc__vgpr_val__vgpr_vinde
   ; CHECK:   [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1
-  ; CHECK:   BUFFER_STORE_DWORD_BOTHEN_exact [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:   BUFFER_STORE_DWORD_BOTHEN_exact [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   S_ENDPGM 0
   call void @llvm.amdgcn.struct.buffer.store.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
   ret void
@@ -249,7 +249,7 @@ define amdgpu_ps void @struct_buffer_store_v4f16_sgpr_rsrc__vgpr_val__vgpr_vinde
   ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
   ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
   ; CHECK:   [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
-  ; CHECK:   BUFFER_STORE_DWORDX2_BOTHEN_exact [[REG_SEQUENCE]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[COPY8]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:   BUFFER_STORE_DWORDX2_BOTHEN_exact [[REG_SEQUENCE]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[COPY8]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   S_ENDPGM 0
   call void @llvm.amdgcn.struct.buffer.store.v4f16(<4 x half> %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
   ret void

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.tbuffer.load.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.tbuffer.load.f16.ll
index 8fef0ad03d57..316f27dd78c9 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.tbuffer.load.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.tbuffer.load.f16.ll
@@ -16,7 +16,7 @@ define amdgpu_ps half @struct_tbuffer_load_f16__sgpr_rsrc__vgpr_vindex__vgpr_vof
   ; PACKED:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; PACKED:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
-  ; PACKED:   [[TBUFFER_LOAD_FORMAT_D16_X_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource", align 1, addrspace 4)
+  ; PACKED:   [[TBUFFER_LOAD_FORMAT_D16_X_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource", align 1, addrspace 4)
   ; PACKED:   $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_X_BOTHEN]]
   ; PACKED:   SI_RETURN_TO_EPILOG implicit $vgpr0
   ; UNPACKED-LABEL: name: struct_tbuffer_load_f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset
@@ -31,7 +31,7 @@ define amdgpu_ps half @struct_tbuffer_load_f16__sgpr_rsrc__vgpr_vindex__vgpr_vof
   ; UNPACKED:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; UNPACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; UNPACKED:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
-  ; UNPACKED:   [[TBUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource", align 1, addrspace 4)
+  ; UNPACKED:   [[TBUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource", align 1, addrspace 4)
   ; UNPACKED:   $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN]]
   ; UNPACKED:   SI_RETURN_TO_EPILOG implicit $vgpr0
   %val = call half @llvm.amdgcn.struct.tbuffer.load.f16(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 78, i32 0)
@@ -51,8 +51,8 @@ define amdgpu_ps <2 x half> @struct_tbuffer_load_v2f16__sgpr_rsrc__vgpr_vindex__
   ; PACKED:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; PACKED:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
-  ; PACKED:   [[TBUFFER_LOAD_FORMAT_D16_XY_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_XY_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
-  ; PACKED:   $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_XY_BOTHEN]]
+  ; PACKED:   [[TBUFFER_LOAD_FORMAT_D16_X_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_XY_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
+  ; PACKED:   $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_X_BOTHEN]]
   ; PACKED:   SI_RETURN_TO_EPILOG implicit $vgpr0
   ; UNPACKED-LABEL: name: struct_tbuffer_load_v2f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset
   ; UNPACKED: bb.1 (%ir-block.0):
@@ -66,7 +66,7 @@ define amdgpu_ps <2 x half> @struct_tbuffer_load_v2f16__sgpr_rsrc__vgpr_vindex__
   ; UNPACKED:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; UNPACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; UNPACKED:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
-  ; UNPACKED:   [[TBUFFER_LOAD_FORMAT_D16_XY_gfx80_BOTHEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_D16_XY_gfx80_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
+  ; UNPACKED:   [[TBUFFER_LOAD_FORMAT_D16_XY_gfx80_BOTHEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_D16_XY_gfx80_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
   ; UNPACKED:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XY_gfx80_BOTHEN]].sub0
   ; UNPACKED:   [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XY_gfx80_BOTHEN]].sub1
   ; UNPACKED:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
@@ -103,7 +103,7 @@ define amdgpu_ps <4 x half> @struct_tbuffer_load_v4f16__sgpr_rsrc__vgpr_vindex__
   ; PACKED:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; PACKED:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
-  ; PACKED:   [[TBUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8 from custom "BufferResource", align 1, addrspace 4)
+  ; PACKED:   [[TBUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8 from custom "BufferResource", align 1, addrspace 4)
   ; PACKED:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN]].sub0
   ; PACKED:   [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN]].sub1
   ; PACKED:   $vgpr0 = COPY [[COPY7]]
@@ -121,7 +121,7 @@ define amdgpu_ps <4 x half> @struct_tbuffer_load_v4f16__sgpr_rsrc__vgpr_vindex__
   ; UNPACKED:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; UNPACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; UNPACKED:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
-  ; UNPACKED:   [[TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8 from custom "BufferResource", align 1, addrspace 4)
+  ; UNPACKED:   [[TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8 from custom "BufferResource", align 1, addrspace 4)
   ; UNPACKED:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN]].sub0
   ; UNPACKED:   [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN]].sub1
   ; UNPACKED:   [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN]].sub2
@@ -163,7 +163,7 @@ define amdgpu_ps half @struct_tbuffer_load_f16__sgpr_rsrc__vgpr_vindex__vgpr_vof
   ; PACKED:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
   ; PACKED:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
   ; PACKED:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY4]], %subreg.sub1
-  ; PACKED:   [[TBUFFER_LOAD_FORMAT_D16_X_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource", align 1, addrspace 4)
+  ; PACKED:   [[TBUFFER_LOAD_FORMAT_D16_X_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource", align 1, addrspace 4)
   ; PACKED:   $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_X_BOTHEN]]
   ; PACKED:   SI_RETURN_TO_EPILOG implicit $vgpr0
   ; UNPACKED-LABEL: name: struct_tbuffer_load_f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset_vindex0
@@ -179,7 +179,7 @@ define amdgpu_ps half @struct_tbuffer_load_f16__sgpr_rsrc__vgpr_vindex__vgpr_vof
   ; UNPACKED:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
   ; UNPACKED:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
   ; UNPACKED:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY4]], %subreg.sub1
-  ; UNPACKED:   [[TBUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource", align 1, addrspace 4)
+  ; UNPACKED:   [[TBUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource", align 1, addrspace 4)
   ; UNPACKED:   $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN]]
   ; UNPACKED:   SI_RETURN_TO_EPILOG implicit $vgpr0
   %val = call half @llvm.amdgcn.struct.tbuffer.load.f16(<4 x i32> %rsrc, i32 0, i32 %voffset, i32 %soffset, i32 78, i32 0)
@@ -220,7 +220,7 @@ define amdgpu_ps <4 x half> @struct_tbuffer_load_v4f16__vgpr_rsrc__sgpr_vindex__
   ; PACKED:   [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec
   ; PACKED:   [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc
   ; PACKED:   [[REG_SEQUENCE4:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY8]], %subreg.sub1
-  ; PACKED:   [[TBUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8 from custom "BufferResource", align 1, addrspace 4)
+  ; PACKED:   [[TBUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8 from custom "BufferResource", align 1, addrspace 4)
   ; PACKED:   [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec
   ; PACKED:   $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
   ; PACKED:   S_CBRANCH_EXECNZ %bb.2, implicit $exec
@@ -266,7 +266,7 @@ define amdgpu_ps <4 x half> @struct_tbuffer_load_v4f16__vgpr_rsrc__sgpr_vindex__
   ; UNPACKED:   [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec
   ; UNPACKED:   [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc
   ; UNPACKED:   [[REG_SEQUENCE4:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY8]], %subreg.sub1
-  ; UNPACKED:   [[TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8 from custom "BufferResource", align 1, addrspace 4)
+  ; UNPACKED:   [[TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8 from custom "BufferResource", align 1, addrspace 4)
   ; UNPACKED:   [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec
   ; UNPACKED:   $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
   ; UNPACKED:   S_CBRANCH_EXECNZ %bb.2, implicit $exec
@@ -314,7 +314,7 @@ define amdgpu_ps half @struct_tbuffer_load_f16__sgpr_rsrc__vgpr_vindex__vgpr_vof
   ; PACKED:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; PACKED:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
-  ; PACKED:   [[TBUFFER_LOAD_FORMAT_D16_X_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 4095, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource" + 4095, align 1, addrspace 4)
+  ; PACKED:   [[TBUFFER_LOAD_FORMAT_D16_X_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 4095, 78, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource" + 4095, align 1, addrspace 4)
   ; PACKED:   $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_X_BOTHEN]]
   ; PACKED:   SI_RETURN_TO_EPILOG implicit $vgpr0
   ; UNPACKED-LABEL: name: struct_tbuffer_load_f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset_voffset_add4095
@@ -329,7 +329,7 @@ define amdgpu_ps half @struct_tbuffer_load_f16__sgpr_rsrc__vgpr_vindex__vgpr_vof
   ; UNPACKED:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; UNPACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; UNPACKED:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
-  ; UNPACKED:   [[TBUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 4095, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource" + 4095, align 1, addrspace 4)
+  ; UNPACKED:   [[TBUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 4095, 78, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource" + 4095, align 1, addrspace 4)
   ; UNPACKED:   $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN]]
   ; UNPACKED:   SI_RETURN_TO_EPILOG implicit $vgpr0
   %voffset = add i32 %voffset.base, 4095

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.tbuffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.tbuffer.load.ll
index 9b142f1ec10c..9a4b1ac67474 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.tbuffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.tbuffer.load.ll
@@ -15,7 +15,7 @@ define amdgpu_ps float @struct_tbuffer_load_f32__sgpr_rsrc__vgpr_vindex__vgpr_vo
   ; CHECK:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
-  ; CHECK:   [[TBUFFER_LOAD_FORMAT_X_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:   [[TBUFFER_LOAD_FORMAT_X_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_X_BOTHEN]]
   ; CHECK:   SI_RETURN_TO_EPILOG implicit $vgpr0
   %val = call float @llvm.amdgcn.struct.tbuffer.load.f32(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 78, i32 0)
@@ -35,7 +35,7 @@ define amdgpu_ps <2 x float> @struct_tbuffer_load_v2f32__sgpr_rsrc__vgpr_vindex_
   ; CHECK:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
-  ; CHECK:   [[TBUFFER_LOAD_FORMAT_XY_BOTHEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8 from custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:   [[TBUFFER_LOAD_FORMAT_XY_BOTHEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8 from custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_BOTHEN]].sub0
   ; CHECK:   [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_BOTHEN]].sub1
   ; CHECK:   $vgpr0 = COPY [[COPY7]]
@@ -58,7 +58,7 @@ define amdgpu_ps <3 x float> @struct_tbuffer_load_v3f32__sgpr_rsrc__vgpr_vindex_
   ; CHECK:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
-  ; CHECK:   [[TBUFFER_LOAD_FORMAT_XYZ_BOTHEN:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12 from custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:   [[TBUFFER_LOAD_FORMAT_XYZ_BOTHEN:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12 from custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZ_BOTHEN]].sub0
   ; CHECK:   [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZ_BOTHEN]].sub1
   ; CHECK:   [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZ_BOTHEN]].sub2
@@ -83,7 +83,7 @@ define amdgpu_ps <4 x float> @struct_tbuffer_load_v4f32__sgpr_rsrc__vgpr_vindex_
   ; CHECK:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
-  ; CHECK:   [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:   [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN]].sub0
   ; CHECK:   [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN]].sub1
   ; CHECK:   [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN]].sub2
@@ -111,7 +111,7 @@ define amdgpu_ps float @struct_tbuffer_load_f32__sgpr_rsrc__vgpr_vindex__vgpr_vo
   ; CHECK:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
   ; CHECK:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
   ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY4]], %subreg.sub1
-  ; CHECK:   [[TBUFFER_LOAD_FORMAT_X_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:   [[TBUFFER_LOAD_FORMAT_X_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_X_BOTHEN]]
   ; CHECK:   SI_RETURN_TO_EPILOG implicit $vgpr0
   %val = call float @llvm.amdgcn.struct.tbuffer.load.f32(<4 x i32> %rsrc, i32 0, i32 %voffset, i32 %soffset, i32 78, i32 0)
@@ -152,7 +152,7 @@ define amdgpu_ps <4 x float> @struct_tbuffer_load_v4f32__vgpr_rsrc__sgpr_vindex_
   ; CHECK:   [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec
   ; CHECK:   [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc
   ; CHECK:   [[REG_SEQUENCE4:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY8]], %subreg.sub1
-  ; CHECK:   [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_BOTHEN [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:   [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_BOTHEN [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec
   ; CHECK:   $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
   ; CHECK:   S_CBRANCH_EXECNZ %bb.2, implicit $exec
@@ -186,7 +186,7 @@ define amdgpu_ps float @struct_tbuffer_load_f32__sgpr_rsrc__vgpr_vindex__vgpr_vo
   ; CHECK:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
-  ; CHECK:   [[TBUFFER_LOAD_FORMAT_X_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 4095, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource" + 4095, align 1, addrspace 4)
+  ; CHECK:   [[TBUFFER_LOAD_FORMAT_X_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 4095, 78, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource" + 4095, align 1, addrspace 4)
   ; CHECK:   $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_X_BOTHEN]]
   ; CHECK:   SI_RETURN_TO_EPILOG implicit $vgpr0
   %voffset = add i32 %voffset.base, 4095

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.mfma.gfx90a.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.mfma.gfx90a.mir
new file mode 100644
index 000000000000..8ea58fac3aba
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.mfma.gfx90a.mir
@@ -0,0 +1,206 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -march=amdgcn -mcpu=gfx90a -run-pass=regbankselect -regbankselect-fast -verify-machineinstrs %s -o - | FileCheck %s -check-prefix=FAST
+# RUN: llc -march=amdgcn -mcpu=gfx90a -run-pass=regbankselect -regbankselect-greedy -verify-machineinstrs %s -o - | FileCheck %s -check-prefix=GREEDY
+
+---
+name: mfma_f32_32x32x4bf16_1k_vva
+legalized: true
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+
+    ; FAST-LABEL: name: mfma_f32_32x32x4bf16_1k_vva
+    ; FAST: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+    ; FAST: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1
+    ; FAST: [[COPY1:%[0-9]+]]:vgpr(s64) = COPY $vgpr2_vgpr3
+    ; FAST: [[COPY2:%[0-9]+]]:agpr(<32 x s32>) = COPY $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+    ; FAST: [[INT:%[0-9]+]]:agpr(<32 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.mfma.f32.32x32x4bf16.1k), [[COPY]](s64), [[COPY1]](s64), [[COPY2]](<32 x s32>), 0, 0, 0
+    ; FAST: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = COPY [[INT]](<32 x s32>)
+    ; GREEDY-LABEL: name: mfma_f32_32x32x4bf16_1k_vva
+    ; GREEDY: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+    ; GREEDY: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1
+    ; GREEDY: [[COPY1:%[0-9]+]]:vgpr(s64) = COPY $vgpr2_vgpr3
+    ; GREEDY: [[COPY2:%[0-9]+]]:agpr(<32 x s32>) = COPY $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+    ; GREEDY: [[INT:%[0-9]+]]:agpr(<32 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.mfma.f32.32x32x4bf16.1k), [[COPY]](s64), [[COPY1]](s64), [[COPY2]](<32 x s32>), 0, 0, 0
+    ; GREEDY: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = COPY [[INT]](<32 x s32>)
+    %0:_(s64) = COPY $vgpr0_vgpr1
+    %1:_(s64) = COPY $vgpr2_vgpr3
+    %2:_(<32 x s32>) = COPY $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+    %3:_(<32 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.mfma.f32.32x32x4bf16.1k), %0, %1, %2, 0, 0, 0
+    $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = COPY %3
+...
+
+---
+name: mfma_f32_16x16x4bf16_1k_vva
+legalized: true
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+
+    ; FAST-LABEL: name: mfma_f32_16x16x4bf16_1k_vva
+    ; FAST: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; FAST: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1
+    ; FAST: [[COPY1:%[0-9]+]]:vgpr(s64) = COPY $vgpr2_vgpr3
+    ; FAST: [[COPY2:%[0-9]+]]:agpr(<16 x s32>) = COPY $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; FAST: [[INT:%[0-9]+]]:agpr(<16 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.mfma.f32.16x16x4bf16.1k), [[COPY]](s64), [[COPY1]](s64), [[COPY2]](<16 x s32>), 0, 0, 0
+    ; FAST: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY [[INT]](<16 x s32>)
+    ; GREEDY-LABEL: name: mfma_f32_16x16x4bf16_1k_vva
+    ; GREEDY: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; GREEDY: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1
+    ; GREEDY: [[COPY1:%[0-9]+]]:vgpr(s64) = COPY $vgpr2_vgpr3
+    ; GREEDY: [[COPY2:%[0-9]+]]:agpr(<16 x s32>) = COPY $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; GREEDY: [[INT:%[0-9]+]]:agpr(<16 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.mfma.f32.16x16x4bf16.1k), [[COPY]](s64), [[COPY1]](s64), [[COPY2]](<16 x s32>), 0, 0, 0
+    ; GREEDY: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY [[INT]](<16 x s32>)
+    %0:_(s64) = COPY $vgpr0_vgpr1
+    %1:_(s64) = COPY $vgpr2_vgpr3
+    %2:_(<16 x s32>) = COPY $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    %3:_(<16 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.mfma.f32.16x16x4bf16.1k), %0, %1, %2, 0, 0, 0
+    $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY %3
+...
+
+---
+name: mfma_f32_4x4x4bf16_1k_vva
+legalized: true
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $agpr0_agpr1_agpr2_agpr3
+
+    ; FAST-LABEL: name: mfma_f32_4x4x4bf16_1k_vva
+    ; FAST: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $agpr0_agpr1_agpr2_agpr3
+    ; FAST: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1
+    ; FAST: [[COPY1:%[0-9]+]]:vgpr(s64) = COPY $vgpr2_vgpr3
+    ; FAST: [[COPY2:%[0-9]+]]:agpr(<4 x s32>) = COPY $agpr0_agpr1_agpr2_agpr3
+    ; FAST: [[INT:%[0-9]+]]:agpr(<4 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.mfma.f32.4x4x4bf16.1k), [[COPY]](s64), [[COPY1]](s64), [[COPY2]](<4 x s32>), 0, 0, 0
+    ; FAST: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[INT]](<4 x s32>)
+    ; GREEDY-LABEL: name: mfma_f32_4x4x4bf16_1k_vva
+    ; GREEDY: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $agpr0_agpr1_agpr2_agpr3
+    ; GREEDY: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1
+    ; GREEDY: [[COPY1:%[0-9]+]]:vgpr(s64) = COPY $vgpr2_vgpr3
+    ; GREEDY: [[COPY2:%[0-9]+]]:agpr(<4 x s32>) = COPY $agpr0_agpr1_agpr2_agpr3
+    ; GREEDY: [[INT:%[0-9]+]]:agpr(<4 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.mfma.f32.4x4x4bf16.1k), [[COPY]](s64), [[COPY1]](s64), [[COPY2]](<4 x s32>), 0, 0, 0
+    ; GREEDY: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[INT]](<4 x s32>)
+    %0:_(s64) = COPY $vgpr0_vgpr1
+    %1:_(s64) = COPY $vgpr2_vgpr3
+    %2:_(<4 x s32>) = COPY $agpr0_agpr1_agpr2_agpr3
+    %3:_(<4 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.mfma.f32.4x4x4bf16.1k), %0, %1, %2, 0, 0, 0
+    $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %3
+...
+
+---
+name: mfma_f32_32x32x8bf16_1k_vva
+legalized: true
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+
+    ; FAST-LABEL: name: mfma_f32_32x32x8bf16_1k_vva
+    ; FAST: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+    ; FAST: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1
+    ; FAST: [[COPY1:%[0-9]+]]:vgpr(s64) = COPY $vgpr2_vgpr3
+    ; FAST: [[COPY2:%[0-9]+]]:agpr(<32 x s32>) = COPY $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+    ; FAST: [[INT:%[0-9]+]]:agpr(<32 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.mfma.f32.32x32x8bf16.1k), [[COPY]](s64), [[COPY1]](s64), [[COPY2]](<32 x s32>), 0, 0, 0
+    ; FAST: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = COPY [[INT]](<32 x s32>)
+    ; GREEDY-LABEL: name: mfma_f32_32x32x8bf16_1k_vva
+    ; GREEDY: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+    ; GREEDY: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1
+    ; GREEDY: [[COPY1:%[0-9]+]]:vgpr(s64) = COPY $vgpr2_vgpr3
+    ; GREEDY: [[COPY2:%[0-9]+]]:agpr(<32 x s32>) = COPY $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+    ; GREEDY: [[INT:%[0-9]+]]:agpr(<32 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.mfma.f32.32x32x8bf16.1k), [[COPY]](s64), [[COPY1]](s64), [[COPY2]](<32 x s32>), 0, 0, 0
+    ; GREEDY: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = COPY [[INT]](<32 x s32>)
+    %0:_(s64) = COPY $vgpr0_vgpr1
+    %1:_(s64) = COPY $vgpr2_vgpr3
+    %2:_(<32 x s32>) = COPY $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+    %3:_(<32 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.mfma.f32.32x32x8bf16.1k), %0, %1, %2, 0, 0, 0
+    $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = COPY %3
+...
+
+---
+name: mfma_f32_16x16x16bf16_1k_vva
+legalized: true
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $agpr0_agpr1_agpr2_agpr3
+
+    ; FAST-LABEL: name: mfma_f32_16x16x16bf16_1k_vva
+    ; FAST: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $agpr0_agpr1_agpr2_agpr3
+    ; FAST: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1
+    ; FAST: [[COPY1:%[0-9]+]]:vgpr(s64) = COPY $vgpr2_vgpr3
+    ; FAST: [[COPY2:%[0-9]+]]:agpr(<4 x s32>) = COPY $agpr0_agpr1_agpr2_agpr3
+    ; FAST: [[INT:%[0-9]+]]:agpr(<4 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.mfma.f32.16x16x16bf16.1k), [[COPY]](s64), [[COPY1]](s64), [[COPY2]](<4 x s32>), 0, 0, 0
+    ; FAST: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[INT]](<4 x s32>)
+    ; GREEDY-LABEL: name: mfma_f32_16x16x16bf16_1k_vva
+    ; GREEDY: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $agpr0_agpr1_agpr2_agpr3
+    ; GREEDY: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1
+    ; GREEDY: [[COPY1:%[0-9]+]]:vgpr(s64) = COPY $vgpr2_vgpr3
+    ; GREEDY: [[COPY2:%[0-9]+]]:agpr(<4 x s32>) = COPY $agpr0_agpr1_agpr2_agpr3
+    ; GREEDY: [[INT:%[0-9]+]]:agpr(<4 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.mfma.f32.16x16x16bf16.1k), [[COPY]](s64), [[COPY1]](s64), [[COPY2]](<4 x s32>), 0, 0, 0
+    ; GREEDY: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[INT]](<4 x s32>)
+    %0:_(s64) = COPY $vgpr0_vgpr1
+    %1:_(s64) = COPY $vgpr2_vgpr3
+    %2:_(<4 x s32>) = COPY $agpr0_agpr1_agpr2_agpr3
+    %3:_(<4 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.mfma.f32.16x16x16bf16.1k), %0, %1, %2, 0, 0, 0
+    $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %3
+...
+
+---
+name: mfma_f64_16x16x4f64_vva
+legalized: true
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
+
+    ; FAST-LABEL: name: mfma_f64_16x16x4f64_vva
+    ; FAST: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
+    ; FAST: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1
+    ; FAST: [[COPY1:%[0-9]+]]:vgpr(s64) = COPY $vgpr2_vgpr3
+    ; FAST: [[COPY2:%[0-9]+]]:agpr(<8 x s32>) = COPY $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
+    ; FAST: [[INT:%[0-9]+]]:agpr(<8 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.mfma.f64.16x16x4f64), [[COPY]](s64), [[COPY1]](s64), [[COPY2]](<8 x s32>), 0, 0, 0
+    ; FAST: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[INT]](<8 x s32>)
+    ; GREEDY-LABEL: name: mfma_f64_16x16x4f64_vva
+    ; GREEDY: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
+    ; GREEDY: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1
+    ; GREEDY: [[COPY1:%[0-9]+]]:vgpr(s64) = COPY $vgpr2_vgpr3
+    ; GREEDY: [[COPY2:%[0-9]+]]:agpr(<8 x s32>) = COPY $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
+    ; GREEDY: [[INT:%[0-9]+]]:agpr(<8 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.mfma.f64.16x16x4f64), [[COPY]](s64), [[COPY1]](s64), [[COPY2]](<8 x s32>), 0, 0, 0
+    ; GREEDY: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[INT]](<8 x s32>)
+    %0:_(s64) = COPY $vgpr0_vgpr1
+    %1:_(s64) = COPY $vgpr2_vgpr3
+    %2:_(<8 x s32>) = COPY $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
+    %3:_(<8 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.mfma.f64.16x16x4f64), %0, %1, %2, 0, 0, 0
+    $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY %3
+...
+
+---
+name: mfma_f64_4x4x4f64_vva
+legalized: true
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $agpr0_agpr1
+
+    ; FAST-LABEL: name: mfma_f64_4x4x4f64_vva
+    ; FAST: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $agpr0_agpr1
+    ; FAST: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1
+    ; FAST: [[COPY1:%[0-9]+]]:vgpr(s64) = COPY $vgpr2_vgpr3
+    ; FAST: [[COPY2:%[0-9]+]]:agpr(<2 x s32>) = COPY $agpr0_agpr1
+    ; FAST: [[INT:%[0-9]+]]:agpr(<2 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.mfma.f64.4x4x4f64), [[COPY]](s64), [[COPY1]](s64), [[COPY2]](<2 x s32>), 0, 0, 0
+    ; FAST: $vgpr0_vgpr1 = COPY [[INT]](<2 x s32>)
+    ; GREEDY-LABEL: name: mfma_f64_4x4x4f64_vva
+    ; GREEDY: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $agpr0_agpr1
+    ; GREEDY: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1
+    ; GREEDY: [[COPY1:%[0-9]+]]:vgpr(s64) = COPY $vgpr2_vgpr3
+    ; GREEDY: [[COPY2:%[0-9]+]]:agpr(<2 x s32>) = COPY $agpr0_agpr1
+    ; GREEDY: [[INT:%[0-9]+]]:agpr(<2 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.mfma.f64.4x4x4f64), [[COPY]](s64), [[COPY1]](s64), [[COPY2]](<2 x s32>), 0, 0, 0
+    ; GREEDY: $vgpr0_vgpr1 = COPY [[INT]](<2 x s32>)
+    %0:_(s64) = COPY $vgpr0_vgpr1
+    %1:_(s64) = COPY $vgpr2_vgpr3
+    %2:_(<2 x s32>) = COPY $agpr0_agpr1
+    %3:_(<2 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.mfma.f64.4x4x4f64), %0, %1, %2, 0, 0, 0
+    $vgpr0_vgpr1 = COPY %3
+...

diff  --git a/llvm/test/CodeGen/AMDGPU/SRSRC-GIT-clobber-check.mir b/llvm/test/CodeGen/AMDGPU/SRSRC-GIT-clobber-check.mir
index f634b8aab303..a897f991a005 100644
--- a/llvm/test/CodeGen/AMDGPU/SRSRC-GIT-clobber-check.mir
+++ b/llvm/test/CodeGen/AMDGPU/SRSRC-GIT-clobber-check.mir
@@ -39,7 +39,7 @@ body:             |
 
   bb.1:
     renamable $vgpr0 = V_MOV_B32_e32 1065353216, implicit $exec
-    BUFFER_STORE_DWORD_OFFEN killed renamable $vgpr0, undef renamable $vgpr0, $sgpr100_sgpr101_sgpr102_sgpr103, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5)
+    BUFFER_STORE_DWORD_OFFEN killed renamable $vgpr0, undef renamable $vgpr0, $sgpr100_sgpr101_sgpr102_sgpr103, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5)
 
   bb.2:
     S_ENDPGM 0

diff  --git a/llvm/test/CodeGen/AMDGPU/acc-ldst.ll b/llvm/test/CodeGen/AMDGPU/acc-ldst.ll
new file mode 100644
index 000000000000..c4715ba5b71c
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/acc-ldst.ll
@@ -0,0 +1,316 @@
+; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GCN %s
+
+declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32, i32, i32)
+declare <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32, i32, <4 x i32>, i32, i32, i32)
+declare i32 @llvm.amdgcn.workitem.id.x()
+
+; GCN-LABEL:  {{^}}test_load_mfma_store16:
+; GCN-COUNT-8: global_load_dwordx4 a[{{[0-9:]+}}], v{{[0-9:]+}}, s[{{[0-9:]+}}]
+; GCN-NOT:     v_accvgpr_write
+; GCN:         v_mfma_f32_32x32x1f32
+; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    s_nop 2
+; GCN-NOT:     v_accvgpr_read
+; GCN-COUNT-8: global_store_dwordx4 v{{[0-9:]+}}, a[{{[0-9:]+}}], s[{{[0-9:]+}}]
+define amdgpu_kernel void @test_load_mfma_store16(<32 x float> addrspace(1)* %arg) {
+bb:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %arg, i32 %tid
+  %in.1 = load <32 x float>, <32 x float> addrspace(1)* %gep
+  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 1, i32 2, i32 3)
+  store <32 x float> %mai.1, <32 x float> addrspace(1)* %gep
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_load1_mfma_store1:
+; GCN:      global_load_dword a{{[0-9]+}}, v{{[0-9:]+}}, s[{{[0-9:]+}}]
+; GCN-NOT:  v_accvgpr_read
+; GCN:      v_mfma_f32_32x32x1f32 a{{\[}}[[N:[0-9]+]]:
+; GCN-NEXT: s_nop 7
+; GCN-NEXT: s_nop 7
+; GCN-NEXT: s_nop 2
+; GCN-NOT:  v_accvgpr_read
+; GCN-NEXT: global_store_dword v{{[0-9:]+}}, a[[N]], s[{{[0-9:]+}}]
+define amdgpu_kernel void @test_load1_mfma_store1(float addrspace(1)* %arg) {
+bb:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %tid
+  %in.1 = load float, float addrspace(1)* %gep
+  %init = insertelement <32 x float> zeroinitializer, float %in.1, i32 0
+  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %init, i32 1, i32 2, i32 3)
+  %elt = extractelement <32 x float> %mai.1, i32 0
+  store float %elt, float addrspace(1)* %gep
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_load4_mfma_store4:
+; GCN:      global_load_dwordx4 a[{{[0-9:]+}}], v{{[0-9:]+}}, s[{{[0-9:]+}}]
+; GCN-NOT:  v_accvgpr_write
+; GCN:      v_mfma_i32_4x4x4i8 [[A:a\[[0-9:]+\]]]
+; GCN-NEXT: s_nop 4
+; GCN-NOT:  v_accvgpr_read
+; GCN-NEXT: global_store_dwordx4 v{{[0-9:]+}}, [[A]], s[{{[0-9:]+}}]
+define amdgpu_kernel void @test_load4_mfma_store4(<4 x i32> addrspace(1)* %arg) {
+bb:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg, i32 %tid
+  %in.1 = load <4 x i32>, <4 x i32> addrspace(1)* %gep
+  %mai.1 = tail call <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32 1, i32 2, <4 x i32> %in.1, i32 0, i32 0, i32 0)
+  store <4 x i32> %mai.1, <4 x i32> addrspace(1)* %gep
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_load_store:
+; GCN-COUNT-8: global_load_dwordx4 v[{{[0-9:]+}}], v{{[0-9:]+}}, s[{{[0-9:]+}}]
+; GCN-NOT:     v_accvgpr
+; GCN-COUNT-8: global_store_dwordx4 v[{{[0-9:]+}}], v[{{[0-9:]+}}]
+define amdgpu_kernel void @test_load_store(<32 x float> addrspace(1)* %arg) {
+bb:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep.1 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %arg, i32 %tid
+  %gep.2 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %gep.1, i32 32
+  %in.1 = load <32 x float>, <32 x float> addrspace(1)* %gep.1
+  store <32 x float> %in.1, <32 x float> addrspace(1)* %gep.2
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_load_add_mfma_store:
+; GCN-COUNT-8:  global_load_dwordx4 v[{{[0-9:]+}}], v{{[0-9:]+}}, s[{{[0-9:]+}}]
+; GCN-COUNT-32: v_accvgpr_write
+; GCN:          v_mfma_f32_32x32x1f32
+; GCN-NEXT:     s_nop 7
+; GCN-NEXT:     s_nop 7
+; GCN-NEXT:     s_nop 2
+; GCN-NOT:      v_accvgpr_read
+; GCN-COUNT-8:  global_store_dwordx4 v{{[0-9:]+}}, a[{{[0-9:]+}}]
+define amdgpu_kernel void @test_load_add_mfma_store(<32 x float> addrspace(1)* %arg) {
+bb:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %arg, i32 %tid
+  %in.1 = load <32 x float>, <32 x float> addrspace(1)* %gep
+  %add.1 = fadd <32 x float> %in.1, %in.1
+  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %add.1, i32 1, i32 2, i32 3)
+  store <32 x float> %mai.1, <32 x float> addrspace(1)* %gep
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_load_add_store:
+; GCN-COUNT-8:  global_load_dwordx4 v[{{[0-9:]+}}], v{{[0-9:]+}}, s[{{[0-9:]+}}]
+; GCN-NOT:      v_accvgpr
+; GCN-COUNT-16: v_pk_add_f32
+; GCN-NOT:      v_accvgpr
+; GCN-COUNT-8:  global_store_dwordx4 v{{[0-9:]+}}, v[{{[0-9:]+}}]
+define amdgpu_kernel void @test_load_add_store(<32 x float> addrspace(1)* %arg) {
+bb:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %arg, i32 %tid
+  %in.1 = load <32 x float>, <32 x float> addrspace(1)* %gep
+  %add.1 = fadd <32 x float> %in.1, %in.1
+  store <32 x float> %add.1, <32 x float> addrspace(1)* %gep
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_load_mfma_add_store:
+; GCN-COUNT-8:  global_load_dwordx4 v[{{[0-9:]+}}], v{{[0-9:]+}}, s[{{[0-9:]+}}]
+; GCN-COUNT-32: v_accvgpr_write
+; GCN:          v_mfma_f32_32x32x1f32
+; GCN-COUNT-32: v_accvgpr_read
+; GCN:          v_pk_add_f32
+; GCN-COUNT-8:  global_store_dwordx4 v{{[0-9:]+}}, v[{{[0-9:]+}}]
+define amdgpu_kernel void @test_load_mfma_add_store(<32 x float> addrspace(1)* %arg) {
+bb:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %arg, i32 %tid
+  %in.1 = load <32 x float>, <32 x float> addrspace(1)* %gep
+  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 1, i32 2, i32 3)
+  %add.1 = fadd <32 x float> %mai.1, %in.1
+  store <32 x float> %add.1, <32 x float> addrspace(1)* %gep
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_load_add_mfma_mul_store:
+; GCN-COUNT-8:  global_load_dwordx4 v[{{[0-9:]+}}], v{{[0-9:]+}}, s[{{[0-9:]+}}]
+; GCN:          v_pk_add_f32
+; GCN-COUNT-32: v_accvgpr_write
+; GCN:          v_mfma_f32_32x32x1f32
+; GCN-COUNT-32: v_accvgpr_read
+; GCN:          v_pk_mul_f32
+; GCN-COUNT-8:  global_store_dwordx4 v{{[0-9:]+}}, v[{{[0-9:]+}}]
+define amdgpu_kernel void @test_load_add_mfma_mul_store(<32 x float> addrspace(1)* %arg) {
+bb:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %arg, i32 %tid
+  %in.1 = load <32 x float>, <32 x float> addrspace(1)* %gep
+  %add.1 = fadd <32 x float> %in.1, %in.1
+  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %add.1, i32 1, i32 2, i32 3)
+  %mul.1 = fmul <32 x float> %mai.1, %mai.1
+  store <32 x float> %mul.1, <32 x float> addrspace(1)* %gep
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_mixeduse_load_add_mfma_mul_store:
+; GCN-COUNT-8:  global_load_dwordx4 v[{{[0-9:]+}}], v{{[0-9:]+}}, s[{{[0-9:]+}}]
+; GCN-COUNT-32: v_accvgpr_write
+; GCN:          v_mfma_f32_32x32x1f32
+; GCN-COUNT-32: v_accvgpr_read
+; GCN:          v_pk_mul_f32
+; GCN-COUNT-8:  global_store_dwordx4 v{{[0-9:]+}}, v[{{[0-9:]+}}]
+define amdgpu_kernel void @test_mixeduse_load_add_mfma_mul_store(<32 x float> addrspace(1)* %arg) {
+bb:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %arg, i32 %tid
+  %in.1 = load <32 x float>, <32 x float> addrspace(1)* %gep
+  %add.1 = fadd <32 x float> %in.1, %in.1
+  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %add.1, i32 1, i32 2, i32 3)
+  %mul.1 = fmul <32 x float> %mai.1, %in.1
+  store <32 x float> %mul.1, <32 x float> addrspace(1)* %gep
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_multiuse_load_mfma_mfma_store:
+; GCN-COUNT-8: global_load_dwordx4 a[{{[0-9:]+}}], v{{[0-9:]+}}, s[{{[0-9:]+}}]
+; GCN-NOT:     v_accvgpr_write
+; GCN:         v_mfma_f32_32x32x1f32
+; GCN-NOT:     v_accvgpr_read
+; GCN-COUNT-8: global_store_dwordx4 v[{{[0-9:]+}}], a[{{[0-9:]+}}]
+define amdgpu_kernel void @test_multiuse_load_mfma_mfma_store(<32 x float> addrspace(1)* %arg) {
+bb:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep.1 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %arg, i32 %tid
+  %gep.2 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %gep.1, i32 32
+  %in.1 = load <32 x float>, <32 x float> addrspace(1)* %gep.1
+  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 1, i32 2, i32 3)
+  %mai.2 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 0, i32 0, i32 0)
+  store <32 x float> %mai.1, <32 x float> addrspace(1)* %gep.1
+  store <32 x float> %mai.2, <32 x float> addrspace(1)* %gep.2
+  ret void
+}
+
+; NB: for atomics both vdata and vdst shall be either VGPR or AGPR
+; GCN-LABEL: {{^}}test_atomic_mfma_4xi32_atomic_store:
+; GCN:     global_atomic_sub [[IN:v[0-9]+]], v{{[0-9:]+}}, v{{[0-9]+}}, s[{{[0-9:]+}}] glc
+; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, [[IN]]
+; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GCN:     v_mfma_i32_4x4x4i8 a{{\[}}[[N:[0-9]+]]:
+; GCN:     v_accvgpr_read_b32 [[V:v[0-9]+]], a[[N]]{{$}}
+; GCN:     global_atomic_add v{{[0-9]+}}, v{{[0-9:]+}}, [[V]], s[{{[0-9:]+}}] glc
+; GCN:     global_store_dword v{{[0-9]+}}, v{{[0-9]+}},
+define amdgpu_kernel void @test_atomic_mfma_4xi32_atomic_store(i32 addrspace(1)* %arg) {
+bb:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr inbounds i32, i32 addrspace(1)* %arg, i32 %tid
+  %in.1 = atomicrmw volatile sub i32 addrspace(1)* %gep, i32 1 seq_cst
+  %tmp0 = insertelement <4 x i32> undef, i32 %in.1, i32 0
+  %tmp1 = insertelement <4 x i32> %tmp0, i32 0, i32 1
+  %tmp2 = insertelement <4 x i32> %tmp1, i32 0, i32 2
+  %tmp3 = insertelement <4 x i32> %tmp2, i32 0, i32 3
+  %mai.1 = tail call <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32 1, i32 2, <4 x i32> %tmp3, i32 0, i32 0, i32 0)
+  %elt = extractelement <4 x i32> %mai.1, i32 0
+  %val = atomicrmw volatile add i32 addrspace(1)* %gep, i32 %elt seq_cst
+  store i32 %val, i32 addrspace(1)* %arg
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_atomic_mfma_4xi32_atomic64_store:
+; GCN:         global_atomic_sub_x2 v[{{[0-9:]+}}], v{{[0-9:]+}}, v[{{[0-9:]+}}], s[{{[0-9:]+}}] glc
+; GCN-COUNT-4: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GCN:         v_mfma_i32_4x4x4i8 a{{\[}}[[N:[0-9]+]]:
+; GCN:         v_accvgpr_read_b32 v{{[0-9]+}}, a{{[0-9]+}}
+; GCN:         v_accvgpr_read_b32 v{{[0-9]+}}, a{{[0-9]+}}
+; GCN:         global_atomic_add_x2 v[{{[0-9:]+}}], v{{[0-9:]+}}, v[{{[0-9:]+}}], s[{{[0-9:]+}}] glc
+define amdgpu_kernel void @test_atomic_mfma_4xi32_atomic64_store(i64 addrspace(1)* %arg) {
+bb:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr inbounds i64, i64 addrspace(1)* %arg, i32 %tid
+  %in.1 = atomicrmw volatile sub i64 addrspace(1)* %gep, i64 1 seq_cst
+  %tmp0 = insertelement <2 x i64> undef, i64 %in.1, i32 0
+  %tmp1 = insertelement <2 x i64> %tmp0, i64 0, i32 1
+  %tmp2 = bitcast <2 x i64> %tmp0 to <4 x i32>
+  %mai.1 = tail call <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32 1, i32 2, <4 x i32> %tmp2, i32 0, i32 0, i32 0)
+  %elt.1 = extractelement <4 x i32> %mai.1, i32 0
+  %elt.2 = extractelement <4 x i32> %mai.1, i32 1
+  %v2.1 = insertelement <2 x i32> undef, i32 %elt.1, i32 0
+  %v2.2 = insertelement <2 x i32> %v2.1, i32 %elt.2, i32 1
+  %v2 = bitcast <2 x i32> %v2.2 to i64
+  %val = atomicrmw volatile add i64 addrspace(1)* %gep, i64 %v2 seq_cst
+  store i64 %val, i64 addrspace(1)* %arg
+  ret void
+}
+
+; NB: both data operands should be VGPR or AGPR
+; GCN-LABEL: {{^}}test_load_mfma_ds2_store:
+; GCN-DAG: ds_read_b128 [[IN:a\[[0-9:]+\]]], v{{[0-9:]+}}
+; GCN-NOT: v_accvgpr_write
+; GCN-DAG: v_mfma_i32_4x4x4i8 a{{\[}}[[N:[0-9]+]]:{{[0-9]+}}], v{{[0-9:]+}}, v{{[0-9:]+}}, [[IN]]
+; GCN-DAG: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}}
+; GCN-NOT: v_accvgpr_read
+; GCN:     ds_write_b32 v{{[0-9]+}}, a[[N]] offset:128
+define amdgpu_kernel void @test_load_mfma_ds2_store(<4 x i32> addrspace(3)* %arg) {
+bb:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep.1 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(3)* %arg, i32 %tid
+  %in.1 = load <4 x i32>, <4 x i32> addrspace(3)* %gep.1
+  %mai.1 = tail call <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32 1, i32 2, <4 x i32> %in.1, i32 0, i32 0, i32 0)
+  %elt = extractelement <4 x i32> %mai.1, i32 0
+  %ptr = bitcast <4 x i32> addrspace(3)* %arg to i32 addrspace(3)*
+  %gep.2 = getelementptr inbounds i32, i32 addrspace(3)* %ptr, i32 32
+  store i32 1, i32 addrspace(3)* %ptr
+  store i32 %elt, i32 addrspace(3)* %gep.2
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_mfma_loop_4xi32:
+; GCN:     global_load_dwordx4 [[IN:a\[[0-9:]+\]]], v{{[0-9:]+}}, s[{{[0-9:]+}}]
+; GCN-NOT: v_accvgpr_write
+; GCN:     v_mfma_i32_4x4x4i8 [[RES:a\[[0-9:]+\]]], v{{[0-9:]+}}, v{{[0-9:]+}}, [[IN]]
+; GCN-NOT: v_accvgpr_read
+; GCN:     global_store_dwordx4 v[{{[0-9:]+}}], [[RES]],
+define amdgpu_kernel void @test_mfma_loop_4xi32(<4 x i32> addrspace(1)* %arg) {
+entry:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg, i32 %tid
+  %in = load <4 x i32>, <4 x i32> addrspace(1)* %gep
+  br label %for.cond.preheader
+
+for.cond.preheader:
+  %phi = phi <4 x i32> [ %in, %entry ], [ %mai.1, %for.cond.preheader ]
+  %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ]
+  %mai.1 = tail call <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32 1, i32 2, <4 x i32> %phi, i32 0, i32 0, i32 0)
+  %inc = add nuw nsw i32 %c, 1
+  %cc = icmp eq i32 %inc, 16
+  br i1 %cc, label %exit, label %for.cond.preheader
+
+exit:
+  store <4 x i32> %mai.1, <4 x i32> addrspace(1)* %gep
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_mfma_loop_32xfloat:
+; GCN-COUNT-8: global_load_dwordx4 a[{{[0-9:]+}}], v{{[0-9:]+}}, s[{{[0-9:]+}}]
+; GCN-NOT:     v_accvgpr_write
+; GCN:         v_mfma_f32_32x32x1f32
+; GCN-NOT:     v_accvgpr_read
+; GCN-COUNT-8: global_store_dwordx4 v[{{[0-9:]+}}], a[{{[0-9:]+}}],
+; GCN:         s_endpgm
+define amdgpu_kernel void @test_mfma_loop_32xfloat(<32 x float> addrspace(1)* %arg) {
+entry:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %arg, i32 %tid
+  %in = load <32 x float>, <32 x float> addrspace(1)* %gep
+  br label %for.cond.preheader
+
+for.cond.preheader:
+  %phi = phi <32 x float> [ %in, %entry ], [ %mai.1, %for.cond.preheader ]
+  %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ]
+  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0)
+  %inc = add nuw nsw i32 %c, 1
+  %cc = icmp eq i32 %inc, 16
+  br i1 %cc, label %exit, label %for.cond.preheader
+
+exit:
+  store <32 x float> %mai.1, <32 x float> addrspace(1)* %gep
+  ret void
+}

diff  --git a/llvm/test/CodeGen/AMDGPU/accvgpr-copy.mir b/llvm/test/CodeGen/AMDGPU/accvgpr-copy.mir
index 9cbf501ce8e5..c3c68e80a1c0 100644
--- a/llvm/test/CodeGen/AMDGPU/accvgpr-copy.mir
+++ b/llvm/test/CodeGen/AMDGPU/accvgpr-copy.mir
@@ -1,5 +1,6 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -march=amdgcn -mcpu=gfx908 -run-pass postrapseudos -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s
+# RUN: llc -march=amdgcn -mcpu=gfx908 -run-pass postrapseudos -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX908 %s
+# RUN: llc -march=amdgcn -mcpu=gfx90a -run-pass postrapseudos -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX90A %s
 
 --- |
     define amdgpu_kernel void @a_to_v() #0 { ret void }
@@ -49,10 +50,14 @@ tracksRegLiveness: true
 body:             |
   bb.0:
     liveins: $agpr0
-    ; GCN-LABEL: name: a_to_v
-    ; GCN: liveins: $agpr0
-    ; GCN: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit $exec
-    ; GCN: S_ENDPGM 0, implicit $vgpr0
+    ; GFX908-LABEL: name: a_to_v
+    ; GFX908: liveins: $agpr0
+    ; GFX908: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit $exec
+    ; GFX908: S_ENDPGM 0, implicit $vgpr0
+    ; GFX90A-LABEL: name: a_to_v
+    ; GFX90A: liveins: $agpr0
+    ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit $exec
+    ; GFX90A: S_ENDPGM 0, implicit $vgpr0
     $vgpr0 = COPY killed $agpr0, implicit $exec
     S_ENDPGM 0, implicit $vgpr0
 ...
@@ -64,11 +69,16 @@ body:             |
   bb.0:
     liveins: $agpr0_agpr1
 
-    ; GCN-LABEL: name: a2_to_v2
-    ; GCN: liveins: $agpr0_agpr1
-    ; GCN: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $agpr0_agpr1
-    ; GCN: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit killed $agpr0_agpr1, implicit $exec
-    ; GCN: S_ENDPGM 0, implicit $vgpr0_vgpr1
+    ; GFX908-LABEL: name: a2_to_v2
+    ; GFX908: liveins: $agpr0_agpr1
+    ; GFX908: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $agpr0_agpr1
+    ; GFX908: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit killed $agpr0_agpr1, implicit $exec
+    ; GFX908: S_ENDPGM 0, implicit $vgpr0_vgpr1
+    ; GFX90A-LABEL: name: a2_to_v2
+    ; GFX90A: liveins: $agpr0_agpr1
+    ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $agpr0_agpr1
+    ; GFX90A: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit killed $agpr0_agpr1, implicit $exec
+    ; GFX90A: S_ENDPGM 0, implicit $vgpr0_vgpr1
     $vgpr0_vgpr1 = COPY killed $agpr0_agpr1, implicit $exec
     S_ENDPGM 0, implicit $vgpr0_vgpr1
 ...
@@ -80,12 +90,18 @@ body:             |
   bb.0:
     liveins: $agpr0_agpr1_agpr2
 
-    ; GCN-LABEL: name: a3_to_v3
-    ; GCN: liveins: $agpr0_agpr1_agpr2
-    ; GCN: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2, implicit $agpr0_agpr1_agpr2
-    ; GCN: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2
-    ; GCN: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit killed $agpr0_agpr1_agpr2, implicit $exec
-    ; GCN: S_ENDPGM 0, implicit $vgpr0_vgpr1_vgpr2
+    ; GFX908-LABEL: name: a3_to_v3
+    ; GFX908: liveins: $agpr0_agpr1_agpr2
+    ; GFX908: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2, implicit $agpr0_agpr1_agpr2
+    ; GFX908: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2
+    ; GFX908: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit killed $agpr0_agpr1_agpr2, implicit $exec
+    ; GFX908: S_ENDPGM 0, implicit $vgpr0_vgpr1_vgpr2
+    ; GFX90A-LABEL: name: a3_to_v3
+    ; GFX90A: liveins: $agpr0_agpr1_agpr2
+    ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2, implicit $agpr0_agpr1_agpr2
+    ; GFX90A: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2
+    ; GFX90A: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit killed $agpr0_agpr1_agpr2, implicit $exec
+    ; GFX90A: S_ENDPGM 0, implicit $vgpr0_vgpr1_vgpr2
     $vgpr0_vgpr1_vgpr2 = COPY killed $agpr0_agpr1_agpr2, implicit $exec
     S_ENDPGM 0, implicit $vgpr0_vgpr1_vgpr2
 ...
@@ -96,13 +112,20 @@ tracksRegLiveness: true
 body:             |
   bb.0:
     liveins: $agpr0_agpr1_agpr2_agpr3
-    ; GCN-LABEL: name: a4_to_v4
-    ; GCN: liveins: $agpr0_agpr1_agpr2_agpr3
-    ; GCN: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $agpr0_agpr1_agpr2_agpr3
-    ; GCN: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3
-    ; GCN: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3
-    ; GCN: $vgpr3 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3, implicit $exec
-    ; GCN: S_ENDPGM 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3
+    ; GFX908-LABEL: name: a4_to_v4
+    ; GFX908: liveins: $agpr0_agpr1_agpr2_agpr3
+    ; GFX908: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $agpr0_agpr1_agpr2_agpr3
+    ; GFX908: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3
+    ; GFX908: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3
+    ; GFX908: $vgpr3 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3, implicit $exec
+    ; GFX908: S_ENDPGM 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3
+    ; GFX90A-LABEL: name: a4_to_v4
+    ; GFX90A: liveins: $agpr0_agpr1_agpr2_agpr3
+    ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $agpr0_agpr1_agpr2_agpr3
+    ; GFX90A: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3
+    ; GFX90A: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3
+    ; GFX90A: $vgpr3 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3, implicit $exec
+    ; GFX90A: S_ENDPGM 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3
     $vgpr0_vgpr1_vgpr2_vgpr3 = COPY killed $agpr0_agpr1_agpr2_agpr3, implicit $exec
     S_ENDPGM 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3
 ...
@@ -114,17 +137,28 @@ body:             |
   bb.0:
     liveins: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
 
-    ; GCN-LABEL: name: a8_to_v8
-    ; GCN: liveins: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
-    ; GCN: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
-    ; GCN: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
-    ; GCN: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
-    ; GCN: $vgpr3 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
-    ; GCN: $vgpr4 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
-    ; GCN: $vgpr5 = V_ACCVGPR_READ_B32_e64 $agpr5, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
-    ; GCN: $vgpr6 = V_ACCVGPR_READ_B32_e64 $agpr6, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
-    ; GCN: $vgpr7 = V_ACCVGPR_READ_B32_e64 $agpr7, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7, implicit $exec
-    ; GCN: S_ENDPGM 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+    ; GFX908-LABEL: name: a8_to_v8
+    ; GFX908: liveins: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
+    ; GFX908: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
+    ; GFX908: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
+    ; GFX908: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
+    ; GFX908: $vgpr3 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
+    ; GFX908: $vgpr4 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
+    ; GFX908: $vgpr5 = V_ACCVGPR_READ_B32_e64 $agpr5, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
+    ; GFX908: $vgpr6 = V_ACCVGPR_READ_B32_e64 $agpr6, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
+    ; GFX908: $vgpr7 = V_ACCVGPR_READ_B32_e64 $agpr7, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7, implicit $exec
+    ; GFX908: S_ENDPGM 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+    ; GFX90A-LABEL: name: a8_to_v8
+    ; GFX90A: liveins: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
+    ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
+    ; GFX90A: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
+    ; GFX90A: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
+    ; GFX90A: $vgpr3 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
+    ; GFX90A: $vgpr4 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
+    ; GFX90A: $vgpr5 = V_ACCVGPR_READ_B32_e64 $agpr5, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
+    ; GFX90A: $vgpr6 = V_ACCVGPR_READ_B32_e64 $agpr6, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
+    ; GFX90A: $vgpr7 = V_ACCVGPR_READ_B32_e64 $agpr7, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7, implicit $exec
+    ; GFX90A: S_ENDPGM 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
     $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7, implicit $exec
     S_ENDPGM 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
 ...
@@ -135,25 +169,44 @@ tracksRegLiveness: true
 body:             |
   bb.0:
     liveins: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
-    ; GCN-LABEL: name: a16_to_v16
-    ; GCN: liveins: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
-    ; GCN: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
-    ; GCN: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
-    ; GCN: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
-    ; GCN: $vgpr3 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
-    ; GCN: $vgpr4 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
-    ; GCN: $vgpr5 = V_ACCVGPR_READ_B32_e64 $agpr5, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
-    ; GCN: $vgpr6 = V_ACCVGPR_READ_B32_e64 $agpr6, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
-    ; GCN: $vgpr7 = V_ACCVGPR_READ_B32_e64 $agpr7, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
-    ; GCN: $vgpr8 = V_ACCVGPR_READ_B32_e64 $agpr8, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
-    ; GCN: $vgpr9 = V_ACCVGPR_READ_B32_e64 $agpr9, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
-    ; GCN: $vgpr10 = V_ACCVGPR_READ_B32_e64 $agpr10, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
-    ; GCN: $vgpr11 = V_ACCVGPR_READ_B32_e64 $agpr11, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
-    ; GCN: $vgpr12 = V_ACCVGPR_READ_B32_e64 $agpr12, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
-    ; GCN: $vgpr13 = V_ACCVGPR_READ_B32_e64 $agpr13, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
-    ; GCN: $vgpr14 = V_ACCVGPR_READ_B32_e64 $agpr14, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
-    ; GCN: $vgpr15 = V_ACCVGPR_READ_B32_e64 $agpr15, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, implicit $exec
-    ; GCN: S_ENDPGM 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+    ; GFX908-LABEL: name: a16_to_v16
+    ; GFX908: liveins: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; GFX908: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; GFX908: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; GFX908: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; GFX908: $vgpr3 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; GFX908: $vgpr4 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; GFX908: $vgpr5 = V_ACCVGPR_READ_B32_e64 $agpr5, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; GFX908: $vgpr6 = V_ACCVGPR_READ_B32_e64 $agpr6, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; GFX908: $vgpr7 = V_ACCVGPR_READ_B32_e64 $agpr7, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; GFX908: $vgpr8 = V_ACCVGPR_READ_B32_e64 $agpr8, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; GFX908: $vgpr9 = V_ACCVGPR_READ_B32_e64 $agpr9, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; GFX908: $vgpr10 = V_ACCVGPR_READ_B32_e64 $agpr10, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; GFX908: $vgpr11 = V_ACCVGPR_READ_B32_e64 $agpr11, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; GFX908: $vgpr12 = V_ACCVGPR_READ_B32_e64 $agpr12, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; GFX908: $vgpr13 = V_ACCVGPR_READ_B32_e64 $agpr13, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; GFX908: $vgpr14 = V_ACCVGPR_READ_B32_e64 $agpr14, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; GFX908: $vgpr15 = V_ACCVGPR_READ_B32_e64 $agpr15, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, implicit $exec
+    ; GFX908: S_ENDPGM 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+    ; GFX90A-LABEL: name: a16_to_v16
+    ; GFX90A: liveins: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; GFX90A: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; GFX90A: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; GFX90A: $vgpr3 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; GFX90A: $vgpr4 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; GFX90A: $vgpr5 = V_ACCVGPR_READ_B32_e64 $agpr5, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; GFX90A: $vgpr6 = V_ACCVGPR_READ_B32_e64 $agpr6, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; GFX90A: $vgpr7 = V_ACCVGPR_READ_B32_e64 $agpr7, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; GFX90A: $vgpr8 = V_ACCVGPR_READ_B32_e64 $agpr8, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; GFX90A: $vgpr9 = V_ACCVGPR_READ_B32_e64 $agpr9, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; GFX90A: $vgpr10 = V_ACCVGPR_READ_B32_e64 $agpr10, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; GFX90A: $vgpr11 = V_ACCVGPR_READ_B32_e64 $agpr11, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; GFX90A: $vgpr12 = V_ACCVGPR_READ_B32_e64 $agpr12, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; GFX90A: $vgpr13 = V_ACCVGPR_READ_B32_e64 $agpr13, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; GFX90A: $vgpr14 = V_ACCVGPR_READ_B32_e64 $agpr14, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; GFX90A: $vgpr15 = V_ACCVGPR_READ_B32_e64 $agpr15, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, implicit $exec
+    ; GFX90A: S_ENDPGM 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
     $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, implicit $exec
     S_ENDPGM 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
 ...
@@ -164,10 +217,14 @@ tracksRegLiveness: true
 body:             |
   bb.0:
     liveins: $vgpr0
-    ; GCN-LABEL: name: v_to_a
-    ; GCN: liveins: $vgpr0
-    ; GCN: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit $exec
-    ; GCN: S_ENDPGM 0, implicit $agpr0
+    ; GFX908-LABEL: name: v_to_a
+    ; GFX908: liveins: $vgpr0
+    ; GFX908: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit $exec
+    ; GFX908: S_ENDPGM 0, implicit $agpr0
+    ; GFX90A-LABEL: name: v_to_a
+    ; GFX90A: liveins: $vgpr0
+    ; GFX90A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit $exec
+    ; GFX90A: S_ENDPGM 0, implicit $agpr0
     $agpr0 = COPY killed $vgpr0, implicit $exec
     S_ENDPGM 0, implicit $agpr0
 ...
@@ -178,11 +235,16 @@ tracksRegLiveness: true
 body:             |
   bb.0:
     liveins: $vgpr0_vgpr1
-    ; GCN-LABEL: name: v2_to_a2
-    ; GCN: liveins: $vgpr0_vgpr1
-    ; GCN: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1, implicit $vgpr0_vgpr1
-    ; GCN: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit killed $vgpr0_vgpr1, implicit $exec
-    ; GCN: S_ENDPGM 0, implicit $agpr0_agpr1
+    ; GFX908-LABEL: name: v2_to_a2
+    ; GFX908: liveins: $vgpr0_vgpr1
+    ; GFX908: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1, implicit $vgpr0_vgpr1
+    ; GFX908: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit killed $vgpr0_vgpr1, implicit $exec
+    ; GFX908: S_ENDPGM 0, implicit $agpr0_agpr1
+    ; GFX90A-LABEL: name: v2_to_a2
+    ; GFX90A: liveins: $vgpr0_vgpr1
+    ; GFX90A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1, implicit $vgpr0_vgpr1
+    ; GFX90A: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit killed $vgpr0_vgpr1, implicit $exec
+    ; GFX90A: S_ENDPGM 0, implicit $agpr0_agpr1
     $agpr0_agpr1 = COPY killed $vgpr0_vgpr1, implicit $exec
     S_ENDPGM 0, implicit $agpr0_agpr1
 ...
@@ -193,12 +255,18 @@ tracksRegLiveness: true
 body:             |
   bb.0:
     liveins: $vgpr0_vgpr1_vgpr2
-    ; GCN-LABEL: name: v3_to_a3
-    ; GCN: liveins: $vgpr0_vgpr1_vgpr2
-    ; GCN: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2, implicit $vgpr0_vgpr1_vgpr2
-    ; GCN: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2
-    ; GCN: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2, implicit $exec
-    ; GCN: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2
+    ; GFX908-LABEL: name: v3_to_a3
+    ; GFX908: liveins: $vgpr0_vgpr1_vgpr2
+    ; GFX908: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2, implicit $vgpr0_vgpr1_vgpr2
+    ; GFX908: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2
+    ; GFX908: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2, implicit $exec
+    ; GFX908: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2
+    ; GFX90A-LABEL: name: v3_to_a3
+    ; GFX90A: liveins: $vgpr0_vgpr1_vgpr2
+    ; GFX90A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2, implicit $vgpr0_vgpr1_vgpr2
+    ; GFX90A: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2
+    ; GFX90A: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2, implicit $exec
+    ; GFX90A: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2
     $agpr0_agpr1_agpr2 = COPY killed $vgpr0_vgpr1_vgpr2, implicit $exec
     S_ENDPGM 0, implicit $agpr0_agpr1_agpr2
 ...
@@ -209,13 +277,20 @@ tracksRegLiveness: true
 body:             |
   bb.0:
     liveins: $vgpr0_vgpr1_vgpr2_vgpr3
-    ; GCN-LABEL: name: v4_to_a4
-    ; GCN: liveins: $vgpr0_vgpr1_vgpr2_vgpr3
-    ; GCN: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3, implicit $vgpr0_vgpr1_vgpr2_vgpr3
-    ; GCN: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3
-    ; GCN: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3
-    ; GCN: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr3, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec
-    ; GCN: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3
+    ; GFX908-LABEL: name: v4_to_a4
+    ; GFX908: liveins: $vgpr0_vgpr1_vgpr2_vgpr3
+    ; GFX908: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3, implicit $vgpr0_vgpr1_vgpr2_vgpr3
+    ; GFX908: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3
+    ; GFX908: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3
+    ; GFX908: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr3, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec
+    ; GFX908: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3
+    ; GFX90A-LABEL: name: v4_to_a4
+    ; GFX90A: liveins: $vgpr0_vgpr1_vgpr2_vgpr3
+    ; GFX90A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3, implicit $vgpr0_vgpr1_vgpr2_vgpr3
+    ; GFX90A: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3
+    ; GFX90A: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3
+    ; GFX90A: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr3, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec
+    ; GFX90A: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3
     $agpr0_agpr1_agpr2_agpr3 = COPY killed $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec
     S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3
 ...
@@ -226,17 +301,28 @@ tracksRegLiveness: true
 body:             |
   bb.0:
     liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
-    ; GCN-LABEL: name: v8_to_a8
-    ; GCN: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
-    ; GCN: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
-    ; GCN: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
-    ; GCN: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
-    ; GCN: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr3, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
-    ; GCN: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $vgpr4, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
-    ; GCN: $agpr5 = V_ACCVGPR_WRITE_B32_e64 $vgpr5, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
-    ; GCN: $agpr6 = V_ACCVGPR_WRITE_B32_e64 $vgpr6, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
-    ; GCN: $agpr7 = V_ACCVGPR_WRITE_B32_e64 $vgpr7, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
-    ; GCN: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
+    ; GFX908-LABEL: name: v8_to_a8
+    ; GFX908: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+    ; GFX908: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+    ; GFX908: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+    ; GFX908: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+    ; GFX908: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr3, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+    ; GFX908: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $vgpr4, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+    ; GFX908: $agpr5 = V_ACCVGPR_WRITE_B32_e64 $vgpr5, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+    ; GFX908: $agpr6 = V_ACCVGPR_WRITE_B32_e64 $vgpr6, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+    ; GFX908: $agpr7 = V_ACCVGPR_WRITE_B32_e64 $vgpr7, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+    ; GFX908: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
+    ; GFX90A-LABEL: name: v8_to_a8
+    ; GFX90A: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+    ; GFX90A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+    ; GFX90A: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+    ; GFX90A: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+    ; GFX90A: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr3, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+    ; GFX90A: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $vgpr4, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+    ; GFX90A: $agpr5 = V_ACCVGPR_WRITE_B32_e64 $vgpr5, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+    ; GFX90A: $agpr6 = V_ACCVGPR_WRITE_B32_e64 $vgpr6, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+    ; GFX90A: $agpr7 = V_ACCVGPR_WRITE_B32_e64 $vgpr7, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+    ; GFX90A: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
     $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 = COPY killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
     S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
 ...
@@ -247,25 +333,44 @@ tracksRegLiveness: true
 body:             |
   bb.0:
     liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-    ; GCN-LABEL: name: v16_to_a16
-    ; GCN: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-    ; GCN: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-    ; GCN: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-    ; GCN: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-    ; GCN: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr3, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-    ; GCN: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $vgpr4, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-    ; GCN: $agpr5 = V_ACCVGPR_WRITE_B32_e64 $vgpr5, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-    ; GCN: $agpr6 = V_ACCVGPR_WRITE_B32_e64 $vgpr6, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-    ; GCN: $agpr7 = V_ACCVGPR_WRITE_B32_e64 $vgpr7, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-    ; GCN: $agpr8 = V_ACCVGPR_WRITE_B32_e64 $vgpr8, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-    ; GCN: $agpr9 = V_ACCVGPR_WRITE_B32_e64 $vgpr9, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-    ; GCN: $agpr10 = V_ACCVGPR_WRITE_B32_e64 $vgpr10, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-    ; GCN: $agpr11 = V_ACCVGPR_WRITE_B32_e64 $vgpr11, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-    ; GCN: $agpr12 = V_ACCVGPR_WRITE_B32_e64 $vgpr12, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-    ; GCN: $agpr13 = V_ACCVGPR_WRITE_B32_e64 $vgpr13, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-    ; GCN: $agpr14 = V_ACCVGPR_WRITE_B32_e64 $vgpr14, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-    ; GCN: $agpr15 = V_ACCVGPR_WRITE_B32_e64 $vgpr15, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $exec
-    ; GCN: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; GFX908-LABEL: name: v16_to_a16
+    ; GFX908: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+    ; GFX908: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+    ; GFX908: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+    ; GFX908: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+    ; GFX908: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr3, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+    ; GFX908: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $vgpr4, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+    ; GFX908: $agpr5 = V_ACCVGPR_WRITE_B32_e64 $vgpr5, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+    ; GFX908: $agpr6 = V_ACCVGPR_WRITE_B32_e64 $vgpr6, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+    ; GFX908: $agpr7 = V_ACCVGPR_WRITE_B32_e64 $vgpr7, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+    ; GFX908: $agpr8 = V_ACCVGPR_WRITE_B32_e64 $vgpr8, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+    ; GFX908: $agpr9 = V_ACCVGPR_WRITE_B32_e64 $vgpr9, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+    ; GFX908: $agpr10 = V_ACCVGPR_WRITE_B32_e64 $vgpr10, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+    ; GFX908: $agpr11 = V_ACCVGPR_WRITE_B32_e64 $vgpr11, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+    ; GFX908: $agpr12 = V_ACCVGPR_WRITE_B32_e64 $vgpr12, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+    ; GFX908: $agpr13 = V_ACCVGPR_WRITE_B32_e64 $vgpr13, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+    ; GFX908: $agpr14 = V_ACCVGPR_WRITE_B32_e64 $vgpr14, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+    ; GFX908: $agpr15 = V_ACCVGPR_WRITE_B32_e64 $vgpr15, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $exec
+    ; GFX908: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; GFX90A-LABEL: name: v16_to_a16
+    ; GFX90A: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+    ; GFX90A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+    ; GFX90A: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+    ; GFX90A: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+    ; GFX90A: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr3, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+    ; GFX90A: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $vgpr4, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+    ; GFX90A: $agpr5 = V_ACCVGPR_WRITE_B32_e64 $vgpr5, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+    ; GFX90A: $agpr6 = V_ACCVGPR_WRITE_B32_e64 $vgpr6, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+    ; GFX90A: $agpr7 = V_ACCVGPR_WRITE_B32_e64 $vgpr7, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+    ; GFX90A: $agpr8 = V_ACCVGPR_WRITE_B32_e64 $vgpr8, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+    ; GFX90A: $agpr9 = V_ACCVGPR_WRITE_B32_e64 $vgpr9, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+    ; GFX90A: $agpr10 = V_ACCVGPR_WRITE_B32_e64 $vgpr10, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+    ; GFX90A: $agpr11 = V_ACCVGPR_WRITE_B32_e64 $vgpr11, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+    ; GFX90A: $agpr12 = V_ACCVGPR_WRITE_B32_e64 $vgpr12, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+    ; GFX90A: $agpr13 = V_ACCVGPR_WRITE_B32_e64 $vgpr13, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+    ; GFX90A: $agpr14 = V_ACCVGPR_WRITE_B32_e64 $vgpr14, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+    ; GFX90A: $agpr15 = V_ACCVGPR_WRITE_B32_e64 $vgpr15, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $exec
+    ; GFX90A: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
     $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = COPY killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $exec
     S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
 ...
@@ -276,11 +381,16 @@ tracksRegLiveness: true
 body:             |
   bb.0:
     liveins: $sgpr0
-    ; GCN-LABEL: name: s_to_a
-    ; GCN: liveins: $sgpr0
-    ; GCN: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec
-    ; GCN: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit $exec
-    ; GCN: S_ENDPGM 0, implicit $agpr0
+    ; GFX908-LABEL: name: s_to_a
+    ; GFX908: liveins: $sgpr0
+    ; GFX908: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec
+    ; GFX908: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit $exec
+    ; GFX908: S_ENDPGM 0, implicit $agpr0
+    ; GFX90A-LABEL: name: s_to_a
+    ; GFX90A: liveins: $sgpr0
+    ; GFX90A: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec
+    ; GFX90A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit $exec
+    ; GFX90A: S_ENDPGM 0, implicit $agpr0
     $agpr0 = COPY killed $sgpr0, implicit $exec
     S_ENDPGM 0, implicit $agpr0
 ...
@@ -291,13 +401,20 @@ tracksRegLiveness: true
 body:             |
   bb.0:
     liveins: $sgpr0_sgpr1
-    ; GCN-LABEL: name: s2_to_a2
-    ; GCN: liveins: $sgpr0_sgpr1
-    ; GCN: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit $sgpr0_sgpr1
-    ; GCN: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1
-    ; GCN: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit killed $sgpr0_sgpr1
-    ; GCN: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit $exec
-    ; GCN: S_ENDPGM 0, implicit $agpr0_agpr1
+    ; GFX908-LABEL: name: s2_to_a2
+    ; GFX908: liveins: $sgpr0_sgpr1
+    ; GFX908: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit $sgpr0_sgpr1
+    ; GFX908: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1
+    ; GFX908: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit killed $sgpr0_sgpr1
+    ; GFX908: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit $exec
+    ; GFX908: S_ENDPGM 0, implicit $agpr0_agpr1
+    ; GFX90A-LABEL: name: s2_to_a2
+    ; GFX90A: liveins: $sgpr0_sgpr1
+    ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit $sgpr0_sgpr1
+    ; GFX90A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1
+    ; GFX90A: $vgpr0 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit killed $sgpr0_sgpr1
+    ; GFX90A: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit $exec
+    ; GFX90A: S_ENDPGM 0, implicit $agpr0_agpr1
     $agpr0_agpr1 = COPY killed $sgpr0_sgpr1, implicit $exec
     S_ENDPGM 0, implicit $agpr0_agpr1
 ...
@@ -308,15 +425,24 @@ tracksRegLiveness: true
 body:             |
   bb.0:
     liveins: $sgpr0_sgpr1_sgpr2
-    ; GCN-LABEL: name: s3_to_a3
-    ; GCN: liveins: $sgpr0_sgpr1_sgpr2
-    ; GCN: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2
-    ; GCN: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2
-    ; GCN: $vgpr1 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2
-    ; GCN: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec
-    ; GCN: $vgpr2 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2
-    ; GCN: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec, implicit $exec
-    ; GCN: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2
+    ; GFX908-LABEL: name: s3_to_a3
+    ; GFX908: liveins: $sgpr0_sgpr1_sgpr2
+    ; GFX908: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2
+    ; GFX908: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2
+    ; GFX908: $vgpr1 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2
+    ; GFX908: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec
+    ; GFX908: $vgpr2 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2
+    ; GFX908: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec, implicit $exec
+    ; GFX908: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2
+    ; GFX90A-LABEL: name: s3_to_a3
+    ; GFX90A: liveins: $sgpr0_sgpr1_sgpr2
+    ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2
+    ; GFX90A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2
+    ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2
+    ; GFX90A: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec
+    ; GFX90A: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2
+    ; GFX90A: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit $exec
+    ; GFX90A: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2
     $agpr0_agpr1_agpr2 = COPY killed $sgpr0_sgpr1_sgpr2, implicit $exec
     S_ENDPGM 0, implicit $agpr0_agpr1_agpr2
 ...
@@ -327,17 +453,28 @@ tracksRegLiveness: true
 body:             |
   bb.0:
     liveins: $sgpr0_sgpr1_sgpr2_sgpr3
-    ; GCN-LABEL: name: s4_to_a4
-    ; GCN: liveins: $sgpr0_sgpr1_sgpr2_sgpr3
-    ; GCN: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3
-    ; GCN: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3
-    ; GCN: $vgpr1 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3
-    ; GCN: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec
-    ; GCN: $vgpr2 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3
-    ; GCN: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec
-    ; GCN: $vgpr0 = V_MOV_B32_e32 killed $sgpr3, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3
-    ; GCN: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit $exec
-    ; GCN: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3
+    ; GFX908-LABEL: name: s4_to_a4
+    ; GFX908: liveins: $sgpr0_sgpr1_sgpr2_sgpr3
+    ; GFX908: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3
+    ; GFX908: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3
+    ; GFX908: $vgpr1 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3
+    ; GFX908: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec
+    ; GFX908: $vgpr2 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3
+    ; GFX908: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec
+    ; GFX908: $vgpr0 = V_MOV_B32_e32 killed $sgpr3, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3
+    ; GFX908: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit $exec
+    ; GFX908: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3
+    ; GFX90A-LABEL: name: s4_to_a4
+    ; GFX90A: liveins: $sgpr0_sgpr1_sgpr2_sgpr3
+    ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3
+    ; GFX90A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3
+    ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3
+    ; GFX90A: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec
+    ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3
+    ; GFX90A: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec
+    ; GFX90A: $vgpr0 = V_MOV_B32_e32 killed $sgpr3, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3
+    ; GFX90A: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit $exec
+    ; GFX90A: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3
     $agpr0_agpr1_agpr2_agpr3 = COPY killed $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec
     S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3
 ...
@@ -348,21 +485,36 @@ tracksRegLiveness: true
 body:             |
   bb.0:
     liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5
-    ; GCN-LABEL: name: s6_to_a6
-    ; GCN: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5
-    ; GCN: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5
-    ; GCN: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5
-    ; GCN: $vgpr1 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5
-    ; GCN: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec
-    ; GCN: $vgpr2 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5
-    ; GCN: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec
-    ; GCN: $vgpr0 = V_MOV_B32_e32 $sgpr3, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5
-    ; GCN: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec
-    ; GCN: $vgpr1 = V_MOV_B32_e32 $sgpr4, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5
-    ; GCN: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec
-    ; GCN: $vgpr2 = V_MOV_B32_e32 killed $sgpr5, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5
-    ; GCN: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec, implicit $exec
-    ; GCN: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5
+    ; GFX908-LABEL: name: s6_to_a6
+    ; GFX908: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5
+    ; GFX908: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5
+    ; GFX908: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5
+    ; GFX908: $vgpr1 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5
+    ; GFX908: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec
+    ; GFX908: $vgpr2 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5
+    ; GFX908: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec
+    ; GFX908: $vgpr0 = V_MOV_B32_e32 $sgpr3, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5
+    ; GFX908: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec
+    ; GFX908: $vgpr1 = V_MOV_B32_e32 $sgpr4, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5
+    ; GFX908: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec
+    ; GFX908: $vgpr2 = V_MOV_B32_e32 killed $sgpr5, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5
+    ; GFX908: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec, implicit $exec
+    ; GFX908: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5
+    ; GFX90A-LABEL: name: s6_to_a6
+    ; GFX90A: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5
+    ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5
+    ; GFX90A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5
+    ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5
+    ; GFX90A: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec
+    ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5
+    ; GFX90A: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec
+    ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr3, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5
+    ; GFX90A: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec
+    ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr4, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5
+    ; GFX90A: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec
+    ; GFX90A: $vgpr0 = V_MOV_B32_e32 killed $sgpr5, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5
+    ; GFX90A: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit $exec
+    ; GFX90A: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5
     $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 = COPY killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5, implicit $exec
     S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5
 ...
@@ -373,25 +525,44 @@ tracksRegLiveness: true
 body:             |
   bb.0:
     liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
-    ; GCN-LABEL: name: s8_to_a8
-    ; GCN: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
-    ; GCN: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
-    ; GCN: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
-    ; GCN: $vgpr1 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
-    ; GCN: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec
-    ; GCN: $vgpr2 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
-    ; GCN: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec
-    ; GCN: $vgpr0 = V_MOV_B32_e32 $sgpr3, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
-    ; GCN: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec
-    ; GCN: $vgpr1 = V_MOV_B32_e32 $sgpr4, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
-    ; GCN: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec
-    ; GCN: $vgpr2 = V_MOV_B32_e32 $sgpr5, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
-    ; GCN: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec
-    ; GCN: $vgpr0 = V_MOV_B32_e32 $sgpr6, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
-    ; GCN: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec
-    ; GCN: $vgpr1 = V_MOV_B32_e32 killed $sgpr7, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
-    ; GCN: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit $exec
-    ; GCN: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
+    ; GFX908-LABEL: name: s8_to_a8
+    ; GFX908: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
+    ; GFX908: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
+    ; GFX908: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
+    ; GFX908: $vgpr1 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
+    ; GFX908: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec
+    ; GFX908: $vgpr2 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
+    ; GFX908: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec
+    ; GFX908: $vgpr0 = V_MOV_B32_e32 $sgpr3, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
+    ; GFX908: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec
+    ; GFX908: $vgpr1 = V_MOV_B32_e32 $sgpr4, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
+    ; GFX908: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec
+    ; GFX908: $vgpr2 = V_MOV_B32_e32 $sgpr5, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
+    ; GFX908: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec
+    ; GFX908: $vgpr0 = V_MOV_B32_e32 $sgpr6, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
+    ; GFX908: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec
+    ; GFX908: $vgpr1 = V_MOV_B32_e32 killed $sgpr7, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
+    ; GFX908: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit $exec
+    ; GFX908: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
+    ; GFX90A-LABEL: name: s8_to_a8
+    ; GFX90A: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
+    ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
+    ; GFX90A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
+    ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
+    ; GFX90A: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec
+    ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
+    ; GFX90A: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec
+    ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr3, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
+    ; GFX90A: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec
+    ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr4, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
+    ; GFX90A: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec
+    ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr5, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
+    ; GFX90A: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec
+    ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr6, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
+    ; GFX90A: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec
+    ; GFX90A: $vgpr0 = V_MOV_B32_e32 killed $sgpr7, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
+    ; GFX90A: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit $exec
+    ; GFX90A: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
     $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 = COPY killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $exec
     S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
 ...
@@ -402,41 +573,76 @@ tracksRegLiveness: true
 body:             |
   bb.0:
     liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
-    ; GCN-LABEL: name: s16_to_a16
-    ; GCN: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
-    ; GCN: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
-    ; GCN: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
-    ; GCN: $vgpr1 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
-    ; GCN: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec
-    ; GCN: $vgpr2 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
-    ; GCN: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec
-    ; GCN: $vgpr0 = V_MOV_B32_e32 $sgpr3, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
-    ; GCN: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec
-    ; GCN: $vgpr1 = V_MOV_B32_e32 $sgpr4, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
-    ; GCN: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec
-    ; GCN: $vgpr2 = V_MOV_B32_e32 $sgpr5, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
-    ; GCN: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec
-    ; GCN: $vgpr0 = V_MOV_B32_e32 $sgpr6, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
-    ; GCN: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec
-    ; GCN: $vgpr1 = V_MOV_B32_e32 $sgpr7, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
-    ; GCN: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec
-    ; GCN: $vgpr2 = V_MOV_B32_e32 $sgpr8, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
-    ; GCN: $agpr8 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec
-    ; GCN: $vgpr0 = V_MOV_B32_e32 $sgpr9, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
-    ; GCN: $agpr9 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec
-    ; GCN: $vgpr1 = V_MOV_B32_e32 $sgpr10, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
-    ; GCN: $agpr10 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec
-    ; GCN: $vgpr2 = V_MOV_B32_e32 $sgpr11, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
-    ; GCN: $agpr11 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec
-    ; GCN: $vgpr0 = V_MOV_B32_e32 $sgpr12, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
-    ; GCN: $agpr12 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec
-    ; GCN: $vgpr1 = V_MOV_B32_e32 $sgpr13, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
-    ; GCN: $agpr13 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec
-    ; GCN: $vgpr2 = V_MOV_B32_e32 $sgpr14, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
-    ; GCN: $agpr14 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec
-    ; GCN: $vgpr0 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
-    ; GCN: $agpr15 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit $exec
-    ; GCN: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; GFX908-LABEL: name: s16_to_a16
+    ; GFX908: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+    ; GFX908: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+    ; GFX908: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; GFX908: $vgpr1 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+    ; GFX908: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec
+    ; GFX908: $vgpr2 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+    ; GFX908: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec
+    ; GFX908: $vgpr0 = V_MOV_B32_e32 $sgpr3, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+    ; GFX908: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec
+    ; GFX908: $vgpr1 = V_MOV_B32_e32 $sgpr4, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+    ; GFX908: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec
+    ; GFX908: $vgpr2 = V_MOV_B32_e32 $sgpr5, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+    ; GFX908: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec
+    ; GFX908: $vgpr0 = V_MOV_B32_e32 $sgpr6, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+    ; GFX908: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec
+    ; GFX908: $vgpr1 = V_MOV_B32_e32 $sgpr7, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+    ; GFX908: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec
+    ; GFX908: $vgpr2 = V_MOV_B32_e32 $sgpr8, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+    ; GFX908: $agpr8 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec
+    ; GFX908: $vgpr0 = V_MOV_B32_e32 $sgpr9, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+    ; GFX908: $agpr9 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec
+    ; GFX908: $vgpr1 = V_MOV_B32_e32 $sgpr10, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+    ; GFX908: $agpr10 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec
+    ; GFX908: $vgpr2 = V_MOV_B32_e32 $sgpr11, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+    ; GFX908: $agpr11 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec
+    ; GFX908: $vgpr0 = V_MOV_B32_e32 $sgpr12, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+    ; GFX908: $agpr12 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec
+    ; GFX908: $vgpr1 = V_MOV_B32_e32 $sgpr13, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+    ; GFX908: $agpr13 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec
+    ; GFX908: $vgpr2 = V_MOV_B32_e32 $sgpr14, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+    ; GFX908: $agpr14 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec
+    ; GFX908: $vgpr0 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+    ; GFX908: $agpr15 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit $exec
+    ; GFX908: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; GFX90A-LABEL: name: s16_to_a16
+    ; GFX90A: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+    ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+    ; GFX90A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+    ; GFX90A: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec
+    ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+    ; GFX90A: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec
+    ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr3, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+    ; GFX90A: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec
+    ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr4, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+    ; GFX90A: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec
+    ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr5, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+    ; GFX90A: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec
+    ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr6, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+    ; GFX90A: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec
+    ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr7, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+    ; GFX90A: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec
+    ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr8, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+    ; GFX90A: $agpr8 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec
+    ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr9, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+    ; GFX90A: $agpr9 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec
+    ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr10, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+    ; GFX90A: $agpr10 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec
+    ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr11, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+    ; GFX90A: $agpr11 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec
+    ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr12, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+    ; GFX90A: $agpr12 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec
+    ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr13, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+    ; GFX90A: $agpr13 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec
+    ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr14, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+    ; GFX90A: $agpr14 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec
+    ; GFX90A: $vgpr0 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+    ; GFX90A: $agpr15 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit $exec
+    ; GFX90A: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
     $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = COPY killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, implicit $exec
     S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
 ...
@@ -446,11 +652,15 @@ name:            a_to_a
 tracksRegLiveness: true
 body:             |
   bb.0:
-    ; GCN-LABEL: name: a_to_a
-    ; GCN: $agpr1 = IMPLICIT_DEF
-    ; GCN: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec
-    ; GCN: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit $exec
-    ; GCN: S_ENDPGM 0, implicit $agpr0
+    ; GFX908-LABEL: name: a_to_a
+    ; GFX908: $agpr1 = IMPLICIT_DEF
+    ; GFX908: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec
+    ; GFX908: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit $exec
+    ; GFX908: S_ENDPGM 0, implicit $agpr0
+    ; GFX90A-LABEL: name: a_to_a
+    ; GFX90A: $agpr1 = IMPLICIT_DEF
+    ; GFX90A: $agpr0 = V_ACCVGPR_MOV_B32 killed $agpr1, implicit $exec, implicit $exec
+    ; GFX90A: S_ENDPGM 0, implicit $agpr0
     $agpr1 = IMPLICIT_DEF
     $agpr0 = COPY killed $agpr1, implicit $exec
     S_ENDPGM 0, implicit $agpr0
@@ -462,14 +672,22 @@ tracksRegLiveness: true
 body:             |
   bb.0:
     liveins: $agpr0_agpr1
-    ; GCN-LABEL: name: a2_to_a2_kill
-    ; GCN: liveins: $agpr0_agpr1
-    ; GCN: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1
-    ; GCN: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit-def $agpr1_agpr2
-    ; GCN: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $agpr0_agpr1
-    ; GCN: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit $exec
-    ; GCN: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec
-    ; GCN: S_ENDPGM 0, implicit $agpr1, implicit $agpr2, implicit $agpr3
+    ; GFX908-LABEL: name: a2_to_a2_kill
+    ; GFX908: liveins: $agpr0_agpr1
+    ; GFX908: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1
+    ; GFX908: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit-def $agpr1_agpr2
+    ; GFX908: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $agpr0_agpr1
+    ; GFX908: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit $exec
+    ; GFX908: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec
+    ; GFX908: S_ENDPGM 0, implicit $agpr1, implicit $agpr2, implicit $agpr3
+    ; GFX90A-LABEL: name: a2_to_a2_kill
+    ; GFX90A: liveins: $agpr0_agpr1
+    ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1
+    ; GFX90A: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr1_agpr2
+    ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $agpr0_agpr1
+    ; GFX90A: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit $exec
+    ; GFX90A: $agpr3 = V_ACCVGPR_MOV_B32 $agpr2, implicit $exec
+    ; GFX90A: S_ENDPGM 0, implicit $agpr1, implicit $agpr2, implicit $agpr3
     $agpr1_agpr2 = COPY killed $agpr0_agpr1, implicit $exec
     $agpr3 = COPY $agpr2
     S_ENDPGM 0, implicit $agpr1, implicit $agpr2, implicit $agpr3
@@ -481,15 +699,24 @@ tracksRegLiveness: true
 body:             |
   bb.0:
     liveins: $agpr4_agpr5_agpr6
-    ; GCN-LABEL: name: a3_to_a3_nonoverlap_kill
-    ; GCN: liveins: $agpr4_agpr5_agpr6
-    ; GCN: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec, implicit $agpr4_agpr5_agpr6
-    ; GCN: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2
-    ; GCN: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr5, implicit $exec, implicit $agpr4_agpr5_agpr6
-    ; GCN: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec
-    ; GCN: $vgpr2 = V_ACCVGPR_READ_B32_e64 killed $agpr6, implicit $exec, implicit killed $agpr4_agpr5_agpr6
-    ; GCN: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec
-    ; GCN: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2
+    ; GFX908-LABEL: name: a3_to_a3_nonoverlap_kill
+    ; GFX908: liveins: $agpr4_agpr5_agpr6
+    ; GFX908: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec, implicit $agpr4_agpr5_agpr6
+    ; GFX908: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2
+    ; GFX908: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr5, implicit $exec, implicit $agpr4_agpr5_agpr6
+    ; GFX908: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec
+    ; GFX908: $vgpr2 = V_ACCVGPR_READ_B32_e64 killed $agpr6, implicit $exec, implicit killed $agpr4_agpr5_agpr6
+    ; GFX908: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec
+    ; GFX908: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2
+    ; GFX90A-LABEL: name: a3_to_a3_nonoverlap_kill
+    ; GFX90A: liveins: $agpr4_agpr5_agpr6
+    ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec, implicit $agpr4_agpr5_agpr6
+    ; GFX90A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2
+    ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr5, implicit $exec, implicit $agpr4_agpr5_agpr6
+    ; GFX90A: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec
+    ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr6, implicit $exec, implicit killed $agpr4_agpr5_agpr6
+    ; GFX90A: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec
+    ; GFX90A: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2
     $agpr0_agpr1_agpr2 = COPY killed $agpr4_agpr5_agpr6
     S_ENDPGM 0, implicit $agpr0_agpr1_agpr2
 ...
@@ -500,15 +727,24 @@ tracksRegLiveness: true
 body:             |
   bb.0:
     liveins: $agpr1_agpr2_agpr3
-    ; GCN-LABEL: name: a3_to_a3_overlap_kill
-    ; GCN: liveins: $agpr1_agpr2_agpr3
-    ; GCN: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr1_agpr2_agpr3
-    ; GCN: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2
-    ; GCN: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr1_agpr2_agpr3
-    ; GCN: $vgpr4 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit $agpr1_agpr2_agpr3
-    ; GCN: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr4, implicit $exec
-    ; GCN: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec
-    ; GCN: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2, implicit $vgpr1
+    ; GFX908-LABEL: name: a3_to_a3_overlap_kill
+    ; GFX908: liveins: $agpr1_agpr2_agpr3
+    ; GFX908: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr1_agpr2_agpr3
+    ; GFX908: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2
+    ; GFX908: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr1_agpr2_agpr3
+    ; GFX908: $vgpr4 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit $agpr1_agpr2_agpr3
+    ; GFX908: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr4, implicit $exec
+    ; GFX908: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec
+    ; GFX908: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2, implicit $vgpr1
+    ; GFX90A-LABEL: name: a3_to_a3_overlap_kill
+    ; GFX90A: liveins: $agpr1_agpr2_agpr3
+    ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr1_agpr2_agpr3
+    ; GFX90A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2
+    ; GFX90A: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr1_agpr2_agpr3
+    ; GFX90A: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit $agpr1_agpr2_agpr3
+    ; GFX90A: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec
+    ; GFX90A: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec
+    ; GFX90A: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2, implicit $vgpr1
     $agpr0_agpr1_agpr2 = COPY killed $agpr1_agpr2_agpr3
     $vgpr1 = COPY $agpr1
     S_ENDPGM 0, implicit $agpr0_agpr1_agpr2, implicit $vgpr1
@@ -519,16 +755,26 @@ name:            a4_to_a4
 tracksRegLiveness: true
 body:             |
   bb.0:
-    ; GCN-LABEL: name: a4_to_a4
-    ; GCN: $agpr0_agpr1_agpr2_agpr3 = IMPLICIT_DEF
-    ; GCN: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3
-    ; GCN: $agpr5 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit-def $agpr2_agpr3_agpr4_agpr5
-    ; GCN: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3
-    ; GCN: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3
-    ; GCN: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec
-    ; GCN: $vgpr3 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3
-    ; GCN: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr3, implicit $exec, implicit $exec
-    ; GCN: S_ENDPGM 0, implicit $agpr2_agpr3_agpr4_agpr5
+    ; GFX908-LABEL: name: a4_to_a4
+    ; GFX908: $agpr0_agpr1_agpr2_agpr3 = IMPLICIT_DEF
+    ; GFX908: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3
+    ; GFX908: $agpr5 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit-def $agpr2_agpr3_agpr4_agpr5
+    ; GFX908: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3
+    ; GFX908: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3
+    ; GFX908: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec
+    ; GFX908: $vgpr3 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3
+    ; GFX908: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr3, implicit $exec, implicit $exec
+    ; GFX908: S_ENDPGM 0, implicit $agpr2_agpr3_agpr4_agpr5
+    ; GFX90A-LABEL: name: a4_to_a4
+    ; GFX90A: $agpr0_agpr1_agpr2_agpr3 = IMPLICIT_DEF
+    ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3
+    ; GFX90A: $agpr5 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr2_agpr3_agpr4_agpr5
+    ; GFX90A: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3
+    ; GFX90A: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3
+    ; GFX90A: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec
+    ; GFX90A: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3
+    ; GFX90A: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit $exec
+    ; GFX90A: S_ENDPGM 0, implicit $agpr2_agpr3_agpr4_agpr5
     $agpr0_agpr1_agpr2_agpr3 = IMPLICIT_DEF
     $agpr2_agpr3_agpr4_agpr5 = COPY killed $agpr0_agpr1_agpr2_agpr3, implicit $exec
     S_ENDPGM 0, implicit $agpr2_agpr3_agpr4_agpr5
@@ -540,16 +786,26 @@ tracksRegLiveness: true
 body:             |
   bb.0:
     liveins: $agpr0_agpr1_agpr2_agpr3
-    ; GCN-LABEL: name: a4_to_a4_overlap
-    ; GCN: liveins: $agpr0_agpr1_agpr2_agpr3
-    ; GCN: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3
-    ; GCN: $agpr5 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit-def $agpr2_agpr3_agpr4_agpr5
-    ; GCN: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3
-    ; GCN: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3
-    ; GCN: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec
-    ; GCN: $vgpr3 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3
-    ; GCN: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr3, implicit $exec, implicit $exec
-    ; GCN: S_ENDPGM 0, implicit $agpr0, implicit $agpr1, implicit $agpr2, implicit $agpr3, implicit $agpr4, implicit $agpr5
+    ; GFX908-LABEL: name: a4_to_a4_overlap
+    ; GFX908: liveins: $agpr0_agpr1_agpr2_agpr3
+    ; GFX908: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3
+    ; GFX908: $agpr5 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit-def $agpr2_agpr3_agpr4_agpr5
+    ; GFX908: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3
+    ; GFX908: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3
+    ; GFX908: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec
+    ; GFX908: $vgpr3 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3
+    ; GFX908: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr3, implicit $exec, implicit $exec
+    ; GFX908: S_ENDPGM 0, implicit $agpr0, implicit $agpr1, implicit $agpr2, implicit $agpr3, implicit $agpr4, implicit $agpr5
+    ; GFX90A-LABEL: name: a4_to_a4_overlap
+    ; GFX90A: liveins: $agpr0_agpr1_agpr2_agpr3
+    ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3
+    ; GFX90A: $agpr5 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr2_agpr3_agpr4_agpr5
+    ; GFX90A: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3
+    ; GFX90A: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3
+    ; GFX90A: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec
+    ; GFX90A: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3
+    ; GFX90A: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit $exec
+    ; GFX90A: S_ENDPGM 0, implicit $agpr0, implicit $agpr1, implicit $agpr2, implicit $agpr3, implicit $agpr4, implicit $agpr5
     $agpr2_agpr3_agpr4_agpr5 = COPY $agpr0_agpr1_agpr2_agpr3, implicit $exec
     S_ENDPGM 0, implicit $agpr0, implicit $agpr1, implicit $agpr2, implicit $agpr3, implicit $agpr4, implicit $agpr5
 ...
@@ -559,25 +815,44 @@ name:            a8_to_a8
 tracksRegLiveness: true
 body:             |
   bb.0:
-    ; GCN-LABEL: name: a8_to_a8
-    ; GCN: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 = IMPLICIT_DEF
-    ; GCN: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr7, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
-    ; GCN: $agpr15 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
-    ; GCN: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr6, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
-    ; GCN: $agpr14 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec
-    ; GCN: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr5, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
-    ; GCN: $agpr13 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec
-    ; GCN: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
-    ; GCN: $agpr12 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec
-    ; GCN: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
-    ; GCN: $agpr11 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec
-    ; GCN: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
-    ; GCN: $agpr10 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec
-    ; GCN: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
-    ; GCN: $agpr9 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec
-    ; GCN: $vgpr2 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
-    ; GCN: $agpr8 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec, implicit $exec
-    ; GCN: S_ENDPGM 0, implicit $agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; GFX908-LABEL: name: a8_to_a8
+    ; GFX908: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 = IMPLICIT_DEF
+    ; GFX908: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr7, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
+    ; GFX908: $agpr15 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; GFX908: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr6, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
+    ; GFX908: $agpr14 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec
+    ; GFX908: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr5, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
+    ; GFX908: $agpr13 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec
+    ; GFX908: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
+    ; GFX908: $agpr12 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec
+    ; GFX908: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
+    ; GFX908: $agpr11 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec
+    ; GFX908: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
+    ; GFX908: $agpr10 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec
+    ; GFX908: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
+    ; GFX908: $agpr9 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec
+    ; GFX908: $vgpr2 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
+    ; GFX908: $agpr8 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec, implicit $exec
+    ; GFX908: S_ENDPGM 0, implicit $agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; GFX90A-LABEL: name: a8_to_a8
+    ; GFX90A: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 = IMPLICIT_DEF
+    ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr7, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
+    ; GFX90A: $agpr15 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr6, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
+    ; GFX90A: $agpr14 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec
+    ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr5, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
+    ; GFX90A: $agpr13 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec
+    ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
+    ; GFX90A: $agpr12 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec
+    ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
+    ; GFX90A: $agpr11 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec
+    ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
+    ; GFX90A: $agpr10 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec
+    ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
+    ; GFX90A: $agpr9 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec
+    ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
+    ; GFX90A: $agpr8 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit $exec
+    ; GFX90A: S_ENDPGM 0, implicit $agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
     $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 = IMPLICIT_DEF
     $agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = COPY killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7, implicit $exec
     S_ENDPGM 0, implicit $agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
@@ -589,41 +864,76 @@ tracksRegLiveness: true
 body:             |
   bb.0:
 
-    ; GCN-LABEL: name: a16_to_a16
-    ; GCN: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = IMPLICIT_DEF
-    ; GCN: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr15, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
-    ; GCN: $agpr31 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit-def $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
-    ; GCN: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr14, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
-    ; GCN: $agpr30 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec
-    ; GCN: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr13, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
-    ; GCN: $agpr29 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec
-    ; GCN: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr12, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
-    ; GCN: $agpr28 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec
-    ; GCN: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr11, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
-    ; GCN: $agpr27 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec
-    ; GCN: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr10, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
-    ; GCN: $agpr26 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec
-    ; GCN: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr9, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
-    ; GCN: $agpr25 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec
-    ; GCN: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr8, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
-    ; GCN: $agpr24 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec
-    ; GCN: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr7, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
-    ; GCN: $agpr23 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec
-    ; GCN: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr6, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
-    ; GCN: $agpr22 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec
-    ; GCN: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr5, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
-    ; GCN: $agpr21 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec
-    ; GCN: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
-    ; GCN: $agpr20 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec
-    ; GCN: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
-    ; GCN: $agpr19 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec
-    ; GCN: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
-    ; GCN: $agpr18 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec
-    ; GCN: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
-    ; GCN: $agpr17 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec
-    ; GCN: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
-    ; GCN: $agpr16 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit $exec
-    ; GCN: S_ENDPGM 0, implicit $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+    ; GFX908-LABEL: name: a16_to_a16
+    ; GFX908: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = IMPLICIT_DEF
+    ; GFX908: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr15, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; GFX908: $agpr31 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit-def $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+    ; GFX908: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr14, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; GFX908: $agpr30 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec
+    ; GFX908: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr13, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; GFX908: $agpr29 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec
+    ; GFX908: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr12, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; GFX908: $agpr28 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec
+    ; GFX908: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr11, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; GFX908: $agpr27 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec
+    ; GFX908: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr10, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; GFX908: $agpr26 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec
+    ; GFX908: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr9, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; GFX908: $agpr25 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec
+    ; GFX908: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr8, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; GFX908: $agpr24 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec
+    ; GFX908: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr7, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; GFX908: $agpr23 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec
+    ; GFX908: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr6, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; GFX908: $agpr22 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec
+    ; GFX908: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr5, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; GFX908: $agpr21 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec
+    ; GFX908: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; GFX908: $agpr20 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec
+    ; GFX908: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; GFX908: $agpr19 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec
+    ; GFX908: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; GFX908: $agpr18 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec
+    ; GFX908: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; GFX908: $agpr17 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec
+    ; GFX908: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; GFX908: $agpr16 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit $exec
+    ; GFX908: S_ENDPGM 0, implicit $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+    ; GFX90A-LABEL: name: a16_to_a16
+    ; GFX90A: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = IMPLICIT_DEF
+    ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr15, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; GFX90A: $agpr31 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+    ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr14, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; GFX90A: $agpr30 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec
+    ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr13, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; GFX90A: $agpr29 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec
+    ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr12, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; GFX90A: $agpr28 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec
+    ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr11, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; GFX90A: $agpr27 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec
+    ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr10, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; GFX90A: $agpr26 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec
+    ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr9, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; GFX90A: $agpr25 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec
+    ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr8, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; GFX90A: $agpr24 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec
+    ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr7, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; GFX90A: $agpr23 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec
+    ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr6, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; GFX90A: $agpr22 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec
+    ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr5, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; GFX90A: $agpr21 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec
+    ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; GFX90A: $agpr20 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec
+    ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; GFX90A: $agpr19 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec
+    ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; GFX90A: $agpr18 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec
+    ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; GFX90A: $agpr17 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec
+    ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; GFX90A: $agpr16 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit $exec
+    ; GFX90A: S_ENDPGM 0, implicit $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
     $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = IMPLICIT_DEF
     $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 = COPY killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, implicit $exec
     S_ENDPGM 0, implicit $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
@@ -638,12 +948,17 @@ body:             |
   bb.0:
     liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253, $vgpr254
 
-    ; GCN-LABEL: name: a_to_a_spill
-    ; GCN: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253, $vgpr254
-    ; GCN: $agpr1 = IMPLICIT_DEF
-    ; GCN: $vgpr255 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec
-    ; GCN: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr255, implicit $exec, implicit $exec
-    ; GCN: S_ENDPGM 0, implicit $agpr0
+    ; GFX908-LABEL: name: a_to_a_spill
+    ; GFX908: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253, $vgpr254
+    ; GFX908: $agpr1 = IMPLICIT_DEF
+    ; GFX908: $vgpr255 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec
+    ; GFX908: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr255, implicit $exec, implicit $exec
+    ; GFX908: S_ENDPGM 0, implicit $agpr0
+    ; GFX90A-LABEL: name: a_to_a_spill
+    ; GFX90A: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253, $vgpr254
+    ; GFX90A: $agpr1 = IMPLICIT_DEF
+    ; GFX90A: $agpr0 = V_ACCVGPR_MOV_B32 killed $agpr1, implicit $exec, implicit $exec
+    ; GFX90A: S_ENDPGM 0, implicit $agpr0
     $agpr1 = IMPLICIT_DEF
     $agpr0 = COPY killed $agpr1, implicit $exec
     S_ENDPGM 0, implicit $agpr0
@@ -656,18 +971,30 @@ body:             |
   bb.0:
     liveins: $agpr0, $sgpr2_sgpr3
 
-    ; GCN-LABEL: name: copy_sgpr_to_agpr_tuple
-    ; GCN: liveins: $agpr0, $sgpr2_sgpr3
-    ; GCN: S_NOP 0, implicit-def dead $sgpr0_sgpr1
-    ; GCN: $vgpr1 = V_MOV_B32_e32 $sgpr3, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3
-    ; GCN: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit-def $agpr4_agpr5_agpr6_agpr7
-    ; GCN: $vgpr0 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3
-    ; GCN: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec
-    ; GCN: $vgpr2 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3
-    ; GCN: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec
-    ; GCN: $vgpr1 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3
-    ; GCN: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit $exec
-    ; GCN: S_ENDPGM 0, implicit $agpr4_agpr5_agpr6_agpr7, implicit $sgpr0_sgpr1_sgpr2_sgpr3
+    ; GFX908-LABEL: name: copy_sgpr_to_agpr_tuple
+    ; GFX908: liveins: $agpr0, $sgpr2_sgpr3
+    ; GFX908: S_NOP 0, implicit-def dead $sgpr0_sgpr1
+    ; GFX908: $vgpr1 = V_MOV_B32_e32 $sgpr3, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3
+    ; GFX908: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit-def $agpr4_agpr5_agpr6_agpr7
+    ; GFX908: $vgpr0 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3
+    ; GFX908: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec
+    ; GFX908: $vgpr2 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3
+    ; GFX908: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec
+    ; GFX908: $vgpr1 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3
+    ; GFX908: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit $exec
+    ; GFX908: S_ENDPGM 0, implicit $agpr4_agpr5_agpr6_agpr7, implicit $sgpr0_sgpr1_sgpr2_sgpr3
+    ; GFX90A-LABEL: name: copy_sgpr_to_agpr_tuple
+    ; GFX90A: liveins: $agpr0, $sgpr2_sgpr3
+    ; GFX90A: S_NOP 0, implicit-def dead $sgpr0_sgpr1
+    ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr3, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3
+    ; GFX90A: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr4_agpr5_agpr6_agpr7
+    ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3
+    ; GFX90A: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec
+    ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3
+    ; GFX90A: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec
+    ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3
+    ; GFX90A: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit $exec
+    ; GFX90A: S_ENDPGM 0, implicit $agpr4_agpr5_agpr6_agpr7, implicit $sgpr0_sgpr1_sgpr2_sgpr3
     S_NOP 0, implicit-def dead $sgpr0_sgpr1
     renamable $agpr4_agpr5_agpr6_agpr7 = COPY renamable $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec
     S_ENDPGM 0, implicit $agpr4_agpr5_agpr6_agpr7, implicit $sgpr0_sgpr1_sgpr2_sgpr3
@@ -679,18 +1006,30 @@ body:             |
   bb.0:
     liveins: $agpr0, $sgpr2_sgpr3
 
-    ; GCN-LABEL: name: copy_sgpr_to_agpr_tuple_kill
-    ; GCN: liveins: $agpr0, $sgpr2_sgpr3
-    ; GCN: S_NOP 0, implicit-def dead $sgpr0_sgpr1
-    ; GCN: $vgpr1 = V_MOV_B32_e32 $sgpr3, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3
-    ; GCN: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit-def $agpr4_agpr5_agpr6_agpr7
-    ; GCN: $vgpr0 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3
-    ; GCN: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec
-    ; GCN: $vgpr2 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3
-    ; GCN: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec
-    ; GCN: $vgpr1 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3
-    ; GCN: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit $exec
-    ; GCN: S_ENDPGM 0, implicit $agpr4_agpr5_agpr6_agpr7
+    ; GFX908-LABEL: name: copy_sgpr_to_agpr_tuple_kill
+    ; GFX908: liveins: $agpr0, $sgpr2_sgpr3
+    ; GFX908: S_NOP 0, implicit-def dead $sgpr0_sgpr1
+    ; GFX908: $vgpr1 = V_MOV_B32_e32 $sgpr3, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3
+    ; GFX908: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit-def $agpr4_agpr5_agpr6_agpr7
+    ; GFX908: $vgpr0 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3
+    ; GFX908: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec
+    ; GFX908: $vgpr2 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3
+    ; GFX908: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec
+    ; GFX908: $vgpr1 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3
+    ; GFX908: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit $exec
+    ; GFX908: S_ENDPGM 0, implicit $agpr4_agpr5_agpr6_agpr7
+    ; GFX90A-LABEL: name: copy_sgpr_to_agpr_tuple_kill
+    ; GFX90A: liveins: $agpr0, $sgpr2_sgpr3
+    ; GFX90A: S_NOP 0, implicit-def dead $sgpr0_sgpr1
+    ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr3, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3
+    ; GFX90A: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr4_agpr5_agpr6_agpr7
+    ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3
+    ; GFX90A: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec
+    ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3
+    ; GFX90A: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec
+    ; GFX90A: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3
+    ; GFX90A: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit $exec
+    ; GFX90A: S_ENDPGM 0, implicit $agpr4_agpr5_agpr6_agpr7
     S_NOP 0, implicit-def dead $sgpr0_sgpr1
     renamable $agpr4_agpr5_agpr6_agpr7 = COPY renamable killed $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec
     S_ENDPGM 0, implicit $agpr4_agpr5_agpr6_agpr7
@@ -703,18 +1042,30 @@ body:             |
   bb.0:
     liveins: $agpr0, $agpr2_agpr3
 
-    ; GCN-LABEL: name: copy_agpr_to_agpr_tuple
-    ; GCN: liveins: $agpr0, $agpr2_agpr3
-    ; GCN: S_NOP 0, implicit-def dead $agpr0_agpr1
-    ; GCN: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3
-    ; GCN: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit-def $agpr4_agpr5_agpr6_agpr7
-    ; GCN: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3
-    ; GCN: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec
-    ; GCN: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3
-    ; GCN: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec
-    ; GCN: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3
-    ; GCN: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit $exec
-    ; GCN: S_ENDPGM 0, implicit $agpr4_agpr5_agpr6_agpr7, implicit $agpr0_agpr1_agpr2_agpr3
+    ; GFX908-LABEL: name: copy_agpr_to_agpr_tuple
+    ; GFX908: liveins: $agpr0, $agpr2_agpr3
+    ; GFX908: S_NOP 0, implicit-def dead $agpr0_agpr1
+    ; GFX908: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3
+    ; GFX908: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit-def $agpr4_agpr5_agpr6_agpr7
+    ; GFX908: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3
+    ; GFX908: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec
+    ; GFX908: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3
+    ; GFX908: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec
+    ; GFX908: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3
+    ; GFX908: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit $exec
+    ; GFX908: S_ENDPGM 0, implicit $agpr4_agpr5_agpr6_agpr7, implicit $agpr0_agpr1_agpr2_agpr3
+    ; GFX90A-LABEL: name: copy_agpr_to_agpr_tuple
+    ; GFX90A: liveins: $agpr0, $agpr2_agpr3
+    ; GFX90A: S_NOP 0, implicit-def dead $agpr0_agpr1
+    ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3
+    ; GFX90A: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr4_agpr5_agpr6_agpr7
+    ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3
+    ; GFX90A: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec
+    ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3
+    ; GFX90A: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec
+    ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3
+    ; GFX90A: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit $exec
+    ; GFX90A: S_ENDPGM 0, implicit $agpr4_agpr5_agpr6_agpr7, implicit $agpr0_agpr1_agpr2_agpr3
     S_NOP 0, implicit-def dead $agpr0_agpr1
     renamable $agpr4_agpr5_agpr6_agpr7 = COPY renamable $agpr0_agpr1_agpr2_agpr3, implicit $exec
     S_ENDPGM 0, implicit $agpr4_agpr5_agpr6_agpr7, implicit $agpr0_agpr1_agpr2_agpr3
@@ -727,18 +1078,30 @@ body:             |
   bb.0:
     liveins: $agpr0, $agpr2_agpr3
 
-    ; GCN-LABEL: name: copy_agpr_to_agpr_tuple_kill
-    ; GCN: liveins: $agpr0, $agpr2_agpr3
-    ; GCN: S_NOP 0, implicit-def dead $agpr0_agpr1
-    ; GCN: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3
-    ; GCN: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit-def $agpr4_agpr5_agpr6_agpr7
-    ; GCN: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3
-    ; GCN: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec
-    ; GCN: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3
-    ; GCN: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec
-    ; GCN: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3
-    ; GCN: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit $exec
-    ; GCN: S_ENDPGM 0, implicit $agpr4_agpr5_agpr6_agpr7
+    ; GFX908-LABEL: name: copy_agpr_to_agpr_tuple_kill
+    ; GFX908: liveins: $agpr0, $agpr2_agpr3
+    ; GFX908: S_NOP 0, implicit-def dead $agpr0_agpr1
+    ; GFX908: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3
+    ; GFX908: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit-def $agpr4_agpr5_agpr6_agpr7
+    ; GFX908: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3
+    ; GFX908: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec
+    ; GFX908: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3
+    ; GFX908: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec
+    ; GFX908: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3
+    ; GFX908: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit $exec
+    ; GFX908: S_ENDPGM 0, implicit $agpr4_agpr5_agpr6_agpr7
+    ; GFX90A-LABEL: name: copy_agpr_to_agpr_tuple_kill
+    ; GFX90A: liveins: $agpr0, $agpr2_agpr3
+    ; GFX90A: S_NOP 0, implicit-def dead $agpr0_agpr1
+    ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3
+    ; GFX90A: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr4_agpr5_agpr6_agpr7
+    ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3
+    ; GFX90A: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec
+    ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3
+    ; GFX90A: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec
+    ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3
+    ; GFX90A: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit $exec
+    ; GFX90A: S_ENDPGM 0, implicit $agpr4_agpr5_agpr6_agpr7
     S_NOP 0, implicit-def dead $agpr0_agpr1
     renamable $agpr4_agpr5_agpr6_agpr7 = COPY renamable killed $agpr0_agpr1_agpr2_agpr3, implicit $exec
     S_ENDPGM 0, implicit $agpr4_agpr5_agpr6_agpr7

diff  --git a/llvm/test/CodeGen/AMDGPU/adjust-writemask-vectorized.ll b/llvm/test/CodeGen/AMDGPU/adjust-writemask-vectorized.ll
new file mode 100644
index 000000000000..c36cb87a42a2
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/adjust-writemask-vectorized.ll
@@ -0,0 +1,14 @@
+; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; Check that write mask is 0xf.
+
+; GCN-LABEL: {{^}}sample_2d_vectorized_use:
+; GCN: image_sample v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}] dmask:0xf
+define amdgpu_ps <4 x float> @sample_2d_vectorized_use(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t, <4 x float> %a) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+  %r = fadd <4 x float> %v, %a
+  ret <4 x float> %r
+}
+
+declare <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32)

diff  --git a/llvm/test/CodeGen/AMDGPU/agpr-csr.ll b/llvm/test/CodeGen/AMDGPU/agpr-csr.ll
new file mode 100644
index 000000000000..708be530047b
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/agpr-csr.ll
@@ -0,0 +1,206 @@
+; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX90A %s
+
+; GCN-LABEL: {{^}}func_empty:
+; GCN-NOT: buffer_
+; GCN-NOT: v_accvgpr
+; GCN:     s_setpc_b64
+define void @func_empty() #0 {
+  ret void
+}
+
+; GCN-LABEL: {{^}}func_areg_4:
+; GCN-NOT: buffer_
+; GCN-NOT: v_accvgpr
+; GCN: use agpr3
+; GCN-NOT: buffer_
+; GCN-NOT: v_accvgpr
+; GCN: s_setpc_b64
+define void @func_areg_4() #0 {
+  call void asm sideeffect "; use agpr3", "~{a3}" ()
+  ret void
+}
+
+; GCN-LABEL: {{^}}func_areg_32:
+; GCN-NOT: buffer_
+; GCN-NOT: v_accvgpr
+; GCN: use agpr31
+; GCN-NOT: buffer_
+; GCN-NOT: v_accvgpr
+; GCN: s_setpc_b64
+define void @func_areg_32() #0 {
+  call void asm sideeffect "; use agpr31", "~{a31}" ()
+  ret void
+}
+
+; GCN-LABEL: {{^}}func_areg_33:
+; GFX908-NOT: buffer_
+; GCN-NOT:    v_accvgpr
+; GCN:        use agpr32
+; GFX908-NOT: buffer_
+; GCN-NOT:    v_accvgpr
+; GCN:        s_setpc_b64
+define void @func_areg_33() #0 {
+  call void asm sideeffect "; use agpr32", "~{a32}" ()
+  ret void
+}
+
+; GCN-LABEL: {{^}}func_areg_64:
+; GFX908-NOT: buffer_
+; GCN-NOT:    v_accvgpr
+; GFX90A:     buffer_store_dword a63,
+; GCN:        use agpr63
+; GFX90A:     buffer_load_dword a63,
+; GCN-NOT:    v_accvgpr
+; GCN:        s_setpc_b64
+define void @func_areg_64() #0 {
+  call void asm sideeffect "; use agpr63", "~{a63}" ()
+  ret void
+}
+
+; GCN-LABEL: {{^}}func_areg_31_63:
+; GFX908-NOT: buffer_
+; GCN-NOT:    v_accvgpr
+; GFX90A:     buffer_store_dword a63,
+; GCN:        use agpr31, agpr63
+; GFX90A:     buffer_load_dword a63,
+; GCN-NOT:    buffer_
+; GCN-NOT:    v_accvgpr
+; GCN:        s_setpc_b64
+define void @func_areg_31_63() #0 {
+  call void asm sideeffect "; use agpr31, agpr63", "~{a31},~{a63}" ()
+  ret void
+}
+
+declare void @func_unknown() #0
+
+; GCN-LABEL: {{^}}test_call_empty:
+; GCN-NOT:         buffer_
+; GCN-NOT:         v_accvgpr
+; GCN:             def a[0:31]
+; GFX908-COUNT-8:  v_accvgpr_read_b32
+; GFX90A-NOT:      v_accvgpr
+; GCN-NOT:         buffer_
+; GCN:             s_swappc_b64
+; GCN-NOT:         buffer_
+; GFX90A-NOT:      v_accvgpr
+; GFX908-COUNT-8:  global_store_dwordx4 v[{{[0-9:]+}}], v[{{[0-9:]+}}]
+; GFX90A-COUNT-8:  global_store_dwordx4 v[{{[0-9:]+}}], a[{{[0-9:]+}}]
+; GCN:             s_endpgm
+define amdgpu_kernel void @test_call_empty() #0 {
+bb:
+  %reg = call <32 x float> asm sideeffect "; def $0", "=a"()
+  call void @func_empty()
+  store volatile <32 x float> %reg, <32 x float> addrspace(1)* undef
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_call_areg4:
+; GCN-NOT:         buffer_
+; GCN-NOT:         v_accvgpr
+; GFX908:          def a[0:31]
+; GFX90A:          def a[4:35]
+; GFX908-COUNT-8:  v_accvgpr_read_b32
+; GFX90A-NOT:      v_accvgpr
+; GCN-NOT:         buffer_
+; GCN:             s_swappc_b64
+; GCN-NOT:         buffer_
+; GFX90A-NOT:      v_accvgpr
+; GFX908-COUNT-8:  global_store_dwordx4 v[{{[0-9:]+}}], v[{{[0-9:]+}}]
+; GFX90A-COUNT-8:  global_store_dwordx4 v[{{[0-9:]+}}], a[{{[0-9:]+}}]
+; GCN:             s_endpgm
+define amdgpu_kernel void @test_call_areg4() #0 {
+bb:
+  %reg = call <32 x float> asm sideeffect "; def $0", "=a"()
+  call void @func_areg_4()
+  store volatile <32 x float> %reg, <32 x float> addrspace(1)* undef
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_call_areg32:
+; GCN-NOT:         buffer_
+; GCN-NOT:         v_accvgpr
+; GFX908:          def a[0:31]
+; GFX90A:          def a[32:63]
+; GFX908-COUNT-8:  v_accvgpr_read_b32
+; GFX90A-NOT:      v_accvgpr
+; GCN-NOT:         buffer_
+; GCN:             s_swappc_b64
+; GCN-NOT:         buffer_
+; GFX90A-NOT:      v_accvgpr
+; GFX908-COUNT-8:  global_store_dwordx4 v[{{[0-9:]+}}], v[{{[0-9:]+}}]
+; GFX90A-COUNT-8:  global_store_dwordx4 v[{{[0-9:]+}}], a[{{[0-9:]+}}]
+; GCN:             s_endpgm
+define amdgpu_kernel void @test_call_areg32() #0 {
+bb:
+  %reg = call <32 x float> asm sideeffect "; def $0", "=a"()
+  call void @func_areg_32()
+  store volatile <32 x float> %reg, <32 x float> addrspace(1)* undef
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_call_areg64:
+; GCN-NOT:         buffer_
+; GCN-NOT:         v_accvgpr
+; GCN:             def a[0:31]
+; GFX908-COUNT-8:  v_accvgpr_read_b32
+; GFX90A-NOT:      v_accvgpr
+; GCN-NOT:         buffer_
+; GCN:             s_swappc_b64
+; GCN-NOT:         buffer_
+; GFX90A-NOT:      v_accvgpr
+; GFX908-COUNT-8:  global_store_dwordx4 v[{{[0-9:]+}}], v[{{[0-9:]+}}]
+; GFX90A-COUNT-8:  global_store_dwordx4 v[{{[0-9:]+}}], a[{{[0-9:]+}}]
+; GCN:             s_endpgm
+define amdgpu_kernel void @test_call_areg64() #0 {
+bb:
+  %reg = call <32 x float> asm sideeffect "; def $0", "=a"()
+  call void @func_areg_64()
+  store volatile <32 x float> %reg, <32 x float> addrspace(1)* undef
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_call_areg31_63:
+; GCN-NOT:         buffer_
+; GCN-NOT:         v_accvgpr
+; GFX908:          def a[0:31]
+; GFX90A:          def a[32:63]
+; GFX908-COUNT-8:  v_accvgpr_read_b32
+; GFX90A-NOT:      v_accvgpr
+; GCN-NOT:         buffer_
+; GCN:             s_swappc_b64
+; GCN-NOT:         buffer_
+; GFX90A-NOT:      v_accvgpr
+; GFX908-COUNT-8:  global_store_dwordx4 v[{{[0-9:]+}}], v[{{[0-9:]+}}]
+; GFX90A-COUNT-8:  global_store_dwordx4 v[{{[0-9:]+}}], a[{{[0-9:]+}}]
+; GCN:             s_endpgm
+define amdgpu_kernel void @test_call_areg31_63() #0 {
+bb:
+  %reg = call <32 x float> asm sideeffect "; def $0", "=a"()
+  call void @func_areg_31_63()
+  store volatile <32 x float> %reg, <32 x float> addrspace(1)* undef
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_call_unknown:
+; GCN-NOT:         buffer_
+; GCN-NOT:         v_accvgpr
+; GFX908:          def a[0:31]
+; GFX90A:          def a[32:63]
+; GFX908-COUNT-8:  v_accvgpr_read_b32
+; GFX90A-NOT:      v_accvgpr
+; GCN-NOT:         buffer_
+; GCN:             s_swappc_b64
+; GCN-NOT:         buffer_
+; GFX90A-NOT:      v_accvgpr
+; GFX908-COUNT-8:  global_store_dwordx4 v[{{[0-9:]+}}], v[{{[0-9:]+}}]
+; GFX90A-COUNT-8:  global_store_dwordx4 v[{{[0-9:]+}}], a[{{[0-9:]+}}]
+; GCN:             s_endpgm
+define amdgpu_kernel void @test_call_unknown() #0 {
+bb:
+  %reg = call <32 x float> asm sideeffect "; def $0", "=a"()
+  call void @func_unknown()
+  store volatile <32 x float> %reg, <32 x float> addrspace(1)* undef
+  ret void
+}
+
+attributes #0 = { nounwind noinline "amdgpu-flat-work-group-size"="1,512" }

diff  --git a/llvm/test/CodeGen/AMDGPU/agpr-register-count.ll b/llvm/test/CodeGen/AMDGPU/agpr-register-count.ll
index 8ad231c4c910..fe031096479a 100644
--- a/llvm/test/CodeGen/AMDGPU/agpr-register-count.ll
+++ b/llvm/test/CodeGen/AMDGPU/agpr-register-count.ll
@@ -1,14 +1,22 @@
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX908 %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX90A %s
 
 ; GCN-LABEL: {{^}}kernel_32_agprs:
-; GCN:    .amdhsa_next_free_vgpr 32
+; GFX908: .amdhsa_next_free_vgpr 32
+; GFX90A: .amdhsa_next_free_vgpr 44
+; GFX90A: .amdhsa_accum_offset 12
 ; GCN:    NumVgprs: 9
 ; GCN:    NumAgprs: 32
-; GCN:    TotalNumVgprs: 32
-; GCN:    VGPRBlocks: 7
-; GCN:    NumVGPRsForWavesPerEU: 32
+; GFX908: TotalNumVgprs: 32
+; GFX90A: TotalNumVgprs: 44
+; GFX908: VGPRBlocks: 7
+; GFX90A: VGPRBlocks: 5
+; GFX908: NumVGPRsForWavesPerEU: 32
+; GFX90A: NumVGPRsForWavesPerEU: 44
+; GFX90A: AccumOffset: 12
 ; GCN:    Occupancy: 8
-define amdgpu_kernel void @kernel_32_agprs() {
+; GFX90A: COMPUTE_PGM_RSRC3_GFX90A:ACCUM_OFFSET: 2
+define amdgpu_kernel void @kernel_32_agprs() #0 {
 bb:
   call void asm sideeffect "", "~{v8}" ()
   call void asm sideeffect "", "~{a31}" ()
@@ -17,27 +25,39 @@ bb:
 
 ; GCN-LABEL: {{^}}kernel_0_agprs:
 ; GCN:    .amdhsa_next_free_vgpr 1
+; GFX90A: .amdhsa_accum_offset 4
 ; GCN:    NumVgprs: 1
 ; GCN:    NumAgprs: 0
 ; GCN:    TotalNumVgprs: 1
 ; GCN:    VGPRBlocks: 0
 ; GCN:    NumVGPRsForWavesPerEU: 1
-; GCN:    Occupancy: 10
-define amdgpu_kernel void @kernel_0_agprs() {
+; GFX90A: AccumOffset: 4
+; GFX908: Occupancy: 10
+; GFX90A: Occupancy: 8
+; GFX90A: COMPUTE_PGM_RSRC3_GFX90A:ACCUM_OFFSET: 0
+define amdgpu_kernel void @kernel_0_agprs() #0 {
 bb:
   call void asm sideeffect "", "~{v0}" ()
   ret void
 }
 
 ; GCN-LABEL: {{^}}kernel_40_vgprs:
-; GCN:    .amdhsa_next_free_vgpr 40
+; GFX908: .amdhsa_next_free_vgpr 40
+; GFX90A: .amdhsa_next_free_vgpr 56
+; GFX90A: .amdhsa_accum_offset 40
 ; GCN:    NumVgprs: 40
 ; GCN:    NumAgprs: 16
-; GCN:    TotalNumVgprs: 40
-; GCN:    VGPRBlocks: 9
-; GCN:    NumVGPRsForWavesPerEU: 40
-; GCN:    Occupancy: 6
-define amdgpu_kernel void @kernel_40_vgprs() {
+; GFX908: TotalNumVgprs: 40
+; GFX90A: TotalNumVgprs: 56
+; GFX908: VGPRBlocks: 9
+; GFX90A: VGPRBlocks: 6
+; GFX908: NumVGPRsForWavesPerEU: 40
+; GFX90A: NumVGPRsForWavesPerEU: 56
+; GFX90A: AccumOffset: 40
+; GFX908: Occupancy: 6
+; GFX90A: Occupancy: 8
+; GFX90A: COMPUTE_PGM_RSRC3_GFX90A:ACCUM_OFFSET: 9
+define amdgpu_kernel void @kernel_40_vgprs() #0 {
 bb:
   call void asm sideeffect "", "~{v39}" ()
   call void asm sideeffect "", "~{a15}" ()
@@ -47,7 +67,8 @@ bb:
 ; GCN-LABEL: {{^}}func_32_agprs:
 ; GCN:    NumVgprs: 9
 ; GCN:    NumAgprs: 32
-; GCN:    TotalNumVgprs: 32
+; GFX908: TotalNumVgprs: 32
+; GFX90A: TotalNumVgprs: 44
 define void @func_32_agprs() #0 {
 bb:
   call void asm sideeffect "", "~{v8}" ()
@@ -58,8 +79,9 @@ bb:
 ; GCN-LABEL: {{^}}func_32_vgprs:
 ; GCN:    NumVgprs: 32
 ; GCN:    NumAgprs: 9
-; GCN:    TotalNumVgprs: 32
-define void @func_32_vgprs() {
+; GFX908: TotalNumVgprs: 32
+; GFX90A: TotalNumVgprs: 41
+define void @func_32_vgprs() #0 {
 bb:
   call void asm sideeffect "", "~{v31}" ()
   call void asm sideeffect "", "~{a8}" ()
@@ -70,21 +92,28 @@ bb:
 ; GCN:    NumVgprs: 1
 ; GCN:    NumAgprs: 0
 ; GCN:    TotalNumVgprs: 1
-define amdgpu_kernel void @func_0_agprs() {
+define amdgpu_kernel void @func_0_agprs() #0 {
 bb:
   call void asm sideeffect "", "~{v0}" ()
   ret void
 }
 
 ; GCN-LABEL: {{^}}kernel_max_gprs:
-; GCN:    .amdhsa_next_free_vgpr 256
+; GFX908: .amdhsa_next_free_vgpr 256
+; GFX90A: .amdhsa_next_free_vgpr 512
+; GFX90A: .amdhsa_accum_offset 256
 ; GCN:    NumVgprs: 256
 ; GCN:    NumAgprs: 256
-; GCN:    TotalNumVgprs: 256
-; GCN:    VGPRBlocks: 63
-; GCN:    NumVGPRsForWavesPerEU: 256
+; GFX908: TotalNumVgprs: 256
+; GFX90A: TotalNumVgprs: 512
+; GFX908: VGPRBlocks: 63
+; GFX90A: VGPRBlocks: 63
+; GFX908: NumVGPRsForWavesPerEU: 256
+; GFX90A: NumVGPRsForWavesPerEU: 512
+; GFX90A: AccumOffset: 256
 ; GCN:    Occupancy: 1
-define amdgpu_kernel void @kernel_max_gprs() {
+; GFX90A: COMPUTE_PGM_RSRC3_GFX90A:ACCUM_OFFSET: 63
+define amdgpu_kernel void @kernel_max_gprs() #0 {
 bb:
   call void asm sideeffect "", "~{v255}" ()
   call void asm sideeffect "", "~{a255}" ()
@@ -92,14 +121,20 @@ bb:
 }
 
 ; GCN-LABEL: {{^}}kernel_call_func_32_agprs:
-; GCN:    .amdhsa_next_free_vgpr 32
+; GFX908: .amdhsa_next_free_vgpr 32
+; GFX90A: .amdhsa_accum_offset 12
 ; GCN:    NumVgprs: 9
 ; GCN:    NumAgprs: 32
-; GCN:    TotalNumVgprs: 32
-; GCN:    VGPRBlocks: 7
-; GCN:    NumVGPRsForWavesPerEU: 32
+; GFX908: TotalNumVgprs: 32
+; GFX90A: TotalNumVgprs: 44
+; GFX908: VGPRBlocks: 7
+; GFX90A: VGPRBlocks: 5
+; GFX908: NumVGPRsForWavesPerEU: 32
+; GFX90A: NumVGPRsForWavesPerEU: 44
+; GFX90A: AccumOffset: 12
 ; GCN:    Occupancy: 8
-define amdgpu_kernel void @kernel_call_func_32_agprs() {
+; GFX90A: COMPUTE_PGM_RSRC3_GFX90A:ACCUM_OFFSET: 2
+define amdgpu_kernel void @kernel_call_func_32_agprs() #0 {
 bb:
   call void @func_32_agprs() #0
   ret void
@@ -108,8 +143,9 @@ bb:
 ; GCN-LABEL: {{^}}func_call_func_32_agprs:
 ; GCN:    NumVgprs: 9
 ; GCN:    NumAgprs: 32
-; GCN:    TotalNumVgprs: 32
-define void @func_call_func_32_agprs() {
+; GFX908: TotalNumVgprs: 32
+; GFX90A: TotalNumVgprs: 44
+define void @func_call_func_32_agprs() #0 {
 bb:
   call void @func_32_agprs() #0
   ret void
@@ -118,17 +154,25 @@ bb:
 declare void @undef_func()
 
 ; GCN-LABEL: {{^}}kernel_call_undef_func:
-; GCN:    .amdhsa_next_free_vgpr 24
+; GFX908: .amdhsa_next_free_vgpr 24
+; GFX90A: .amdhsa_next_free_vgpr 48
+; GFX90A: .amdhsa_accum_offset 24
 ; GCN:    NumVgprs: 24
 ; GCN:    NumAgprs: 24
-; GCN:    TotalNumVgprs: 24
-; GCN:    VGPRBlocks: 5
-; GCN:    NumVGPRsForWavesPerEU: 24
-; GCN:    Occupancy: 10
-define amdgpu_kernel void @kernel_call_undef_func() {
+; GFX908: TotalNumVgprs: 24
+; GFX90A: TotalNumVgprs: 48
+; GFX908: VGPRBlocks: 5
+; GFX90A: VGPRBlocks: 5
+; GFX908: NumVGPRsForWavesPerEU: 24
+; GFX90A: NumVGPRsForWavesPerEU: 48
+; GFX90A: AccumOffset: 24
+; GFX908: Occupancy: 10
+; GFX90A: Occupancy: 8
+; GFX90A: COMPUTE_PGM_RSRC3_GFX90A:ACCUM_OFFSET: 5
+define amdgpu_kernel void @kernel_call_undef_func() #0 {
 bb:
   call void @undef_func()
   ret void
 }
 
-attributes #0 = { nounwind noinline }
+attributes #0 = { nounwind noinline "amdgpu-flat-work-group-size"="1,512" }

diff  --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size-vgpr-limit.ll b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size-vgpr-limit.ll
index 6b159e0bc746..da32323d6744 100644
--- a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size-vgpr-limit.ll
+++ b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size-vgpr-limit.ll
@@ -1,6 +1,7 @@
 ; -enable-misched=false makes the register usage more predictable
 ; -regalloc=fast just makes the test run faster
 ; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-function-calls=false -enable-misched=false -regalloc=fast < %s | FileCheck %s --check-prefixes=GCN,GFX9
+; RUN: llc -march=amdgcn -mcpu=gfx90a -amdgpu-function-calls=false -enable-misched=false -regalloc=fast < %s | FileCheck %s --check-prefixes=GCN,GFX90A
 ; RUN: llc -march=amdgcn -mcpu=gfx1010 -amdgpu-function-calls=false -enable-misched=false -regalloc=fast < %s | FileCheck %s --check-prefixes=GCN,GFX10WGP-WAVE32
 ; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -amdgpu-function-calls=false -enable-misched=false -regalloc=fast < %s | FileCheck %s --check-prefixes=GCN,GFX10WGP-WAVE64
 ; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+cumode -amdgpu-function-calls=false -enable-misched=false -regalloc=fast < %s | FileCheck %s --check-prefixes=GCN,GFX10CU-WAVE32
@@ -524,6 +525,9 @@ define internal void @use256vgprs() {
 
 ; GCN-LABEL: {{^}}f256:
 ; GFX9: NumVgprs: 256
+; GFX90A: NumVgprs: 256
+; GFX90A: NumAgprs: 0
+; GFX90A: TotalNumVgprs: 256
 ; GFX10WGP-WAVE32: NumVgprs: 256
 ; GFX10WGP-WAVE64: NumVgprs: 256
 ; GFX10CU-WAVE32: NumVgprs: 256
@@ -536,6 +540,9 @@ attributes #256 = { nounwind "amdgpu-flat-work-group-size"="256,256" }
 
 ; GCN-LABEL: {{^}}f512:
 ; GFX9: NumVgprs: 128
+; GFX90A: NumVgprs: 128
+; GFX90A: NumAgprs: 128
+; GFX90A: TotalNumVgprs: 256
 ; GFX10WGP-WAVE32: NumVgprs: 256
 ; GFX10WGP-WAVE64: NumVgprs: 256
 ; GFX10CU-WAVE32: NumVgprs: 128
@@ -548,6 +555,9 @@ attributes #512 = { nounwind "amdgpu-flat-work-group-size"="512,512" }
 
 ; GCN-LABEL: {{^}}f1024:
 ; GFX9: NumVgprs: 64
+; GFX90A: NumVgprs: 64
+; GFX90A: NumAgprs: 64
+; GFX90A: TotalNumVgprs: 128
 ; GFX10WGP-WAVE32: NumVgprs: 128
 ; GFX10WGP-WAVE64: NumVgprs: 128
 ; GFX10CU-WAVE32: NumVgprs: 64

diff  --git a/llvm/test/CodeGen/AMDGPU/branch-relaxation-debug-info.mir b/llvm/test/CodeGen/AMDGPU/branch-relaxation-debug-info.mir
index a1029f2cf690..7a286bf3aef9 100644
--- a/llvm/test/CodeGen/AMDGPU/branch-relaxation-debug-info.mir
+++ b/llvm/test/CodeGen/AMDGPU/branch-relaxation-debug-info.mir
@@ -90,7 +90,7 @@ body:             |
     DBG_VALUE renamable $sgpr6_sgpr7, $noreg, !11, !DIExpression(DW_OP_plus_uconst, 12, DW_OP_stack_value), debug-location !12
     $vgpr1 = V_MOV_B32_e32 $sgpr6, implicit $exec, implicit-def $vgpr1_vgpr2, implicit $sgpr6_sgpr7
     $vgpr2 = V_MOV_B32_e32 $sgpr7, implicit $exec, implicit killed $sgpr6_sgpr7, implicit $exec
-    GLOBAL_STORE_DWORD killed renamable $vgpr1_vgpr2, renamable $vgpr0, 12, 0, 0, 0, implicit $exec, debug-location !12 :: (store 4 into %ir.tmp2, addrspace 1)
+    GLOBAL_STORE_DWORD killed renamable $vgpr1_vgpr2, renamable $vgpr0, 12, 0, 0, 0, 0, implicit $exec, debug-location !12 :: (store 4 into %ir.tmp2, addrspace 1)
     renamable $sgpr4 = S_MOV_B32 8388608
     renamable $sgpr4_sgpr5 = nofpexcept V_CMP_GT_F32_e64 0, killed $sgpr4, 0, killed $vgpr0, 0, implicit $mode, implicit $exec
     renamable $vcc = S_AND_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def dead $scc
@@ -102,7 +102,7 @@ body:             |
     renamable $sgpr4_sgpr5 = IMPLICIT_DEF
     $vgpr0 = V_MOV_B32_e32 $sgpr4, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr4_sgpr5
     $vgpr1 = V_MOV_B32_e32 $sgpr5, implicit $exec, implicit killed $sgpr4_sgpr5, implicit $exec
-    renamable $vgpr0 = GLOBAL_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `float addrspace(1)* undef`, addrspace 1)
+    renamable $vgpr0 = GLOBAL_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `float addrspace(1)* undef`, addrspace 1)
     renamable $sgpr4 = S_MOV_B32 2139095040
     S_WAITCNT 3952
     renamable $sgpr4_sgpr5 = nofpexcept V_CMP_NEQ_F32_e64 0, killed $sgpr4, 0, killed $vgpr0, 0, implicit $mode, implicit $exec

diff  --git a/llvm/test/CodeGen/AMDGPU/break-smem-soft-clauses.mir b/llvm/test/CodeGen/AMDGPU/break-smem-soft-clauses.mir
index bc172a33d512..acc47a065cc0 100644
--- a/llvm/test/CodeGen/AMDGPU/break-smem-soft-clauses.mir
+++ b/llvm/test/CodeGen/AMDGPU/break-smem-soft-clauses.mir
@@ -326,11 +326,11 @@ body: |
   bb.0:
     ; GCN-LABEL: name: flat_inst_breaks_smem_clause
     ; GCN: $sgpr0 = S_LOAD_DWORD_IMM $sgpr10_sgpr11, 0, 0, 0
-    ; GCN-NEXT: $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    ; GCN-NEXT: $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
     ; GCN-NEXT: $sgpr2 = S_LOAD_DWORD_IMM $sgpr12_sgpr13, 0, 0, 0
     ; GCN-NEXT: S_ENDPGM 0
     $sgpr0 = S_LOAD_DWORD_IMM $sgpr10_sgpr11, 0, 0, 0
-    $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
     $sgpr2 = S_LOAD_DWORD_IMM $sgpr12_sgpr13, 0, 0, 0
     S_ENDPGM 0
 ...

diff  --git a/llvm/test/CodeGen/AMDGPU/break-vmem-soft-clauses.mir b/llvm/test/CodeGen/AMDGPU/break-vmem-soft-clauses.mir
index f8f623c3a928..5def5f9382a2 100644
--- a/llvm/test/CodeGen/AMDGPU/break-vmem-soft-clauses.mir
+++ b/llvm/test/CodeGen/AMDGPU/break-vmem-soft-clauses.mir
@@ -9,10 +9,10 @@ name: trivial_clause_load_flat4_x1
 body: |
   bb.0:
     ; GCN-LABEL: name: trivial_clause_load_flat4_x1
-    ; GCN: $vgpr0 = FLAT_LOAD_DWORD $vgpr2_vgpr3, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    ; GCN: $vgpr0 = FLAT_LOAD_DWORD $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
     ; GCN-NEXT: S_ENDPGM 0
 
-    $vgpr0 = FLAT_LOAD_DWORD $vgpr2_vgpr3, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    $vgpr0 = FLAT_LOAD_DWORD $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
     S_ENDPGM 0
 ...
 ---
@@ -22,12 +22,12 @@ name: trivial_clause_load_flat4_x2
 body: |
   bb.0:
     ; GCN-LABEL: name: trivial_clause_load_flat4_x2
-    ; GCN: $vgpr0 = FLAT_LOAD_DWORD $vgpr2_vgpr3, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
-    ; GCN-NEXT: $vgpr1 = FLAT_LOAD_DWORD $vgpr4_vgpr5, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    ; GCN: $vgpr0 = FLAT_LOAD_DWORD $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    ; GCN-NEXT: $vgpr1 = FLAT_LOAD_DWORD $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
     ; GCN-NEXT: S_ENDPGM 0
 
-    $vgpr0 = FLAT_LOAD_DWORD $vgpr2_vgpr3, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
-    $vgpr1 = FLAT_LOAD_DWORD $vgpr4_vgpr5, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    $vgpr0 = FLAT_LOAD_DWORD $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    $vgpr1 = FLAT_LOAD_DWORD $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
     S_ENDPGM 0
 ...
 ---
@@ -37,14 +37,14 @@ name: trivial_clause_load_flat4_x3
 body: |
   bb.0:
     ; GCN-LABEL: name: trivial_clause_load_flat4_x3
-    ; GCN: $vgpr0 = FLAT_LOAD_DWORD $vgpr3_vgpr4, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
-    ; GCN-NEXT: $vgpr1 = FLAT_LOAD_DWORD $vgpr5_vgpr6, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
-    ; GCN-NEXT: $vgpr2 = FLAT_LOAD_DWORD $vgpr7_vgpr8, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    ; GCN: $vgpr0 = FLAT_LOAD_DWORD $vgpr3_vgpr4, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    ; GCN-NEXT: $vgpr1 = FLAT_LOAD_DWORD $vgpr5_vgpr6, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    ; GCN-NEXT: $vgpr2 = FLAT_LOAD_DWORD $vgpr7_vgpr8, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
     ; GCN-NEXT: S_ENDPGM 0
 
-    $vgpr0 = FLAT_LOAD_DWORD $vgpr3_vgpr4, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
-    $vgpr1 = FLAT_LOAD_DWORD $vgpr5_vgpr6, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
-    $vgpr2 = FLAT_LOAD_DWORD $vgpr7_vgpr8, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    $vgpr0 = FLAT_LOAD_DWORD $vgpr3_vgpr4, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    $vgpr1 = FLAT_LOAD_DWORD $vgpr5_vgpr6, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    $vgpr2 = FLAT_LOAD_DWORD $vgpr7_vgpr8, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
     S_ENDPGM 0
 ...
 ---
@@ -54,16 +54,16 @@ name: trivial_clause_load_flat4_x4
 body: |
   bb.0:
     ; GCN-LABEL: name: trivial_clause_load_flat4_x4
-    ; GCN: $vgpr0 = FLAT_LOAD_DWORD $vgpr4_vgpr5, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
-    ; GCN-NEXT: $vgpr1 = FLAT_LOAD_DWORD $vgpr6_vgpr7, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
-    ; GCN-NEXT: $vgpr2 = FLAT_LOAD_DWORD $vgpr8_vgpr9, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
-    ; GCN-NEXT: $vgpr3 = FLAT_LOAD_DWORD $vgpr10_vgpr11, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    ; GCN: $vgpr0 = FLAT_LOAD_DWORD $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    ; GCN-NEXT: $vgpr1 = FLAT_LOAD_DWORD $vgpr6_vgpr7, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    ; GCN-NEXT: $vgpr2 = FLAT_LOAD_DWORD $vgpr8_vgpr9, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    ; GCN-NEXT: $vgpr3 = FLAT_LOAD_DWORD $vgpr10_vgpr11, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
     ; GCN-NEXT: S_ENDPGM 0
 
-    $vgpr0 = FLAT_LOAD_DWORD $vgpr4_vgpr5, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
-    $vgpr1 = FLAT_LOAD_DWORD $vgpr6_vgpr7, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
-    $vgpr2 = FLAT_LOAD_DWORD $vgpr8_vgpr9, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
-    $vgpr3 = FLAT_LOAD_DWORD $vgpr10_vgpr11, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    $vgpr0 = FLAT_LOAD_DWORD $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    $vgpr1 = FLAT_LOAD_DWORD $vgpr6_vgpr7, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    $vgpr2 = FLAT_LOAD_DWORD $vgpr8_vgpr9, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    $vgpr3 = FLAT_LOAD_DWORD $vgpr10_vgpr11, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
     S_ENDPGM 0
 ...
 ---
@@ -73,12 +73,12 @@ name: trivial_clause_load_flat4_x2_sameptr
 body: |
   bb.0:
     ; GCN-LABEL: name: trivial_clause_load_flat4_x2_sameptr
-    ; GCN: $vgpr0 = FLAT_LOAD_DWORD $vgpr2_vgpr3, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
-    ; GCN-NEXT: $vgpr1 = FLAT_LOAD_DWORD $vgpr2_vgpr3, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    ; GCN: $vgpr0 = FLAT_LOAD_DWORD $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    ; GCN-NEXT: $vgpr1 = FLAT_LOAD_DWORD $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
     ; GCN-NEXT: S_ENDPGM 0
 
-    $vgpr0 = FLAT_LOAD_DWORD $vgpr2_vgpr3, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
-    $vgpr1 = FLAT_LOAD_DWORD $vgpr2_vgpr3, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    $vgpr0 = FLAT_LOAD_DWORD $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    $vgpr1 = FLAT_LOAD_DWORD $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
     S_ENDPGM 0
 ...
 ---
@@ -88,10 +88,10 @@ name: flat_load4_overwrite_ptr_lo
 body: |
   bb.0:
     ; GCN-LABEL: name: flat_load4_overwrite_ptr_lo
-    ; GCN: $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    ; GCN: $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
     ; GCN-NEXT: S_ENDPGM 0
 
-    $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
     S_ENDPGM 0
 ...
 ---
@@ -101,10 +101,10 @@ name: flat_load4_overwrite_ptr_hi
 body: |
   bb.0:
     ; GCN-LABEL: name: flat_load4_overwrite_ptr_hi
-    ; GCN: $vgpr1 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    ; GCN: $vgpr1 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
     ; GCN-NEXT: S_ENDPGM 0
 
-    $vgpr1 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    $vgpr1 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
     S_ENDPGM 0
 ...
 ---
@@ -114,10 +114,10 @@ name: flat_load8_overwrite_ptr
 body: |
   bb.0:
     ; GCN-LABEL: name: flat_load8_overwrite_ptr
-    ; GCN: $vgpr2_vgpr3 = FLAT_LOAD_DWORDX2 $vgpr2_vgpr3, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    ; GCN: $vgpr2_vgpr3 = FLAT_LOAD_DWORDX2 $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
     ; GCN-NEXT: S_ENDPGM 0
 
-    $vgpr2_vgpr3 = FLAT_LOAD_DWORDX2 $vgpr2_vgpr3, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    $vgpr2_vgpr3 = FLAT_LOAD_DWORDX2 $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
     S_ENDPGM 0
 ...
 ---
@@ -130,48 +130,48 @@ name: break_clause_at_max_clause_size_flat_load4
 body: |
   bb.0:
     ; GCN-LABEL: name: break_clause_at_max_clause_size_flat_load4
-    ; GCN: $vgpr2 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
-    ; GCN-NEXT: $vgpr3 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
-    ; GCN-NEXT: $vgpr4 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
-    ; GCN-NEXT: $vgpr5 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
-    ; GCN-NEXT: $vgpr6 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
-    ; GCN-NEXT: $vgpr7 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
-    ; GCN-NEXT: $vgpr8 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
-    ; GCN-NEXT: $vgpr9 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
-    ; GCN-NEXT: $vgpr10 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
-    ; GCN-NEXT: $vgpr11 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
-    ; GCN-NEXT: $vgpr12 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
-    ; GCN-NEXT: $vgpr13 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
-    ; GCN-NEXT: $vgpr14 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
-    ; GCN-NEXT: $vgpr15 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
-    ; GCN-NEXT: $vgpr16 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
-    ; GCN-NEXT: $vgpr17 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    ; GCN: $vgpr2 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    ; GCN-NEXT: $vgpr3 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    ; GCN-NEXT: $vgpr4 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    ; GCN-NEXT: $vgpr5 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    ; GCN-NEXT: $vgpr6 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    ; GCN-NEXT: $vgpr7 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    ; GCN-NEXT: $vgpr8 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    ; GCN-NEXT: $vgpr9 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    ; GCN-NEXT: $vgpr10 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    ; GCN-NEXT: $vgpr11 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    ; GCN-NEXT: $vgpr12 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    ; GCN-NEXT: $vgpr13 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    ; GCN-NEXT: $vgpr14 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    ; GCN-NEXT: $vgpr15 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    ; GCN-NEXT: $vgpr16 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    ; GCN-NEXT: $vgpr17 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
     ; XNACK-NEXT: S_NOP 0
-    ; GCN-NEXT: $vgpr0 = FLAT_LOAD_DWORD $vgpr2_vgpr3, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    ; GCN-NEXT: $vgpr0 = FLAT_LOAD_DWORD $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
     ; GCN-NEXT: $sgpr0 = S_MOV_B32 $sgpr0, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15, implicit $vgpr16, implicit $vgpr17, implicit $vgpr18
     ; GCN-NEXT: S_ENDPGM 0
 
-    $vgpr2 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
-    $vgpr3 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
-    $vgpr4 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
-    $vgpr5 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    $vgpr2 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    $vgpr3 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    $vgpr4 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    $vgpr5 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
 
-    $vgpr6 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
-    $vgpr7 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
-    $vgpr8 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
-    $vgpr9 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    $vgpr6 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    $vgpr7 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    $vgpr8 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    $vgpr9 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
 
-    $vgpr10 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
-    $vgpr11 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
-    $vgpr12 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
-    $vgpr13 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    $vgpr10 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    $vgpr11 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    $vgpr12 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    $vgpr13 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
 
-    $vgpr14 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
-    $vgpr15 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
-    $vgpr16 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
-    $vgpr17 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    $vgpr14 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    $vgpr15 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    $vgpr16 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    $vgpr17 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
 
-    $vgpr0 = FLAT_LOAD_DWORD $vgpr2_vgpr3, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    $vgpr0 = FLAT_LOAD_DWORD $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
     $sgpr0 = S_MOV_B32 $sgpr0, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15, implicit $vgpr16, implicit $vgpr17, implicit $vgpr18
     S_ENDPGM 0
 ...
@@ -182,13 +182,13 @@ name: break_clause_simple_load_flat4_lo_ptr
 body: |
   bb.0:
     ; GCN-LABEL: name: break_clause_simple_load_flat4_lo_ptr
-    ; GCN: $vgpr0 = FLAT_LOAD_DWORD $vgpr2_vgpr3, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    ; GCN: $vgpr0 = FLAT_LOAD_DWORD $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
     ; XNACK-NEXT: S_NOP 0
-    ; GCN-NEXT: $vgpr2 = FLAT_LOAD_DWORD $vgpr4_vgpr5, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    ; GCN-NEXT: $vgpr2 = FLAT_LOAD_DWORD $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
     ; GCN-NEXT: S_ENDPGM 0
 
-    $vgpr0 = FLAT_LOAD_DWORD $vgpr2_vgpr3, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
-    $vgpr2 = FLAT_LOAD_DWORD $vgpr4_vgpr5, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    $vgpr0 = FLAT_LOAD_DWORD $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    $vgpr2 = FLAT_LOAD_DWORD $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
     S_ENDPGM 0
 ...
 ---
@@ -198,13 +198,13 @@ name: break_clause_simple_load_flat4_hi_ptr
 body: |
   bb.0:
     ; GCN-LABEL: name: break_clause_simple_load_flat4_hi_ptr
-    ; GCN: $vgpr0 = FLAT_LOAD_DWORD $vgpr2_vgpr3, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    ; GCN: $vgpr0 = FLAT_LOAD_DWORD $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
     ; XNACK-NEXT: S_NOP 0
-    ; GCN-NEXT: $vgpr3 = FLAT_LOAD_DWORD $vgpr4_vgpr5, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    ; GCN-NEXT: $vgpr3 = FLAT_LOAD_DWORD $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
     ; GCN-NEXT: S_ENDPGM 0
 
-    $vgpr0 = FLAT_LOAD_DWORD $vgpr2_vgpr3, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
-    $vgpr3 = FLAT_LOAD_DWORD $vgpr4_vgpr5, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    $vgpr0 = FLAT_LOAD_DWORD $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    $vgpr3 = FLAT_LOAD_DWORD $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
     S_ENDPGM 0
 ...
 ---
@@ -214,13 +214,13 @@ name: break_clause_simple_load_flat8_ptr
 body: |
   bb.0:
     ; GCN-LABEL: name: break_clause_simple_load_flat8_ptr
-    ; GCN: $vgpr0_vgpr1 = FLAT_LOAD_DWORDX2 $vgpr2_vgpr3, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    ; GCN: $vgpr0_vgpr1 = FLAT_LOAD_DWORDX2 $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
     ; XNACK-NEXT: S_NOP 0
-    ; GCN-NEXT: $vgpr2_vgpr3 = FLAT_LOAD_DWORDX2 $vgpr4_vgpr5, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    ; GCN-NEXT: $vgpr2_vgpr3 = FLAT_LOAD_DWORDX2 $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
     ; GCN-NEXT: S_ENDPGM 0
 
-    $vgpr0_vgpr1 = FLAT_LOAD_DWORDX2 $vgpr2_vgpr3, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
-    $vgpr2_vgpr3 = FLAT_LOAD_DWORDX2 $vgpr4_vgpr5, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    $vgpr0_vgpr1 = FLAT_LOAD_DWORDX2 $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    $vgpr2_vgpr3 = FLAT_LOAD_DWORDX2 $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
     S_ENDPGM 0
 ...
 ---
@@ -231,12 +231,12 @@ name: break_clause_simple_load_flat16_ptr
 body: |
   bb.0:
     ; GCN-LABEL: name: break_clause_simple_load_flat16_ptr
-    ; GCN: $vgpr0_vgpr1 = FLAT_LOAD_DWORDX2 $vgpr2_vgpr3, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    ; GCN: $vgpr0_vgpr1 = FLAT_LOAD_DWORDX2 $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
     ; XNACK-NEXT: S_NOP 0
-    ; GCN-NEXT: $vgpr2_vgpr3_vgpr4_vgpr5 = FLAT_LOAD_DWORDX4 $vgpr6_vgpr7, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    ; GCN-NEXT: $vgpr2_vgpr3_vgpr4_vgpr5 = FLAT_LOAD_DWORDX4 $vgpr6_vgpr7, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
     ; GCN-NEXT: S_ENDPGM 0
-    $vgpr0_vgpr1 = FLAT_LOAD_DWORDX2 $vgpr2_vgpr3, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
-    $vgpr2_vgpr3_vgpr4_vgpr5 = FLAT_LOAD_DWORDX4 $vgpr6_vgpr7, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    $vgpr0_vgpr1 = FLAT_LOAD_DWORDX2 $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    $vgpr2_vgpr3_vgpr4_vgpr5 = FLAT_LOAD_DWORDX4 $vgpr6_vgpr7, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
     S_ENDPGM 0
 ...
 ---
@@ -251,17 +251,17 @@ body: |
   ; GCN-LABEL: name: break_clause_block_boundary_load_flat8_ptr
   ; GCN: bb.0:
   ; GCN-NEXT:   successors: %bb.1(0x80000000)
-  ; GCN:   $vgpr0_vgpr1 = FLAT_LOAD_DWORDX2 $vgpr2_vgpr3, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+  ; GCN:   $vgpr0_vgpr1 = FLAT_LOAD_DWORDX2 $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
   ; GCN: bb.1:
   ; XNACK-NEXT:  S_NOP 0
-  ; GCN-NEXT:   $vgpr2_vgpr3 = FLAT_LOAD_DWORDX2 $vgpr4_vgpr5, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+  ; GCN-NEXT:   $vgpr2_vgpr3 = FLAT_LOAD_DWORDX2 $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
   ; GCN-NEXT:   S_ENDPGM 0
 
   bb.0:
-    $vgpr0_vgpr1 = FLAT_LOAD_DWORDX2 $vgpr2_vgpr3, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    $vgpr0_vgpr1 = FLAT_LOAD_DWORDX2 $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
 
   bb.1:
-    $vgpr2_vgpr3 = FLAT_LOAD_DWORDX2 $vgpr4_vgpr5, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    $vgpr2_vgpr3 = FLAT_LOAD_DWORDX2 $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
     S_ENDPGM 0
 ...
 ---
@@ -272,12 +272,12 @@ name: break_clause_store_load_into_ptr_flat4
 body: |
   bb.0:
     ; GCN-LABEL: name: break_clause_store_load_into_ptr_flat4
-    ; GCN: FLAT_STORE_DWORD $vgpr2_vgpr3, $vgpr0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
-    ; GCN-NEXT: $vgpr2 = FLAT_LOAD_DWORD $vgpr4_vgpr5, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    ; GCN: FLAT_STORE_DWORD $vgpr2_vgpr3, $vgpr0, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    ; GCN-NEXT: $vgpr2 = FLAT_LOAD_DWORD $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
     ; GCN-NEXT: S_ENDPGM 0
 
-    FLAT_STORE_DWORD $vgpr2_vgpr3, $vgpr0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
-    $vgpr2 = FLAT_LOAD_DWORD $vgpr4_vgpr5, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    FLAT_STORE_DWORD $vgpr2_vgpr3, $vgpr0, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    $vgpr2 = FLAT_LOAD_DWORD $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
     S_ENDPGM 0
 ...
 ---
@@ -289,12 +289,12 @@ name: break_clause_store_load_into_data_flat4
 body: |
   bb.0:
     ; GCN-LABEL: name: break_clause_store_load_into_data_flat4
-    ; GCN: FLAT_STORE_DWORD $vgpr2_vgpr3, $vgpr0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
-    ; GCN-NEXT: $vgpr0 = FLAT_LOAD_DWORD $vgpr4_vgpr5, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    ; GCN: FLAT_STORE_DWORD $vgpr2_vgpr3, $vgpr0, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    ; GCN-NEXT: $vgpr0 = FLAT_LOAD_DWORD $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
     ; GCN-NEXT: S_ENDPGM 0
 
-    FLAT_STORE_DWORD $vgpr2_vgpr3, $vgpr0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
-    $vgpr0 = FLAT_LOAD_DWORD $vgpr4_vgpr5, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    FLAT_STORE_DWORD $vgpr2_vgpr3, $vgpr0, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    $vgpr0 = FLAT_LOAD_DWORD $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
     S_ENDPGM 0
 ...
 ---
@@ -305,14 +305,14 @@ name: valu_inst_breaks_clause
 body: |
   bb.0:
     ; GCN-LABEL: name: valu_inst_breaks_clause
-    ; GCN: $vgpr0 = FLAT_LOAD_DWORD $vgpr2_vgpr3, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    ; GCN: $vgpr0 = FLAT_LOAD_DWORD $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
     ; GCN-NEXT: $vgpr8 = V_MOV_B32_e32 0, implicit $exec
-    ; GCN-NEXT: $vgpr2 = FLAT_LOAD_DWORD $vgpr4_vgpr5, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    ; GCN-NEXT: $vgpr2 = FLAT_LOAD_DWORD $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
     ; GCN-NEXT: S_ENDPGM 0
 
-    $vgpr0 = FLAT_LOAD_DWORD $vgpr2_vgpr3, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    $vgpr0 = FLAT_LOAD_DWORD $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
     $vgpr8 = V_MOV_B32_e32 0, implicit $exec
-    $vgpr2 = FLAT_LOAD_DWORD $vgpr4_vgpr5, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    $vgpr2 = FLAT_LOAD_DWORD $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
     S_ENDPGM 0
 ...
 ---
@@ -323,14 +323,14 @@ name: salu_inst_breaks_clause
 body: |
   bb.0:
     ; GCN-LABEL: name: salu_inst_breaks_clause
-    ; GCN: $vgpr0 = FLAT_LOAD_DWORD $vgpr2_vgpr3, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    ; GCN: $vgpr0 = FLAT_LOAD_DWORD $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
     ; GCN-NEXT: $sgpr8 = S_MOV_B32 0
-    ; GCN-NEXT: $vgpr2 = FLAT_LOAD_DWORD $vgpr4_vgpr5, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    ; GCN-NEXT: $vgpr2 = FLAT_LOAD_DWORD $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
     ; GCN-NEXT: S_ENDPGM 0
 
-    $vgpr0 = FLAT_LOAD_DWORD $vgpr2_vgpr3, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    $vgpr0 = FLAT_LOAD_DWORD $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
     $sgpr8 = S_MOV_B32 0
-    $vgpr2 = FLAT_LOAD_DWORD $vgpr4_vgpr5, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    $vgpr2 = FLAT_LOAD_DWORD $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
     S_ENDPGM 0
 ...
 ---
@@ -340,14 +340,14 @@ name: ds_inst_breaks_clause
 body: |
   bb.0:
     ; GCN-LABEL: name: ds_inst_breaks_clause
-    ; GCN: $vgpr0 = FLAT_LOAD_DWORD $vgpr2_vgpr3, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    ; GCN: $vgpr0 = FLAT_LOAD_DWORD $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
     ; GCN-NEXT: $vgpr8 = DS_READ_B32 $vgpr9, 0, 0, implicit $m0, implicit $exec
-    ; GCN-NEXT: $vgpr2 = FLAT_LOAD_DWORD $vgpr4_vgpr5, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    ; GCN-NEXT: $vgpr2 = FLAT_LOAD_DWORD $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
     ; GCN-NEXT: S_ENDPGM 0
 
-    $vgpr0 = FLAT_LOAD_DWORD $vgpr2_vgpr3, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    $vgpr0 = FLAT_LOAD_DWORD $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
     $vgpr8 = DS_READ_B32 $vgpr9, 0, 0, implicit $m0, implicit $exec
-    $vgpr2 = FLAT_LOAD_DWORD $vgpr4_vgpr5, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    $vgpr2 = FLAT_LOAD_DWORD $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
     S_ENDPGM 0
 ...
 ---
@@ -357,14 +357,14 @@ name: smrd_inst_breaks_clause
 body: |
   bb.0:
     ; GCN-LABEL: name: smrd_inst_breaks_clause
-    ; GCN: $vgpr0 = FLAT_LOAD_DWORD $vgpr2_vgpr3, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    ; GCN: $vgpr0 = FLAT_LOAD_DWORD $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
     ; GCN-NEXT: $sgpr8 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0, 0
-    ; GCN-NEXT: $vgpr2 = FLAT_LOAD_DWORD $vgpr4_vgpr5, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    ; GCN-NEXT: $vgpr2 = FLAT_LOAD_DWORD $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
     ; GCN-NEXT: S_ENDPGM 0
 
-    $vgpr0 = FLAT_LOAD_DWORD $vgpr2_vgpr3, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    $vgpr0 = FLAT_LOAD_DWORD $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
     $sgpr8 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0, 0
-    $vgpr2 = FLAT_LOAD_DWORD $vgpr4_vgpr5, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    $vgpr2 = FLAT_LOAD_DWORD $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
     S_ENDPGM 0
 ...
 ---
@@ -374,13 +374,13 @@ name: implicit_use_breaks_clause
 body: |
   bb.0:
     ; GCN-LABEL: name: implicit_use_breaks_clause
-    ; GCN: $vgpr0_vgpr1 = FLAT_LOAD_DWORDX2 $vgpr2_vgpr3, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $vgpr4_vgpr5
+    ; GCN: $vgpr0_vgpr1 = FLAT_LOAD_DWORDX2 $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $vgpr4_vgpr5
     ; XNACK-NEXT: S_NOP 0
-    ; GCN-NEXT: $vgpr4_vgpr5 = FLAT_LOAD_DWORDX2 $vgpr6_vgpr7, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    ; GCN-NEXT: $vgpr4_vgpr5 = FLAT_LOAD_DWORDX2 $vgpr6_vgpr7, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
     ; GCN-NEXT: S_ENDPGM 0
 
-    $vgpr0_vgpr1 = FLAT_LOAD_DWORDX2 $vgpr2_vgpr3, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $vgpr4_vgpr5
-    $vgpr4_vgpr5 = FLAT_LOAD_DWORDX2 $vgpr6_vgpr7, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    $vgpr0_vgpr1 = FLAT_LOAD_DWORDX2 $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $vgpr4_vgpr5
+    $vgpr4_vgpr5 = FLAT_LOAD_DWORDX2 $vgpr6_vgpr7, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
     S_ENDPGM 0
 ...
 ---
@@ -389,12 +389,12 @@ name: trivial_clause_load_mubuf4_x2
 body: |
   bb.0:
     ; GCN-LABEL: name: trivial_clause_load_mubuf4_x2
-    ; GCN: $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec
-    ; GCN-NEXT: $vgpr3 = BUFFER_LOAD_DWORD_OFFEN $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GCN: $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GCN-NEXT: $vgpr3 = BUFFER_LOAD_DWORD_OFFEN $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     ; GCN-NEXT: S_ENDPGM 0
 
-    $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr3 = BUFFER_LOAD_DWORD_OFFEN $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr3 = BUFFER_LOAD_DWORD_OFFEN $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     S_ENDPGM 0
 ...
 ---
@@ -403,13 +403,13 @@ name: break_clause_simple_load_mubuf_offen_ptr
 body: |
   bb.0:
     ; GCN-LABEL: name: break_clause_simple_load_mubuf_offen_ptr
-    ; GCN: $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GCN: $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     ; XNACK-NEXT: S_NOP 0
-    ; GCN-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFEN $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GCN-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFEN $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     ; GCN-NEXT: S_ENDPGM 0
 
-    $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr2 = BUFFER_LOAD_DWORD_OFFEN $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr2 = BUFFER_LOAD_DWORD_OFFEN $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     S_ENDPGM 0
 ...
 ---
@@ -420,11 +420,11 @@ name: mubuf_load4_overwrite_ptr
 body: |
   bb.0:
     ; GCN-LABEL: name: mubuf_load4_overwrite_ptr
-    ; GCN: $vgpr0 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GCN: $vgpr0 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 0, implicit $exec
     ; GCN-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec
     ; GCN-NEXT: S_ENDPGM 0
-    $vgpr0 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr0 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     $vgpr1 = V_MOV_B32_e32 0, implicit $exec
     $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec
     S_ENDPGM 0
@@ -437,13 +437,13 @@ name: break_clause_flat_load_mubuf_load
 body: |
   bb.0:
     ; GCN-LABEL: name: break_clause_flat_load_mubuf_load
-    ; GCN: $vgpr0 = FLAT_LOAD_DWORD $vgpr2_vgpr3, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    ; GCN: $vgpr0 = FLAT_LOAD_DWORD $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
     ; XNACK-NEXT: S_NOP 0
-    ; GCN-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFEN $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GCN-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFEN $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     ; GCN-NEXT: S_ENDPGM 0
 
-    $vgpr0 = FLAT_LOAD_DWORD $vgpr2_vgpr3, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
-    $vgpr2 = BUFFER_LOAD_DWORD_OFFEN $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr0 = FLAT_LOAD_DWORD $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    $vgpr2 = BUFFER_LOAD_DWORD_OFFEN $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     S_ENDPGM 0
 ...
 # Break a clause from interference between mubuf and flat instructions
@@ -458,8 +458,8 @@ name: break_clause_mubuf_load_flat_load
 
 body: |
   bb.0:
-    $vgpr0 = BUFFER_LOAD_DWORD_OFFEN $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr1 = FLAT_LOAD_DWORD $vgpr2_vgpr3, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    $vgpr0 = BUFFER_LOAD_DWORD_OFFEN $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr1 = FLAT_LOAD_DWORD $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
 
     S_ENDPGM 0
 ...
@@ -470,13 +470,13 @@ name: break_clause_atomic_rtn_into_ptr_flat4
 body: |
   bb.0:
     ; GCN-LABEL: name: break_clause_atomic_rtn_into_ptr_flat4
-    ; GCN: $vgpr2 = FLAT_LOAD_DWORD $vgpr4_vgpr5, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    ; GCN: $vgpr2 = FLAT_LOAD_DWORD $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
     ; XNACK-NEXT: S_NOP 0
-    ; GCN-NEXT: $vgpr4 = FLAT_ATOMIC_ADD_RTN $vgpr5_vgpr6, $vgpr7, 0, 1, 0, implicit $exec, implicit $flat_scr
+    ; GCN-NEXT: $vgpr4 = FLAT_ATOMIC_ADD_RTN $vgpr5_vgpr6, $vgpr7, 0, 1, 0, 0, implicit $exec, implicit $flat_scr
     ; GCN-NEXT: S_ENDPGM 0
 
-    $vgpr2 = FLAT_LOAD_DWORD $vgpr4_vgpr5, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
-    $vgpr4 = FLAT_ATOMIC_ADD_RTN $vgpr5_vgpr6, $vgpr7, 0, 1, 0, implicit $exec, implicit $flat_scr
+    $vgpr2 = FLAT_LOAD_DWORD $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    $vgpr4 = FLAT_ATOMIC_ADD_RTN $vgpr5_vgpr6, $vgpr7, 0, 1, 0, 0, implicit $exec, implicit $flat_scr
     S_ENDPGM 0
 ...
 ---
@@ -485,12 +485,12 @@ name: break_clause_atomic_nortn_ptr_load_flat4
 body: |
   bb.0:
     ; GCN-LABEL: name: break_clause_atomic_nortn_ptr_load_flat4
-    ; GCN: FLAT_ATOMIC_ADD $vgpr0_vgpr1, $vgpr2, 0, 0, implicit $exec, implicit $flat_scr
-    ; GCN-NEXT: $vgpr2 = FLAT_LOAD_DWORD $vgpr3_vgpr4, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    ; GCN: FLAT_ATOMIC_ADD $vgpr0_vgpr1, $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr
+    ; GCN-NEXT: $vgpr2 = FLAT_LOAD_DWORD $vgpr3_vgpr4, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
     ; GCN-NEXT: S_ENDPGM 0
 
-    FLAT_ATOMIC_ADD $vgpr0_vgpr1, $vgpr2, 0, 0, implicit $exec, implicit $flat_scr
-    $vgpr2 = FLAT_LOAD_DWORD $vgpr3_vgpr4, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    FLAT_ATOMIC_ADD $vgpr0_vgpr1, $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr
+    $vgpr2 = FLAT_LOAD_DWORD $vgpr3_vgpr4, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
     S_ENDPGM 0
 ...
 ---
@@ -500,12 +500,12 @@ name: break_clause_atomic_rtn_into_ptr_mubuf4
 body: |
   bb.0:
     ; GCN-LABEL: name: break_clause_atomic_rtn_into_ptr_mubuf4
-    ; GCN: $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GCN: $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     ; XNACK-NEXT: S_NOP 0
     ; GCN-NEXT: $vgpr2 = BUFFER_ATOMIC_ADD_OFFEN_RTN $vgpr2, $vgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 1, 0, implicit $exec
     ; GCN-NEXT: S_ENDPGM 0
 
-    $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     $vgpr2 = BUFFER_ATOMIC_ADD_OFFEN_RTN $vgpr2, $vgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 1, 0, implicit $exec
     S_ENDPGM 0
 ...
@@ -517,11 +517,11 @@ body: |
   bb.0:
     ; GCN-LABEL: name: break_clause_atomic_nortn_ptr_load_mubuf4
     ; GCN: BUFFER_ATOMIC_ADD_OFFEN $vgpr0, $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, implicit $exec
-    ; GCN-NEXT: $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GCN-NEXT: $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     ; GCN-NEXT: S_ENDPGM 0
 
     BUFFER_ATOMIC_ADD_OFFEN $vgpr0, $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, implicit $exec
-    $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     S_ENDPGM 0
 ...
 ---
@@ -532,11 +532,11 @@ name: no_break_clause_mubuf_load_novaddr
 body: |
   bb.0:
     ; GCN-LABEL: name: no_break_clause_mubuf_load_novaddr
-    ; GCN: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec
-    ; GCN-NEXT: $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GCN: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GCN-NEXT: $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     ; GCN-NEXT: S_ENDPGM 0
-    $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     S_ENDPGM 0
 ...
 ---
@@ -546,16 +546,16 @@ name: mix_load_store_clause
 body: |
   bb.0:
     ; GCN-LABEL: name: mix_load_store_clause
-    ; GCN: FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr5, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
-    ; GCN-NEXT: $vgpr10 = FLAT_LOAD_DWORD $vgpr2_vgpr3, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    ; GCN: FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr5, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    ; GCN-NEXT: $vgpr10 = FLAT_LOAD_DWORD $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
     ; XNACK-NEXT: S_NOP 0
-    ; GCN-NEXT: FLAT_STORE_DWORD $vgpr2_vgpr3, $vgpr6, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
-    ; GCN-NEXT: $vgpr11 = FLAT_LOAD_DWORD $vgpr4_vgpr5, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    ; GCN-NEXT: FLAT_STORE_DWORD $vgpr2_vgpr3, $vgpr6, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    ; GCN-NEXT: $vgpr11 = FLAT_LOAD_DWORD $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
 
-    FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr5, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
-    $vgpr10 = FLAT_LOAD_DWORD $vgpr2_vgpr3, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
-    FLAT_STORE_DWORD $vgpr2_vgpr3, $vgpr6, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
-    $vgpr11 = FLAT_LOAD_DWORD $vgpr4_vgpr5, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr5, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    $vgpr10 = FLAT_LOAD_DWORD $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    FLAT_STORE_DWORD $vgpr2_vgpr3, $vgpr6, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    $vgpr11 = FLAT_LOAD_DWORD $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
     S_ENDPGM 0
 ...
 ---
@@ -565,15 +565,15 @@ name: mix_load_store_clause_same_address
 body: |
   bb.0:
     ; GCN-LABEL: name: mix_load_store_clause_same_address
-    ; GCN: FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr5, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
-    ; GCN-NEXT: $vgpr10 = FLAT_LOAD_DWORD $vgpr2_vgpr3, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    ; GCN: FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr5, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    ; GCN-NEXT: $vgpr10 = FLAT_LOAD_DWORD $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
     ; XNACK-NEXT: S_NOP 0
-    ; GCN-NEXT: FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr6, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
-    ; GCN-NEXT: $vgpr11 = FLAT_LOAD_DWORD $vgpr4_vgpr5, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    ; GCN-NEXT: FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr6, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    ; GCN-NEXT: $vgpr11 = FLAT_LOAD_DWORD $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
 
-    FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr5, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
-    $vgpr10 = FLAT_LOAD_DWORD $vgpr2_vgpr3, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
-    FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr6, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
-    $vgpr11 = FLAT_LOAD_DWORD $vgpr4_vgpr5, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr5, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    $vgpr10 = FLAT_LOAD_DWORD $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr6, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    $vgpr11 = FLAT_LOAD_DWORD $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
     S_ENDPGM 0
 ...

diff  --git a/llvm/test/CodeGen/AMDGPU/buffer-intrinsics-mmo-offsets.ll b/llvm/test/CodeGen/AMDGPU/buffer-intrinsics-mmo-offsets.ll
index ac69c2dca6b8..eea193145b62 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-intrinsics-mmo-offsets.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-intrinsics-mmo-offsets.ll
@@ -10,26 +10,26 @@ define amdgpu_cs void @mmo_offsets0(<4 x i32> addrspace(6)* inreg noalias derefe
   ; GCN:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
   ; GCN:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[S_MOV_B32_]], %subreg.sub1
   ; GCN:   [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM killed [[REG_SEQUENCE]], 0, 0, 0 :: (dereferenceable invariant load 16 from %ir.arg0, addrspace 6)
-  ; GCN:   [[BUFFER_LOAD_DWORDX4_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource" + 16, align 1, addrspace 4)
+  ; GCN:   [[BUFFER_LOAD_DWORDX4_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 16, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource" + 16, align 1, addrspace 4)
   ; GCN:   [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
-  ; GCN:   [[BUFFER_LOAD_DWORDX4_IDXEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource", align 1, addrspace 4)
-  ; GCN:   [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource", align 1, addrspace 4)
-  ; GCN:   [[BUFFER_LOAD_DWORDX4_IDXEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource", align 1, addrspace 4)
+  ; GCN:   [[BUFFER_LOAD_DWORDX4_IDXEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 16, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource", align 1, addrspace 4)
+  ; GCN:   [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource", align 1, addrspace 4)
+  ; GCN:   [[BUFFER_LOAD_DWORDX4_IDXEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 16, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource", align 1, addrspace 4)
   ; GCN:   INLINEASM &"", 1 /* sideeffect attdialect */
-  ; GCN:   BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 32, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource" + 32, align 1, addrspace 4)
-  ; GCN:   BUFFER_STORE_DWORDX4_OFFEN_exact killed [[BUFFER_LOAD_DWORDX4_OFFEN]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource", align 1, addrspace 4)
-  ; GCN:   BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 32, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource", align 1, addrspace 4)
-  ; GCN:   BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 32, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource", align 1, addrspace 4)
+  ; GCN:   BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 32, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource" + 32, align 1, addrspace 4)
+  ; GCN:   BUFFER_STORE_DWORDX4_OFFEN_exact killed [[BUFFER_LOAD_DWORDX4_OFFEN]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource", align 1, addrspace 4)
+  ; GCN:   BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 32, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource", align 1, addrspace 4)
+  ; GCN:   BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 32, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource", align 1, addrspace 4)
   ; GCN:   INLINEASM &"", 1 /* sideeffect attdialect */
-  ; GCN:   [[BUFFER_LOAD_FORMAT_XYZW_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 48, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource" + 48, align 1, addrspace 4)
-  ; GCN:   [[BUFFER_LOAD_FORMAT_XYZW_IDXEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 48, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource", align 1, addrspace 4)
-  ; GCN:   [[BUFFER_LOAD_FORMAT_XYZW_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource", align 1, addrspace 4)
-  ; GCN:   [[BUFFER_LOAD_FORMAT_XYZW_IDXEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 48, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource", align 1, addrspace 4)
+  ; GCN:   [[BUFFER_LOAD_FORMAT_XYZW_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 48, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource" + 48, align 1, addrspace 4)
+  ; GCN:   [[BUFFER_LOAD_FORMAT_XYZW_IDXEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 48, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource", align 1, addrspace 4)
+  ; GCN:   [[BUFFER_LOAD_FORMAT_XYZW_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource", align 1, addrspace 4)
+  ; GCN:   [[BUFFER_LOAD_FORMAT_XYZW_IDXEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 48, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource", align 1, addrspace 4)
   ; GCN:   INLINEASM &"", 1 /* sideeffect attdialect */
-  ; GCN:   BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 64, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource" + 64, align 1, addrspace 4)
-  ; GCN:   BUFFER_STORE_FORMAT_XYZW_OFFEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFEN]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource", align 1, addrspace 4)
-  ; GCN:   BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 64, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource", align 1, addrspace 4)
-  ; GCN:   BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 64, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource", align 1, addrspace 4)
+  ; GCN:   BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 64, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource" + 64, align 1, addrspace 4)
+  ; GCN:   BUFFER_STORE_FORMAT_XYZW_OFFEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFEN]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource", align 1, addrspace 4)
+  ; GCN:   BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 64, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource", align 1, addrspace 4)
+  ; GCN:   BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 64, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource", align 1, addrspace 4)
   ; GCN:   INLINEASM &"", 1 /* sideeffect attdialect */
   ; GCN:   BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 80, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource" + 80, align 1, addrspace 4)
   ; GCN:   BUFFER_ATOMIC_ADD_OFFEN [[COPY]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4)
@@ -39,16 +39,20 @@ define amdgpu_cs void @mmo_offsets0(<4 x i32> addrspace(6)* inreg noalias derefe
   ; GCN:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY]], %subreg.sub1
   ; GCN:   [[DEF:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
   ; GCN:   BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 96, -1, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource" + 96, align 1, addrspace 4)
-  ; GCN:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[DEF]].sub0
+  ; GCN:   [[COPY2:%[0-9]+]]:vreg_64 = COPY [[DEF]]
+  ; GCN:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY2]].sub0
   ; GCN:   [[DEF1:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
   ; GCN:   BUFFER_ATOMIC_CMPSWAP_OFFEN [[REG_SEQUENCE1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, -1, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4)
-  ; GCN:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[DEF1]].sub0
+  ; GCN:   [[COPY4:%[0-9]+]]:vreg_64 = COPY [[DEF1]]
+  ; GCN:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY4]].sub0
   ; GCN:   [[DEF2:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
   ; GCN:   BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 96, -1, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4)
-  ; GCN:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[DEF2]].sub0
+  ; GCN:   [[COPY6:%[0-9]+]]:vreg_64 = COPY [[DEF2]]
+  ; GCN:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY6]].sub0
   ; GCN:   [[DEF3:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
   ; GCN:   BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 96, -1, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4)
-  ; GCN:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[DEF3]].sub0
+  ; GCN:   [[COPY8:%[0-9]+]]:vreg_64 = COPY [[DEF3]]
+  ; GCN:   [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY8]].sub0
   ; GCN:   INLINEASM &"", 1 /* sideeffect attdialect */
   ; GCN:   [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1065353216, implicit $exec
   ; GCN:   BUFFER_ATOMIC_ADD_F32_OFFSET [[V_MOV_B32_e32_1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 112, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource" + 112, align 1, addrspace 4)
@@ -56,23 +60,23 @@ define amdgpu_cs void @mmo_offsets0(<4 x i32> addrspace(6)* inreg noalias derefe
   ; GCN:   BUFFER_ATOMIC_ADD_F32_IDXEN [[V_MOV_B32_e32_1]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 112, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4)
   ; GCN:   BUFFER_ATOMIC_ADD_F32_IDXEN [[V_MOV_B32_e32_1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 112, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4)
   ; GCN:   INLINEASM &"", 1 /* sideeffect attdialect */
-  ; GCN:   [[BUFFER_LOAD_DWORDX4_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 128, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource" + 128, align 1, addrspace 4)
+  ; GCN:   [[BUFFER_LOAD_DWORDX4_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 128, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource" + 128, align 1, addrspace 4)
   ; GCN:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 64
-  ; GCN:   [[BUFFER_LOAD_DWORDX4_OFFSET2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_1]], 64, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource" + 128, align 1, addrspace 4)
+  ; GCN:   [[BUFFER_LOAD_DWORDX4_OFFSET2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_1]], 64, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource" + 128, align 1, addrspace 4)
   ; GCN:   [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 128
-  ; GCN:   [[BUFFER_LOAD_DWORDX4_OFFSET3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_2]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource" + 128, align 1, addrspace 4)
-  ; GCN:   [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_2]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource", align 1, addrspace 4)
-  ; GCN:   [[COPY6:%[0-9]+]]:sreg_32 = COPY [[COPY]]
-  ; GCN:   [[BUFFER_LOAD_DWORDX4_OFFSET4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], [[COPY6]], 128, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource", align 1, addrspace 4)
+  ; GCN:   [[BUFFER_LOAD_DWORDX4_OFFSET3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_2]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource" + 128, align 1, addrspace 4)
+  ; GCN:   [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_2]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource", align 1, addrspace 4)
+  ; GCN:   [[COPY10:%[0-9]+]]:sreg_32 = COPY [[COPY]]
+  ; GCN:   [[BUFFER_LOAD_DWORDX4_OFFSET4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], [[COPY10]], 128, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource", align 1, addrspace 4)
   ; GCN:   INLINEASM &"", 1 /* sideeffect attdialect */
-  ; GCN:   [[BUFFER_LOAD_FORMAT_XYZW_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 144, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource" + 144, align 1, addrspace 4)
+  ; GCN:   [[BUFFER_LOAD_FORMAT_XYZW_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 144, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource" + 144, align 1, addrspace 4)
   ; GCN:   [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 72
-  ; GCN:   [[BUFFER_LOAD_FORMAT_XYZW_OFFSET2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_3]], 72, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource" + 144, align 1, addrspace 4)
+  ; GCN:   [[BUFFER_LOAD_FORMAT_XYZW_OFFSET2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_3]], 72, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource" + 144, align 1, addrspace 4)
   ; GCN:   [[S_MOV_B32_4:%[0-9]+]]:sreg_32 = S_MOV_B32 144
-  ; GCN:   [[BUFFER_LOAD_FORMAT_XYZW_OFFSET3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_4]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource" + 144, align 1, addrspace 4)
-  ; GCN:   [[BUFFER_LOAD_FORMAT_XYZW_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_4]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource", align 1, addrspace 4)
-  ; GCN:   [[COPY7:%[0-9]+]]:sreg_32 = COPY [[COPY]]
-  ; GCN:   [[BUFFER_LOAD_FORMAT_XYZW_OFFSET4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], [[COPY7]], 144, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource", align 1, addrspace 4)
+  ; GCN:   [[BUFFER_LOAD_FORMAT_XYZW_OFFSET3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_4]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource" + 144, align 1, addrspace 4)
+  ; GCN:   [[BUFFER_LOAD_FORMAT_XYZW_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_4]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource", align 1, addrspace 4)
+  ; GCN:   [[COPY11:%[0-9]+]]:sreg_32 = COPY [[COPY]]
+  ; GCN:   [[BUFFER_LOAD_FORMAT_XYZW_OFFSET4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], [[COPY11]], 144, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource", align 1, addrspace 4)
   ; GCN:   INLINEASM &"", 1 /* sideeffect attdialect */
   ; GCN:   BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 160, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource" + 160, align 1, addrspace 4)
   ; GCN:   [[S_MOV_B32_5:%[0-9]+]]:sreg_32 = S_MOV_B32 80
@@ -80,148 +84,160 @@ define amdgpu_cs void @mmo_offsets0(<4 x i32> addrspace(6)* inreg noalias derefe
   ; GCN:   [[S_MOV_B32_6:%[0-9]+]]:sreg_32 = S_MOV_B32 160
   ; GCN:   BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_6]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource" + 160, align 1, addrspace 4)
   ; GCN:   BUFFER_ATOMIC_ADD_OFFEN [[COPY]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_6]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4)
-  ; GCN:   [[COPY8:%[0-9]+]]:sreg_32 = COPY [[COPY]]
-  ; GCN:   BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[COPY8]], 160, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4)
+  ; GCN:   [[COPY12:%[0-9]+]]:sreg_32 = COPY [[COPY]]
+  ; GCN:   BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[COPY12]], 160, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4)
   ; GCN:   INLINEASM &"", 1 /* sideeffect attdialect */
   ; GCN:   [[DEF4:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
   ; GCN:   BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 176, -1, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource" + 176, align 1, addrspace 4)
-  ; GCN:   [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[DEF4]].sub0
+  ; GCN:   [[COPY13:%[0-9]+]]:vreg_64 = COPY [[DEF4]]
+  ; GCN:   [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY13]].sub0
   ; GCN:   [[S_MOV_B32_7:%[0-9]+]]:sreg_32 = S_MOV_B32 88
   ; GCN:   [[DEF5:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
   ; GCN:   BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE1]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_7]], 88, -1, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource" + 176, align 1, addrspace 4)
-  ; GCN:   [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[DEF5]].sub0
+  ; GCN:   [[COPY15:%[0-9]+]]:vreg_64 = COPY [[DEF5]]
+  ; GCN:   [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[COPY15]].sub0
   ; GCN:   [[S_MOV_B32_8:%[0-9]+]]:sreg_32 = S_MOV_B32 176
   ; GCN:   [[DEF6:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
   ; GCN:   BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_8]], 0, -1, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource" + 176, align 1, addrspace 4)
-  ; GCN:   [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[DEF6]].sub0
+  ; GCN:   [[COPY17:%[0-9]+]]:vreg_64 = COPY [[DEF6]]
+  ; GCN:   [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[COPY17]].sub0
   ; GCN:   [[DEF7:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
   ; GCN:   BUFFER_ATOMIC_CMPSWAP_OFFEN [[REG_SEQUENCE1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_8]], 0, -1, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4)
-  ; GCN:   [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[DEF7]].sub0
-  ; GCN:   [[COPY13:%[0-9]+]]:sreg_32 = COPY [[COPY]]
+  ; GCN:   [[COPY19:%[0-9]+]]:vreg_64 = COPY [[DEF7]]
+  ; GCN:   [[COPY20:%[0-9]+]]:vgpr_32 = COPY [[COPY19]].sub0
+  ; GCN:   [[COPY21:%[0-9]+]]:sreg_32 = COPY [[COPY]]
   ; GCN:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
-  ; GCN:   BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE1]], [[S_LOAD_DWORDX4_IMM]], [[COPY13]], 176, -1, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4)
-  ; GCN:   [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[DEF8]].sub0
+  ; GCN:   BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE1]], [[S_LOAD_DWORDX4_IMM]], [[COPY21]], 176, -1, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4)
+  ; GCN:   [[COPY22:%[0-9]+]]:vreg_64 = COPY [[DEF8]]
+  ; GCN:   [[COPY23:%[0-9]+]]:vgpr_32 = COPY [[COPY22]].sub0
   ; GCN:   INLINEASM &"", 1 /* sideeffect attdialect */
-  ; GCN:   BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 192, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource" + 192, align 1, addrspace 4)
+  ; GCN:   BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 192, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource" + 192, align 1, addrspace 4)
   ; GCN:   [[S_MOV_B32_9:%[0-9]+]]:sreg_32 = S_MOV_B32 96
-  ; GCN:   BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET2]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_9]], 96, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource" + 192, align 1, addrspace 4)
+  ; GCN:   BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET2]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_9]], 96, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource" + 192, align 1, addrspace 4)
   ; GCN:   [[S_MOV_B32_10:%[0-9]+]]:sreg_32 = S_MOV_B32 192
-  ; GCN:   BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET3]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_10]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource" + 192, align 1, addrspace 4)
-  ; GCN:   BUFFER_STORE_DWORDX4_OFFEN_exact killed [[BUFFER_LOAD_DWORDX4_OFFEN1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_10]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource", align 1, addrspace 4)
-  ; GCN:   [[COPY15:%[0-9]+]]:sreg_32 = COPY [[COPY]]
-  ; GCN:   BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET4]], [[S_LOAD_DWORDX4_IMM]], [[COPY15]], 192, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource", align 1, addrspace 4)
+  ; GCN:   BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET3]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_10]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource" + 192, align 1, addrspace 4)
+  ; GCN:   BUFFER_STORE_DWORDX4_OFFEN_exact killed [[BUFFER_LOAD_DWORDX4_OFFEN1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_10]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource", align 1, addrspace 4)
+  ; GCN:   [[COPY24:%[0-9]+]]:sreg_32 = COPY [[COPY]]
+  ; GCN:   BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET4]], [[S_LOAD_DWORDX4_IMM]], [[COPY24]], 192, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource", align 1, addrspace 4)
   ; GCN:   INLINEASM &"", 1 /* sideeffect attdialect */
-  ; GCN:   BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 208, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource" + 208, align 1, addrspace 4)
+  ; GCN:   BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 208, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource" + 208, align 1, addrspace 4)
   ; GCN:   [[S_MOV_B32_11:%[0-9]+]]:sreg_32 = S_MOV_B32 104
-  ; GCN:   BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET2]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_11]], 104, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource" + 208, align 1, addrspace 4)
+  ; GCN:   BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET2]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_11]], 104, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource" + 208, align 1, addrspace 4)
   ; GCN:   [[S_MOV_B32_12:%[0-9]+]]:sreg_32 = S_MOV_B32 208
-  ; GCN:   BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET3]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_12]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource" + 208, align 1, addrspace 4)
-  ; GCN:   BUFFER_STORE_FORMAT_XYZW_OFFEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFEN1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_12]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource", align 1, addrspace 4)
-  ; GCN:   [[COPY16:%[0-9]+]]:sreg_32 = COPY [[COPY]]
-  ; GCN:   BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET4]], [[S_LOAD_DWORDX4_IMM]], [[COPY16]], 208, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource", align 1, addrspace 4)
+  ; GCN:   BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET3]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_12]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource" + 208, align 1, addrspace 4)
+  ; GCN:   BUFFER_STORE_FORMAT_XYZW_OFFEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFEN1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_12]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource", align 1, addrspace 4)
+  ; GCN:   [[COPY25:%[0-9]+]]:sreg_32 = COPY [[COPY]]
+  ; GCN:   BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET4]], [[S_LOAD_DWORDX4_IMM]], [[COPY25]], 208, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource", align 1, addrspace 4)
   ; GCN:   INLINEASM &"", 1 /* sideeffect attdialect */
-  ; GCN:   [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; GCN:   [[BUFFER_LOAD_DWORDX4_IDXEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY17]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 224, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource" + 224, align 1, addrspace 4)
+  ; GCN:   [[COPY26:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+  ; GCN:   [[BUFFER_LOAD_DWORDX4_IDXEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY26]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 224, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource" + 224, align 1, addrspace 4)
   ; GCN:   [[S_MOV_B32_13:%[0-9]+]]:sreg_32 = S_MOV_B32 112
-  ; GCN:   [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; GCN:   [[BUFFER_LOAD_DWORDX4_IDXEN3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY18]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_13]], 112, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource" + 224, align 1, addrspace 4)
+  ; GCN:   [[COPY27:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+  ; GCN:   [[BUFFER_LOAD_DWORDX4_IDXEN3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY27]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_13]], 112, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource" + 224, align 1, addrspace 4)
   ; GCN:   [[S_MOV_B32_14:%[0-9]+]]:sreg_32 = S_MOV_B32 224
-  ; GCN:   [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; GCN:   [[BUFFER_LOAD_DWORDX4_IDXEN4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY19]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_14]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource" + 224, align 1, addrspace 4)
+  ; GCN:   [[COPY28:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+  ; GCN:   [[BUFFER_LOAD_DWORDX4_IDXEN4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY28]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_14]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource" + 224, align 1, addrspace 4)
   ; GCN:   [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[COPY]], %subreg.sub1
-  ; GCN:   [[BUFFER_LOAD_DWORDX4_BOTHEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_BOTHEN [[REG_SEQUENCE2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_14]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource", align 1, addrspace 4)
-  ; GCN:   [[COPY20:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; GCN:   [[COPY21:%[0-9]+]]:sreg_32 = COPY [[COPY]]
-  ; GCN:   [[BUFFER_LOAD_DWORDX4_IDXEN5:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY20]], [[S_LOAD_DWORDX4_IMM]], [[COPY21]], 224, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource", align 1, addrspace 4)
-  ; GCN:   [[BUFFER_LOAD_DWORDX4_IDXEN6:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 224, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource", align 1, addrspace 4)
-  ; GCN:   [[BUFFER_LOAD_DWORDX4_IDXEN7:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 224, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource", align 1, addrspace 4)
+  ; GCN:   [[BUFFER_LOAD_DWORDX4_BOTHEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_BOTHEN [[REG_SEQUENCE2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_14]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource", align 1, addrspace 4)
+  ; GCN:   [[COPY29:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+  ; GCN:   [[COPY30:%[0-9]+]]:sreg_32 = COPY [[COPY]]
+  ; GCN:   [[BUFFER_LOAD_DWORDX4_IDXEN5:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY29]], [[S_LOAD_DWORDX4_IMM]], [[COPY30]], 224, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource", align 1, addrspace 4)
+  ; GCN:   [[BUFFER_LOAD_DWORDX4_IDXEN6:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 224, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource", align 1, addrspace 4)
+  ; GCN:   [[BUFFER_LOAD_DWORDX4_IDXEN7:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 224, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource", align 1, addrspace 4)
   ; GCN:   INLINEASM &"", 1 /* sideeffect attdialect */
-  ; GCN:   [[COPY22:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; GCN:   [[BUFFER_LOAD_FORMAT_XYZW_IDXEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY22]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 240, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource" + 240, align 1, addrspace 4)
+  ; GCN:   [[COPY31:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+  ; GCN:   [[BUFFER_LOAD_FORMAT_XYZW_IDXEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY31]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 240, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource" + 240, align 1, addrspace 4)
   ; GCN:   [[S_MOV_B32_15:%[0-9]+]]:sreg_32 = S_MOV_B32 120
-  ; GCN:   [[COPY23:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; GCN:   [[BUFFER_LOAD_FORMAT_XYZW_IDXEN3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY23]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_15]], 120, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource" + 240, align 1, addrspace 4)
+  ; GCN:   [[COPY32:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+  ; GCN:   [[BUFFER_LOAD_FORMAT_XYZW_IDXEN3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY32]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_15]], 120, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource" + 240, align 1, addrspace 4)
   ; GCN:   [[S_MOV_B32_16:%[0-9]+]]:sreg_32 = S_MOV_B32 240
-  ; GCN:   [[COPY24:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; GCN:   [[BUFFER_LOAD_FORMAT_XYZW_IDXEN4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY24]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_16]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource" + 240, align 1, addrspace 4)
-  ; GCN:   [[BUFFER_LOAD_FORMAT_XYZW_BOTHEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_BOTHEN [[REG_SEQUENCE2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_16]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource", align 1, addrspace 4)
-  ; GCN:   [[COPY25:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; GCN:   [[COPY26:%[0-9]+]]:sreg_32 = COPY [[COPY]]
-  ; GCN:   [[BUFFER_LOAD_FORMAT_XYZW_IDXEN5:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY25]], [[S_LOAD_DWORDX4_IMM]], [[COPY26]], 240, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource", align 1, addrspace 4)
-  ; GCN:   [[BUFFER_LOAD_FORMAT_XYZW_IDXEN6:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 240, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource", align 1, addrspace 4)
-  ; GCN:   [[BUFFER_LOAD_FORMAT_XYZW_IDXEN7:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 240, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource", align 1, addrspace 4)
+  ; GCN:   [[COPY33:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+  ; GCN:   [[BUFFER_LOAD_FORMAT_XYZW_IDXEN4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY33]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_16]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource" + 240, align 1, addrspace 4)
+  ; GCN:   [[BUFFER_LOAD_FORMAT_XYZW_BOTHEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_BOTHEN [[REG_SEQUENCE2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_16]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource", align 1, addrspace 4)
+  ; GCN:   [[COPY34:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+  ; GCN:   [[COPY35:%[0-9]+]]:sreg_32 = COPY [[COPY]]
+  ; GCN:   [[BUFFER_LOAD_FORMAT_XYZW_IDXEN5:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY34]], [[S_LOAD_DWORDX4_IMM]], [[COPY35]], 240, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource", align 1, addrspace 4)
+  ; GCN:   [[BUFFER_LOAD_FORMAT_XYZW_IDXEN6:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 240, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource", align 1, addrspace 4)
+  ; GCN:   [[BUFFER_LOAD_FORMAT_XYZW_IDXEN7:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 240, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource", align 1, addrspace 4)
   ; GCN:   INLINEASM &"", 1 /* sideeffect attdialect */
-  ; GCN:   [[COPY27:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; GCN:   BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY27]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 256, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource" + 256, align 1, addrspace 4)
-  ; GCN:   [[COPY28:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; GCN:   BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY28]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_2]], 128, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource" + 256, align 1, addrspace 4)
+  ; GCN:   [[COPY36:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+  ; GCN:   BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY36]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 256, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource" + 256, align 1, addrspace 4)
+  ; GCN:   [[COPY37:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+  ; GCN:   BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY37]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_2]], 128, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource" + 256, align 1, addrspace 4)
   ; GCN:   [[S_MOV_B32_17:%[0-9]+]]:sreg_32 = S_MOV_B32 256
-  ; GCN:   [[COPY29:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; GCN:   BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY29]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_17]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource" + 256, align 1, addrspace 4)
+  ; GCN:   [[COPY38:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+  ; GCN:   BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY38]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_17]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource" + 256, align 1, addrspace 4)
   ; GCN:   BUFFER_ATOMIC_ADD_BOTHEN [[COPY]], [[REG_SEQUENCE2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_17]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4)
-  ; GCN:   [[COPY30:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; GCN:   [[COPY31:%[0-9]+]]:sreg_32 = COPY [[COPY]]
-  ; GCN:   BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY30]], [[S_LOAD_DWORDX4_IMM]], [[COPY31]], 256, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4)
+  ; GCN:   [[COPY39:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+  ; GCN:   [[COPY40:%[0-9]+]]:sreg_32 = COPY [[COPY]]
+  ; GCN:   BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY39]], [[S_LOAD_DWORDX4_IMM]], [[COPY40]], 256, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4)
   ; GCN:   BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 256, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4)
   ; GCN:   BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 256, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4)
   ; GCN:   INLINEASM &"", 1 /* sideeffect attdialect */
-  ; GCN:   [[COPY32:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+  ; GCN:   [[COPY41:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
   ; GCN:   [[DEF9:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
-  ; GCN:   BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[COPY32]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 272, -1, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource" + 272, align 1, addrspace 4)
-  ; GCN:   [[COPY33:%[0-9]+]]:vgpr_32 = COPY [[DEF9]].sub0
+  ; GCN:   BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[COPY41]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 272, -1, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource" + 272, align 1, addrspace 4)
+  ; GCN:   [[COPY42:%[0-9]+]]:vreg_64 = COPY [[DEF9]]
+  ; GCN:   [[COPY43:%[0-9]+]]:vgpr_32 = COPY [[COPY42]].sub0
   ; GCN:   [[S_MOV_B32_18:%[0-9]+]]:sreg_32 = S_MOV_B32 136
-  ; GCN:   [[COPY34:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+  ; GCN:   [[COPY44:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
   ; GCN:   [[DEF10:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
-  ; GCN:   BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[COPY34]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_18]], 136, -1, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource" + 272, align 1, addrspace 4)
-  ; GCN:   [[COPY35:%[0-9]+]]:vgpr_32 = COPY [[DEF10]].sub0
+  ; GCN:   BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[COPY44]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_18]], 136, -1, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource" + 272, align 1, addrspace 4)
+  ; GCN:   [[COPY45:%[0-9]+]]:vreg_64 = COPY [[DEF10]]
+  ; GCN:   [[COPY46:%[0-9]+]]:vgpr_32 = COPY [[COPY45]].sub0
   ; GCN:   [[S_MOV_B32_19:%[0-9]+]]:sreg_32 = S_MOV_B32 272
-  ; GCN:   [[COPY36:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+  ; GCN:   [[COPY47:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
   ; GCN:   [[DEF11:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
-  ; GCN:   BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[COPY36]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_19]], 0, -1, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource" + 272, align 1, addrspace 4)
-  ; GCN:   [[COPY37:%[0-9]+]]:vgpr_32 = COPY [[DEF11]].sub0
+  ; GCN:   BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[COPY47]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_19]], 0, -1, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource" + 272, align 1, addrspace 4)
+  ; GCN:   [[COPY48:%[0-9]+]]:vreg_64 = COPY [[DEF11]]
+  ; GCN:   [[COPY49:%[0-9]+]]:vgpr_32 = COPY [[COPY48]].sub0
   ; GCN:   [[DEF12:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
   ; GCN:   BUFFER_ATOMIC_CMPSWAP_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_19]], 0, -1, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4)
-  ; GCN:   [[COPY38:%[0-9]+]]:vgpr_32 = COPY [[DEF12]].sub0
-  ; GCN:   [[COPY39:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; GCN:   [[COPY40:%[0-9]+]]:sreg_32 = COPY [[COPY]]
+  ; GCN:   [[COPY50:%[0-9]+]]:vreg_64 = COPY [[DEF12]]
+  ; GCN:   [[COPY51:%[0-9]+]]:vgpr_32 = COPY [[COPY50]].sub0
+  ; GCN:   [[COPY52:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+  ; GCN:   [[COPY53:%[0-9]+]]:sreg_32 = COPY [[COPY]]
   ; GCN:   [[DEF13:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
-  ; GCN:   BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[COPY39]], [[S_LOAD_DWORDX4_IMM]], [[COPY40]], 272, -1, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4)
-  ; GCN:   [[COPY41:%[0-9]+]]:vgpr_32 = COPY [[DEF13]].sub0
+  ; GCN:   BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[COPY52]], [[S_LOAD_DWORDX4_IMM]], [[COPY53]], 272, -1, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4)
+  ; GCN:   [[COPY54:%[0-9]+]]:vreg_64 = COPY [[DEF13]]
+  ; GCN:   [[COPY55:%[0-9]+]]:vgpr_32 = COPY [[COPY54]].sub0
   ; GCN:   [[DEF14:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
   ; GCN:   BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 272, -1, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4)
-  ; GCN:   [[COPY42:%[0-9]+]]:vgpr_32 = COPY [[DEF14]].sub0
+  ; GCN:   [[COPY56:%[0-9]+]]:vreg_64 = COPY [[DEF14]]
+  ; GCN:   [[COPY57:%[0-9]+]]:vgpr_32 = COPY [[COPY56]].sub0
   ; GCN:   [[DEF15:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
   ; GCN:   BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 272, -1, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4)
-  ; GCN:   [[COPY43:%[0-9]+]]:vgpr_32 = COPY [[DEF15]].sub0
+  ; GCN:   [[COPY58:%[0-9]+]]:vreg_64 = COPY [[DEF15]]
+  ; GCN:   [[COPY59:%[0-9]+]]:vgpr_32 = COPY [[COPY58]].sub0
   ; GCN:   INLINEASM &"", 1 /* sideeffect attdialect */
-  ; GCN:   [[COPY44:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; GCN:   BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN2]], [[COPY44]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 288, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource" + 288, align 1, addrspace 4)
-  ; GCN:   [[COPY45:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; GCN:   BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN3]], [[COPY45]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_4]], 144, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource" + 288, align 1, addrspace 4)
+  ; GCN:   [[COPY60:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+  ; GCN:   BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN2]], [[COPY60]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 288, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource" + 288, align 1, addrspace 4)
+  ; GCN:   [[COPY61:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+  ; GCN:   BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN3]], [[COPY61]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_4]], 144, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource" + 288, align 1, addrspace 4)
   ; GCN:   [[S_MOV_B32_20:%[0-9]+]]:sreg_32 = S_MOV_B32 288
-  ; GCN:   [[COPY46:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; GCN:   BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN4]], [[COPY46]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_20]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource" + 288, align 1, addrspace 4)
-  ; GCN:   BUFFER_STORE_DWORDX4_BOTHEN_exact killed [[BUFFER_LOAD_DWORDX4_BOTHEN]], [[REG_SEQUENCE2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_20]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource", align 1, addrspace 4)
-  ; GCN:   [[COPY47:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; GCN:   [[COPY48:%[0-9]+]]:sreg_32 = COPY [[COPY]]
-  ; GCN:   BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN5]], [[COPY47]], [[S_LOAD_DWORDX4_IMM]], [[COPY48]], 288, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource", align 1, addrspace 4)
-  ; GCN:   BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN6]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 288, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource", align 1, addrspace 4)
-  ; GCN:   BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN7]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 288, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource", align 1, addrspace 4)
+  ; GCN:   [[COPY62:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+  ; GCN:   BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN4]], [[COPY62]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_20]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource" + 288, align 1, addrspace 4)
+  ; GCN:   BUFFER_STORE_DWORDX4_BOTHEN_exact killed [[BUFFER_LOAD_DWORDX4_BOTHEN]], [[REG_SEQUENCE2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_20]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource", align 1, addrspace 4)
+  ; GCN:   [[COPY63:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+  ; GCN:   [[COPY64:%[0-9]+]]:sreg_32 = COPY [[COPY]]
+  ; GCN:   BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN5]], [[COPY63]], [[S_LOAD_DWORDX4_IMM]], [[COPY64]], 288, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource", align 1, addrspace 4)
+  ; GCN:   BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN6]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 288, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource", align 1, addrspace 4)
+  ; GCN:   BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN7]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 288, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource", align 1, addrspace 4)
   ; GCN:   INLINEASM &"", 1 /* sideeffect attdialect */
-  ; GCN:   [[COPY49:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; GCN:   BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN2]], [[COPY49]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 304, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource" + 304, align 1, addrspace 4)
+  ; GCN:   [[COPY65:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+  ; GCN:   BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN2]], [[COPY65]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 304, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource" + 304, align 1, addrspace 4)
   ; GCN:   [[S_MOV_B32_21:%[0-9]+]]:sreg_32 = S_MOV_B32 152
-  ; GCN:   [[COPY50:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; GCN:   BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN3]], [[COPY50]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_21]], 152, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource" + 304, align 1, addrspace 4)
+  ; GCN:   [[COPY66:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+  ; GCN:   BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN3]], [[COPY66]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_21]], 152, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource" + 304, align 1, addrspace 4)
   ; GCN:   [[S_MOV_B32_22:%[0-9]+]]:sreg_32 = S_MOV_B32 304
-  ; GCN:   [[COPY51:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; GCN:   BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN4]], [[COPY51]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_22]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource" + 304, align 1, addrspace 4)
-  ; GCN:   BUFFER_STORE_FORMAT_XYZW_BOTHEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_BOTHEN]], [[REG_SEQUENCE2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_22]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource", align 1, addrspace 4)
-  ; GCN:   [[COPY52:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; GCN:   [[COPY53:%[0-9]+]]:sreg_32 = COPY [[COPY]]
-  ; GCN:   BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN5]], [[COPY52]], [[S_LOAD_DWORDX4_IMM]], [[COPY53]], 304, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource", align 1, addrspace 4)
-  ; GCN:   BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN6]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 304, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource", align 1, addrspace 4)
-  ; GCN:   BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN7]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 304, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource", align 1, addrspace 4)
+  ; GCN:   [[COPY67:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+  ; GCN:   BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN4]], [[COPY67]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_22]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource" + 304, align 1, addrspace 4)
+  ; GCN:   BUFFER_STORE_FORMAT_XYZW_BOTHEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_BOTHEN]], [[REG_SEQUENCE2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_22]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource", align 1, addrspace 4)
+  ; GCN:   [[COPY68:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+  ; GCN:   [[COPY69:%[0-9]+]]:sreg_32 = COPY [[COPY]]
+  ; GCN:   BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN5]], [[COPY68]], [[S_LOAD_DWORDX4_IMM]], [[COPY69]], 304, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource", align 1, addrspace 4)
+  ; GCN:   BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN6]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 304, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource", align 1, addrspace 4)
+  ; GCN:   BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN7]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 304, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource", align 1, addrspace 4)
   ; GCN:   S_ENDPGM 0
 bb.0:
   %tmp0 = load <4 x i32>, <4 x i32> addrspace(6)* %arg0, align 16, !invariant.load !0

diff  --git a/llvm/test/CodeGen/AMDGPU/bundle-latency.mir b/llvm/test/CodeGen/AMDGPU/bundle-latency.mir
index 2bb21dec55a2..7937b45fb639 100644
--- a/llvm/test/CodeGen/AMDGPU/bundle-latency.mir
+++ b/llvm/test/CodeGen/AMDGPU/bundle-latency.mir
@@ -10,14 +10,14 @@ body:             |
   bb.0:
     ; GCN-LABEL: name: src_bundle_latency
     ; GCN: $vgpr0, $vgpr1 = BUNDLE undef $vgpr3_vgpr4, implicit $exec {
-    ; GCN:   $vgpr0 = GLOBAL_LOAD_DWORD undef $vgpr3_vgpr4, 0, 0, 0, 0, implicit $exec
-    ; GCN:   $vgpr1 = GLOBAL_LOAD_DWORD undef $vgpr3_vgpr4, 4, 0, 0, 0, implicit $exec
+    ; GCN:   $vgpr0 = GLOBAL_LOAD_DWORD undef $vgpr3_vgpr4, 0, 0, 0, 0, 0, implicit $exec
+    ; GCN:   $vgpr1 = GLOBAL_LOAD_DWORD undef $vgpr3_vgpr4, 4, 0, 0, 0, 0, implicit $exec
     ; GCN: }
     ; GCN: $vgpr6 = V_ADD_F32_e32 killed $vgpr0, $vgpr0, implicit $mode, implicit $exec
     ; GCN: $vgpr5 = V_ADD_F32_e32 killed $vgpr1, $vgpr1, implicit $mode, implicit $exec
     $vgpr0, $vgpr1 = BUNDLE undef $vgpr3_vgpr4, implicit $exec {
-      $vgpr0 = GLOBAL_LOAD_DWORD undef $vgpr3_vgpr4, 0, 0, 0, 0, implicit $exec
-      $vgpr1 = GLOBAL_LOAD_DWORD undef $vgpr3_vgpr4, 4, 0, 0, 0, implicit $exec
+      $vgpr0 = GLOBAL_LOAD_DWORD undef $vgpr3_vgpr4, 0, 0, 0, 0, 0, implicit $exec
+      $vgpr1 = GLOBAL_LOAD_DWORD undef $vgpr3_vgpr4, 4, 0, 0, 0, 0, implicit $exec
     }
     $vgpr5 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
     $vgpr6 = V_ADD_F32_e32 $vgpr0, $vgpr0, implicit $mode, implicit $exec
@@ -32,13 +32,13 @@ body:             |
     ; GCN: $vgpr1 = V_ADD_F32_e32 undef $vgpr6, undef $vgpr6, implicit $mode, implicit $exec
     ; GCN: $vgpr0 = V_ADD_F32_e32 undef $vgpr5, undef $vgpr5, implicit $mode, implicit $exec
     ; GCN: BUNDLE killed $vgpr0, killed $vgpr1, undef $vgpr3_vgpr4, implicit $exec {
-    ; GCN:   GLOBAL_STORE_DWORD undef $vgpr3_vgpr4, killed $vgpr1, 0, 0, 0, 0, implicit $exec
-    ; GCN:   GLOBAL_STORE_DWORD undef $vgpr3_vgpr4, killed $vgpr0, 4, 0, 0, 0, implicit $exec
+    ; GCN:   GLOBAL_STORE_DWORD undef $vgpr3_vgpr4, killed $vgpr1, 0, 0, 0, 0, 0, implicit $exec
+    ; GCN:   GLOBAL_STORE_DWORD undef $vgpr3_vgpr4, killed $vgpr0, 4, 0, 0, 0, 0, implicit $exec
     ; GCN: }
     $vgpr0 = V_ADD_F32_e32 undef $vgpr5, undef $vgpr5, implicit $mode, implicit $exec
     $vgpr1 = V_ADD_F32_e32 undef $vgpr6, undef $vgpr6, implicit $mode, implicit $exec
     BUNDLE $vgpr0, $vgpr1, undef $vgpr3_vgpr4, implicit $exec {
-      GLOBAL_STORE_DWORD undef $vgpr3_vgpr4, $vgpr1, 0, 0, 0, 0, implicit $exec
-      GLOBAL_STORE_DWORD undef $vgpr3_vgpr4, $vgpr0, 4, 0, 0, 0, implicit $exec
+      GLOBAL_STORE_DWORD undef $vgpr3_vgpr4, $vgpr1, 0, 0, 0, 0, 0, implicit $exec
+      GLOBAL_STORE_DWORD undef $vgpr3_vgpr4, $vgpr0, 4, 0, 0, 0, 0, implicit $exec
     }
 ...

diff  --git a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs-packed.ll b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs-packed.ll
new file mode 100644
index 000000000000..b7a1e00a5522
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs-packed.ll
@@ -0,0 +1,743 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX7,UNPACKED-TID %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=-xnack -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX90A,PACKED-TID %s
+
+; GCN-LABEL: {{^}}use_workitem_id_x:
+; GCN: s_waitcnt
+; GCN: v_and_b32_e32 [[ID:v[0-9]+]], 0x3ff, v0
+; GCN-NEXT: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[ID]]
+; GCN-NEXT: s_waitcnt
+; GCN-NEXT: s_setpc_b64
+define void @use_workitem_id_x() #1 {
+  %val = call i32 @llvm.amdgcn.workitem.id.x()
+  store volatile i32 %val, i32 addrspace(1)* undef
+  ret void
+}
+
+; GCN-LABEL: {{^}}use_workitem_id_y:
+; GCN: s_waitcnt
+; GCN: v_bfe_u32 [[ID:v[0-9]+]], v0, 10, 10
+; GCN-NEXT: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[ID]]
+; GCN-NEXT: s_waitcnt
+; GCN-NEXT: s_setpc_b64
+define void @use_workitem_id_y() #1 {
+  %val = call i32 @llvm.amdgcn.workitem.id.y()
+  store volatile i32 %val, i32 addrspace(1)* undef
+  ret void
+}
+
+; GCN-LABEL: {{^}}use_workitem_id_z:
+; GCN: s_waitcnt
+; GCN: v_bfe_u32 [[ID:v[0-9]+]], v0, 20, 10
+; GCN-NEXT: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[ID]]
+; GCN-NEXT: s_waitcnt
+; GCN-NEXT: s_setpc_b64
+define void @use_workitem_id_z() #1 {
+  %val = call i32 @llvm.amdgcn.workitem.id.z()
+  store volatile i32 %val, i32 addrspace(1)* undef
+  ret void
+}
+
+; GCN-LABEL: {{^}}use_workitem_id_xy:
+; GCN: s_waitcnt
+; GCN-DAG: v_and_b32_e32 [[IDX:v[0-9]+]], 0x3ff, v0
+; GCN-DAG: v_bfe_u32 [[IDY:v[0-9]+]], v0, 10, 10
+; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDX]]
+; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDY]]
+; GCN-NEXT: s_waitcnt
+; GCN-NEXT: s_setpc_b64
+define void @use_workitem_id_xy() #1 {
+  %val0 = call i32 @llvm.amdgcn.workitem.id.x()
+  %val1 = call i32 @llvm.amdgcn.workitem.id.y()
+  store volatile i32 %val0, i32 addrspace(1)* undef
+  store volatile i32 %val1, i32 addrspace(1)* undef
+  ret void
+}
+
+; GCN-LABEL: {{^}}use_workitem_id_xyz:
+; GCN: s_waitcnt
+; GCN-DAG: v_and_b32_e32 [[IDX:v[0-9]+]], 0x3ff, v0
+; GCN-DAG: v_bfe_u32 [[IDY:v[0-9]+]], v0, 10, 10
+; GCN-DAG: v_bfe_u32 [[IDZ:v[0-9]+]], v0, 20, 10
+; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDX]]
+; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDY]]
+; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDZ]]
+; GCN-NEXT: s_waitcnt
+; GCN-NEXT: s_setpc_b64
+define void @use_workitem_id_xyz() #1 {
+  %val0 = call i32 @llvm.amdgcn.workitem.id.x()
+  %val1 = call i32 @llvm.amdgcn.workitem.id.y()
+  %val2 = call i32 @llvm.amdgcn.workitem.id.z()
+  store volatile i32 %val0, i32 addrspace(1)* undef
+  store volatile i32 %val1, i32 addrspace(1)* undef
+  store volatile i32 %val2, i32 addrspace(1)* undef
+  ret void
+}
+
+; GCN-LABEL: {{^}}use_workitem_id_xz:
+; GCN: s_waitcnt
+; GCN-DAG: v_and_b32_e32 [[IDX:v[0-9]+]], 0x3ff, v0
+; GCN-DAG: v_bfe_u32 [[IDZ:v[0-9]+]], v0, 20, 10
+; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDX]]
+; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDZ]]
+; GCN-NEXT: s_waitcnt
+; GCN-NEXT: s_setpc_b64
+define void @use_workitem_id_xz() #1 {
+  %val0 = call i32 @llvm.amdgcn.workitem.id.x()
+  %val1 = call i32 @llvm.amdgcn.workitem.id.z()
+  store volatile i32 %val0, i32 addrspace(1)* undef
+  store volatile i32 %val1, i32 addrspace(1)* undef
+  ret void
+}
+
+; GCN-LABEL: {{^}}use_workitem_id_yz:
+; GCN: s_waitcnt
+; GCN-DAG: v_bfe_u32 [[IDY:v[0-9]+]], v0, 10, 10
+; GCN-DAG: v_bfe_u32 [[IDZ:v[0-9]+]], v0, 20, 10
+; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDY]]
+; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDZ]]
+; GCN-NEXT: s_waitcnt
+; GCN-NEXT: s_setpc_b64
+define void @use_workitem_id_yz() #1 {
+  %val0 = call i32 @llvm.amdgcn.workitem.id.y()
+  %val1 = call i32 @llvm.amdgcn.workitem.id.z()
+  store volatile i32 %val0, i32 addrspace(1)* undef
+  store volatile i32 %val1, i32 addrspace(1)* undef
+  ret void
+}
+
+; GCN-LABEL: {{^}}kern_indirect_use_workitem_id_x:
+
+; GCN-NOT: v0
+; GCN: s_swappc_b64
+; GCN-NOT: v0
+
+; GCN: .amdhsa_system_vgpr_workitem_id 0
+define amdgpu_kernel void @kern_indirect_use_workitem_id_x() #1 {
+  call void @use_workitem_id_x()
+  ret void
+}
+
+; GCN-LABEL: {{^}}kern_indirect_use_workitem_id_y:
+
+; GCN-NOT: v0
+; GCN-NOT: v1
+; UNPACKED-TID: v_lshlrev_b32_e32 v0, 10, v1
+; UNPACKED-TID-NOT: v0
+; UNPACKED-TID-NOT: v1
+; GCN: s_swappc_b64
+
+; GCN: .amdhsa_system_vgpr_workitem_id 1
+define amdgpu_kernel void @kern_indirect_use_workitem_id_y() #1 {
+  call void @use_workitem_id_y()
+  ret void
+}
+
+; GCN-LABEL: {{^}}kern_indirect_use_workitem_id_z:
+
+; GCN-NOT: v0
+; GCN-NOT: v2
+; UNPACKED-TID: v_lshlrev_b32_e32 v0, 20, v2
+; UNPACKED-TID-NOT: v0
+; UNPACKED-TID-NOT: v1
+; GCN: s_swappc_b64
+
+; GCN: .amdhsa_system_vgpr_workitem_id 2
+define amdgpu_kernel void @kern_indirect_use_workitem_id_z() #1 {
+  call void @use_workitem_id_z()
+  ret void
+}
+
+; GCN-LABEL: {{^}}kern_indirect_use_workitem_id_xy:
+; UNPACKED-TID-NOT: v0
+; UNPACKED-TID-NOT: v1
+; UNPACKED-TID: v_lshlrev_b32_e32 [[IDY:v[0-9]+]], 10, v1
+; UNPACKED-TID: v_or_b32_e32 v0, v0, [[IDY]]
+; GCN-NOT: v0
+; GCN-NOT: v1
+; GCN: s_swappc_b64
+define amdgpu_kernel void @kern_indirect_use_workitem_id_xy() #1 {
+  call void @use_workitem_id_xy()
+  ret void
+}
+
+; GCN-LABEL: {{^}}kern_indirect_use_workitem_id_xz:
+; UNPACKED-TID-NOT: v0
+; UNPACKED-TID-NOT: v2
+; UNPACKED-TID: v_lshlrev_b32_e32 [[IDZ:v[0-9]+]], 20, v2
+; UNPACKED-TID: v_or_b32_e32 v0, v0, [[IDZ]]
+; GCN-NOT: v0
+; GCN-NOT: v2
+; GCN: s_swappc_b64
+define amdgpu_kernel void @kern_indirect_use_workitem_id_xz() #1 {
+  call void @use_workitem_id_xz()
+  ret void
+}
+
+; GCN-LABEL: {{^}}kern_indirect_use_workitem_id_yz:
+; UNPACKED-TID-NOT: v1
+; UNPACKED-TID-NOT: v2
+; UNPACKED-TID-DAG: v_lshlrev_b32_e32 [[IDY:v[0-9]+]], 10, v1
+; UNPACKED-TID-DAG: v_lshlrev_b32_e32 [[IDZ:v[0-9]+]], 20, v2
+; UNPACKED-TID: v_or_b32_e32 v0, [[IDY]], [[IDZ]]
+; GCN-NOT: v1
+; GCN-NOT: v2
+; GCN: s_swappc_b64
+define amdgpu_kernel void @kern_indirect_use_workitem_id_yz() #1 {
+  call void @use_workitem_id_yz()
+  ret void
+}
+
+; GCN-LABEL: {{^}}kern_indirect_use_workitem_id_xyz:
+; UNPACKED-TID-NOT: v0
+; UNPACKED-TID-NOT: v1
+; UNPACKED-TID-NOT: v2
+; UNPACKED-TID-DAG: v_lshlrev_b32_e32 [[IDY:v[0-9]+]], 10, v1
+; UNPACKED-TID-DAG: v_lshlrev_b32_e32 [[IDZ:v[0-9]+]], 20, v2
+; UNPACKED-TID-DAG: v_or_b32_e32 v0, v0, [[IDY]]
+; UNPACKED-TID-DAG: v_or_b32_e32 v0, v0, [[IDZ]]
+; GCN-NOT: v0
+; GCN-NOT: v1
+; GCN-NOT: v2
+; GCN: s_swappc_b64
+define amdgpu_kernel void @kern_indirect_use_workitem_id_xyz() #1 {
+  call void @use_workitem_id_xyz()
+  ret void
+}
+
+; GCN-LABEL: {{^}}func_indirect_use_workitem_id_x:
+; GCN-NOT: v0
+; GCN: s_swappc_b64
+; GCN-NOT: v0
+define void @func_indirect_use_workitem_id_x() #1 {
+  call void @use_workitem_id_x()
+  ret void
+}
+
+; GCN-LABEL: {{^}}func_indirect_use_workitem_id_y:
+; GCN-NOT: v0
+; GCN: s_swappc_b64
+; GCN-NOT: v0
+define void @func_indirect_use_workitem_id_y() #1 {
+  call void @use_workitem_id_y()
+  ret void
+}
+
+; GCN-LABEL: {{^}}func_indirect_use_workitem_id_z:
+; GCN-NOT: v0
+; GCN: s_swappc_b64
+; GCN-NOT: v0
+define void @func_indirect_use_workitem_id_z() #1 {
+  call void @use_workitem_id_z()
+  ret void
+}
+
+; GCN-LABEL: {{^}}other_arg_use_workitem_id_x:
+; GCN: s_waitcnt
+; GCN-DAG: v_and_b32_e32 [[ID:v[0-9]+]], 0x3ff, v1
+; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
+; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ID]]
+define void @other_arg_use_workitem_id_x(i32 %arg0) #1 {
+  %val = call i32 @llvm.amdgcn.workitem.id.x()
+  store volatile i32 %arg0, i32 addrspace(1)* undef
+  store volatile i32 %val, i32 addrspace(1)* undef
+  ret void
+}
+
+; GCN-LABEL: {{^}}other_arg_use_workitem_id_y:
+; GCN: s_waitcnt
+; GCN-DAG: v_bfe_u32 [[ID:v[0-9]+]], v1, 10, 10
+; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
+; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ID]]
+define void @other_arg_use_workitem_id_y(i32 %arg0) #1 {
+  %val = call i32 @llvm.amdgcn.workitem.id.y()
+  store volatile i32 %arg0, i32 addrspace(1)* undef
+  store volatile i32 %val, i32 addrspace(1)* undef
+  ret void
+}
+
+; GCN-LABEL: {{^}}other_arg_use_workitem_id_z:
+; GCN: s_waitcnt
+; GCN-DAG: v_bfe_u32 [[ID:v[0-9]+]], v1, 20, 10
+; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
+; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ID]]
+define void @other_arg_use_workitem_id_z(i32 %arg0) #1 {
+  %val = call i32 @llvm.amdgcn.workitem.id.z()
+  store volatile i32 %arg0, i32 addrspace(1)* undef
+  store volatile i32 %val, i32 addrspace(1)* undef
+  ret void
+}
+
+
+; GCN-LABEL: {{^}}kern_indirect_other_arg_use_workitem_id_x:
+
+; GCN: v_mov_b32_e32 v1, v0
+; GCN: v_mov_b32_e32 v0, 0x22b
+; GCN: s_swappc_b64
+
+; GCN: .amdhsa_system_vgpr_workitem_id 0
+define amdgpu_kernel void @kern_indirect_other_arg_use_workitem_id_x() #1 {
+  call void @other_arg_use_workitem_id_x(i32 555)
+  ret void
+}
+
+
+; GCN-LABEL: {{^}}kern_indirect_other_arg_use_workitem_id_y:
+
+; UNPACKED-TID: v_lshlrev_b32_e32 v1, 10, v1
+; PACKED-TID:   v_mov_b32_e32 v1, v0
+; GCN-NOT: v1
+; GCN: v_mov_b32_e32 v0, 0x22b
+; GCN-NOT: v1
+; GCN: s_swappc_b64
+; GCN-NOT: v0
+
+; GCN: .amdhsa_system_vgpr_workitem_id 1
+define amdgpu_kernel void @kern_indirect_other_arg_use_workitem_id_y() #1 {
+  call void @other_arg_use_workitem_id_y(i32 555)
+  ret void
+}
+
+; GCN-LABEL: {{^}}kern_indirect_other_arg_use_workitem_id_z:
+
+; GCN-DAG: v_mov_b32_e32 v0, 0x22b
+; UNPACKED-TID-DAG: v_lshlrev_b32_e32 v1, 20, v2
+; PACKED-TID-DAG: v_mov_b32_e32 v1, v0
+; GCN: s_swappc_b64
+; GCN-NOT: v0
+
+; GCN: .amdhsa_system_vgpr_workitem_id 2
+define amdgpu_kernel void @kern_indirect_other_arg_use_workitem_id_z() #1 {
+  call void @other_arg_use_workitem_id_z(i32 555)
+  ret void
+}
+
+; GCN-LABEL: {{^}}too_many_args_use_workitem_id_x:
+; GCN: buffer_load_dword v32, off, s[0:3], s32{{$}}
+; GCN: v_and_b32_e32 v32, 0x3ff, v32
+; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, v32
+; GCN: s_setpc_b64
+define void @too_many_args_use_workitem_id_x(
+  i32 %arg0, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %arg6, i32 %arg7,
+  i32 %arg8, i32 %arg9, i32 %arg10, i32 %arg11, i32 %arg12, i32 %arg13, i32 %arg14, i32 %arg15,
+  i32 %arg16, i32 %arg17, i32 %arg18, i32 %arg19, i32 %arg20, i32 %arg21, i32 %arg22, i32 %arg23,
+  i32 %arg24, i32 %arg25, i32 %arg26, i32 %arg27, i32 %arg28, i32 %arg29, i32 %arg30, i32 %arg31) #1 {
+  %val = call i32 @llvm.amdgcn.workitem.id.x()
+  store volatile i32 %val, i32 addrspace(1)* undef
+
+  store volatile i32 %arg0, i32 addrspace(1)* undef
+  store volatile i32 %arg1, i32 addrspace(1)* undef
+  store volatile i32 %arg2, i32 addrspace(1)* undef
+  store volatile i32 %arg3, i32 addrspace(1)* undef
+  store volatile i32 %arg4, i32 addrspace(1)* undef
+  store volatile i32 %arg5, i32 addrspace(1)* undef
+  store volatile i32 %arg6, i32 addrspace(1)* undef
+  store volatile i32 %arg7, i32 addrspace(1)* undef
+
+  store volatile i32 %arg8, i32 addrspace(1)* undef
+  store volatile i32 %arg9, i32 addrspace(1)* undef
+  store volatile i32 %arg10, i32 addrspace(1)* undef
+  store volatile i32 %arg11, i32 addrspace(1)* undef
+  store volatile i32 %arg12, i32 addrspace(1)* undef
+  store volatile i32 %arg13, i32 addrspace(1)* undef
+  store volatile i32 %arg14, i32 addrspace(1)* undef
+  store volatile i32 %arg15, i32 addrspace(1)* undef
+
+  store volatile i32 %arg16, i32 addrspace(1)* undef
+  store volatile i32 %arg17, i32 addrspace(1)* undef
+  store volatile i32 %arg18, i32 addrspace(1)* undef
+  store volatile i32 %arg19, i32 addrspace(1)* undef
+  store volatile i32 %arg20, i32 addrspace(1)* undef
+  store volatile i32 %arg21, i32 addrspace(1)* undef
+  store volatile i32 %arg22, i32 addrspace(1)* undef
+  store volatile i32 %arg23, i32 addrspace(1)* undef
+
+  store volatile i32 %arg24, i32 addrspace(1)* undef
+  store volatile i32 %arg25, i32 addrspace(1)* undef
+  store volatile i32 %arg26, i32 addrspace(1)* undef
+  store volatile i32 %arg27, i32 addrspace(1)* undef
+  store volatile i32 %arg28, i32 addrspace(1)* undef
+  store volatile i32 %arg29, i32 addrspace(1)* undef
+  store volatile i32 %arg30, i32 addrspace(1)* undef
+  store volatile i32 %arg31, i32 addrspace(1)* undef
+
+  ret void
+}
+
+; GCN-LABEL: {{^}}kern_call_too_many_args_use_workitem_id_x:
+
+; GCN: s_mov_b32 s32, 0
+; GCN: buffer_store_dword v0, off, s[0:3], s32{{$}}
+; GCN: s_swappc_b64
+
+; GCN: .amdhsa_system_vgpr_workitem_id 0
+define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_x() #1 {
+  call void @too_many_args_use_workitem_id_x(
+    i32 10, i32 20, i32 30, i32 40,
+    i32 50, i32 60, i32 70, i32 80,
+    i32 90, i32 100, i32 110, i32 120,
+    i32 130, i32 140, i32 150, i32 160,
+    i32 170, i32 180, i32 190, i32 200,
+    i32 210, i32 220, i32 230, i32 240,
+    i32 250, i32 260, i32 270, i32 280,
+    i32 290, i32 300, i32 310, i32 320)
+  ret void
+}
+
+; GCN-LABEL: {{^}}func_call_too_many_args_use_workitem_id_x:
+; GCN: s_mov_b32 s33, s32
+; GCN: buffer_store_dword v1, off, s[0:3], s32{{$}}
+; GCN: s_swappc_b64
+define void @func_call_too_many_args_use_workitem_id_x(i32 %arg0) #1 {
+  store volatile i32 %arg0, i32 addrspace(1)* undef
+  call void @too_many_args_use_workitem_id_x(
+    i32 10, i32 20, i32 30, i32 40,
+    i32 50, i32 60, i32 70, i32 80,
+    i32 90, i32 100, i32 110, i32 120,
+    i32 130, i32 140, i32 150, i32 160,
+    i32 170, i32 180, i32 190, i32 200,
+    i32 210, i32 220, i32 230, i32 240,
+    i32 250, i32 260, i32 270, i32 280,
+    i32 290, i32 300, i32 310, i32 320)
+  ret void
+}
+
+; Requires loading and storing to stack slot.
+; GCN-LABEL: {{^}}too_many_args_call_too_many_args_use_workitem_id_x:
+; GCN-DAG: s_add_u32 s32, s32, 0x400{{$}}
+; GCN-DAG: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GCN-DAG: buffer_load_dword v32, off, s[0:3], s33{{$}}
+
+; GCN: buffer_store_dword v32, off, s[0:3], s32{{$}}
+
+; GCN: s_swappc_b64
+
+; GCN: s_sub_u32 s32, s32, 0x400{{$}}
+; GCN: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GCN: s_setpc_b64
+define void @too_many_args_call_too_many_args_use_workitem_id_x(
+  i32 %arg0, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %arg6, i32 %arg7,
+  i32 %arg8, i32 %arg9, i32 %arg10, i32 %arg11, i32 %arg12, i32 %arg13, i32 %arg14, i32 %arg15,
+  i32 %arg16, i32 %arg17, i32 %arg18, i32 %arg19, i32 %arg20, i32 %arg21, i32 %arg22, i32 %arg23,
+  i32 %arg24, i32 %arg25, i32 %arg26, i32 %arg27, i32 %arg28, i32 %arg29, i32 %arg30, i32 %arg31) #1 {
+  call void @too_many_args_use_workitem_id_x(
+    i32 %arg0, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %arg6, i32 %arg7,
+    i32 %arg8, i32 %arg9, i32 %arg10, i32 %arg11, i32 %arg12, i32 %arg13, i32 %arg14, i32 %arg15,
+    i32 %arg16, i32 %arg17, i32 %arg18, i32 %arg19, i32 %arg20, i32 %arg21, i32 %arg22, i32 %arg23,
+    i32 %arg24, i32 %arg25, i32 %arg26, i32 %arg27, i32 %arg28, i32 %arg29, i32 %arg30, i32 %arg31)
+  ret void
+}
+
+; stack layout:
+; frame[0] = byval arg32
+; frame[1] = stack passed workitem ID x
+; frame[2] = VGPR spill slot
+
+; GCN-LABEL: {{^}}too_many_args_use_workitem_id_x_byval:
+; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:4
+; GCN-NEXT: s_waitcnt
+; GCN-NEXT: v_and_b32_e32 v32, 0x3ff, v32
+; GCN-NEXT: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, v32
+; GFX7: buffer_load_dword v0, off, s[0:3], s32 glc{{$}}
+; GFX90A: buffer_load_dword v0, off, s[0:3], s32 glc scc{{$}}
+; GCN: s_setpc_b64
+define void @too_many_args_use_workitem_id_x_byval(
+  i32 %arg0, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %arg6, i32 %arg7,
+  i32 %arg8, i32 %arg9, i32 %arg10, i32 %arg11, i32 %arg12, i32 %arg13, i32 %arg14, i32 %arg15,
+  i32 %arg16, i32 %arg17, i32 %arg18, i32 %arg19, i32 %arg20, i32 %arg21, i32 %arg22, i32 %arg23,
+  i32 %arg24, i32 %arg25, i32 %arg26, i32 %arg27, i32 %arg28, i32 %arg29, i32 %arg30, i32 %arg31, i32 addrspace(5)* byval(i32) %arg32) #1 {
+  %val = call i32 @llvm.amdgcn.workitem.id.x()
+  store volatile i32 %val, i32 addrspace(1)* undef
+
+  store volatile i32 %arg0, i32 addrspace(1)* undef
+  store volatile i32 %arg1, i32 addrspace(1)* undef
+  store volatile i32 %arg2, i32 addrspace(1)* undef
+  store volatile i32 %arg3, i32 addrspace(1)* undef
+  store volatile i32 %arg4, i32 addrspace(1)* undef
+  store volatile i32 %arg5, i32 addrspace(1)* undef
+  store volatile i32 %arg6, i32 addrspace(1)* undef
+  store volatile i32 %arg7, i32 addrspace(1)* undef
+
+  store volatile i32 %arg8, i32 addrspace(1)* undef
+  store volatile i32 %arg9, i32 addrspace(1)* undef
+  store volatile i32 %arg10, i32 addrspace(1)* undef
+  store volatile i32 %arg11, i32 addrspace(1)* undef
+  store volatile i32 %arg12, i32 addrspace(1)* undef
+  store volatile i32 %arg13, i32 addrspace(1)* undef
+  store volatile i32 %arg14, i32 addrspace(1)* undef
+  store volatile i32 %arg15, i32 addrspace(1)* undef
+
+  store volatile i32 %arg16, i32 addrspace(1)* undef
+  store volatile i32 %arg17, i32 addrspace(1)* undef
+  store volatile i32 %arg18, i32 addrspace(1)* undef
+  store volatile i32 %arg19, i32 addrspace(1)* undef
+  store volatile i32 %arg20, i32 addrspace(1)* undef
+  store volatile i32 %arg21, i32 addrspace(1)* undef
+  store volatile i32 %arg22, i32 addrspace(1)* undef
+  store volatile i32 %arg23, i32 addrspace(1)* undef
+
+  store volatile i32 %arg24, i32 addrspace(1)* undef
+  store volatile i32 %arg25, i32 addrspace(1)* undef
+  store volatile i32 %arg26, i32 addrspace(1)* undef
+  store volatile i32 %arg27, i32 addrspace(1)* undef
+  store volatile i32 %arg28, i32 addrspace(1)* undef
+  store volatile i32 %arg29, i32 addrspace(1)* undef
+  store volatile i32 %arg30, i32 addrspace(1)* undef
+  store volatile i32 %arg31, i32 addrspace(1)* undef
+  %private = load volatile i32, i32 addrspace(5)* %arg32
+  ret void
+}
+
+; sp[0] = byval
+; sp[1] = ??
+; sp[2] = stack passed workitem ID x
+
+; GCN-LABEL: {{^}}kern_call_too_many_args_use_workitem_id_x_byval:
+; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7{{$}}
+; GCN-DAG: s_movk_i32 s32, 0x400
+
+; GCN: buffer_store_dword [[K]], off, s[0:3], 0 offset:4
+; GCN: buffer_store_dword v0, off, s[0:3], s32 offset:4
+; GCN: buffer_load_dword [[RELOAD_BYVAL:v[0-9]+]], off, s[0:3], 0 offset:4
+
+; GCN: buffer_store_dword [[RELOAD_BYVAL]], off, s[0:3], s32{{$}}
+; GCN: v_mov_b32_e32 [[RELOAD_BYVAL]],
+; GCN: s_swappc_b64
+
+; GCN: .amdhsa_system_vgpr_workitem_id 0
+define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_x_byval() #1 {
+  %alloca = alloca i32, align 4, addrspace(5)
+  store volatile i32 999, i32 addrspace(5)* %alloca
+  call void @too_many_args_use_workitem_id_x_byval(
+    i32 10, i32 20, i32 30, i32 40,
+    i32 50, i32 60, i32 70, i32 80,
+    i32 90, i32 100, i32 110, i32 120,
+    i32 130, i32 140, i32 150, i32 160,
+    i32 170, i32 180, i32 190, i32 200,
+    i32 210, i32 220, i32 230, i32 240,
+    i32 250, i32 260, i32 270, i32 280,
+    i32 290, i32 300, i32 310, i32 320,
+    i32 addrspace(5)* %alloca)
+  ret void
+}
+
+; GCN-LABEL: {{^}}func_call_too_many_args_use_workitem_id_x_byval:
+; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7{{$}}
+; GFX7: buffer_store_dword [[K]], off, s[0:3], s33{{$}}
+; GFX90A: buffer_store_dword [[K]], off, s[0:3], s33 scc{{$}}
+; GCN: buffer_store_dword v0, off, s[0:3], s32 offset:4
+; GCN: buffer_load_dword [[RELOAD_BYVAL:v[0-9]+]], off, s[0:3], s33{{$}}
+; GCN: buffer_store_dword [[RELOAD_BYVAL]], off, s[0:3], s32{{$}}
+; GCN: v_mov_b32_e32 [[RELOAD_BYVAL]],
+; GCN: s_swappc_b64
+define void @func_call_too_many_args_use_workitem_id_x_byval() #1 {
+  %alloca = alloca i32, align 4, addrspace(5)
+  store volatile i32 999, i32 addrspace(5)* %alloca
+  call void @too_many_args_use_workitem_id_x_byval(
+    i32 10, i32 20, i32 30, i32 40,
+    i32 50, i32 60, i32 70, i32 80,
+    i32 90, i32 100, i32 110, i32 120,
+    i32 130, i32 140, i32 150, i32 160,
+    i32 170, i32 180, i32 190, i32 200,
+    i32 210, i32 220, i32 230, i32 240,
+    i32 250, i32 260, i32 270, i32 280,
+    i32 290, i32 300, i32 310, i32 320,
+    i32 addrspace(5)* %alloca)
+  ret void
+}
+
+; GCN-LABEL: {{^}}too_many_args_use_workitem_id_xyz:
+; GCN:    buffer_load_dword v32, off, s[0:3], s32{{$}}
+; GCN:    v_and_b32_e32 v33, 0x3ff, v32
+; GFX7:   flat_store_dword v{{\[[0-9]+:[0-9]+]}}, v33
+; GFX7:   v_bfe_u32 v33, v32, 10, 10
+; GFX90A: v_bfe_u32 v34, v32, 10, 10
+; GCN:    v_bfe_u32 v32, v32, 20, 10
+; GFX7:   flat_store_dword v{{\[[0-9]+:[0-9]+]}}, v33{{$}}
+; GFX7:   flat_store_dword v{{\[[0-9]+:[0-9]+]}}, v32{{$}}
+; GFX90A: global_store_dword v{{\[[0-9]+:[0-9]+]}}, v33, off scc{{$}}
+; GFX90A: global_store_dword v{{\[[0-9]+:[0-9]+]}}, v34, off scc{{$}}
+; GFX90A: global_store_dword v{{\[[0-9]+:[0-9]+]}}, v32, off scc{{$}}
+
+; GFX7-COUNT-32: flat_store_dword v{{\[[0-9]+:[0-9]+]}}
+; GFX90A-COUNT-32: global_store_dword v{{\[[0-9]+:[0-9]+]}}
+; GCN-NEXT: s_waitcnt
+; GCN-NEXT: s_setpc_b64
+define void @too_many_args_use_workitem_id_xyz(
+  i32 %arg0, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %arg6, i32 %arg7,
+  i32 %arg8, i32 %arg9, i32 %arg10, i32 %arg11, i32 %arg12, i32 %arg13, i32 %arg14, i32 %arg15,
+  i32 %arg16, i32 %arg17, i32 %arg18, i32 %arg19, i32 %arg20, i32 %arg21, i32 %arg22, i32 %arg23,
+  i32 %arg24, i32 %arg25, i32 %arg26, i32 %arg27, i32 %arg28, i32 %arg29, i32 %arg30, i32 %arg31) #1 {
+  %val0 = call i32 @llvm.amdgcn.workitem.id.x()
+  store volatile i32 %val0, i32 addrspace(1)* undef
+  %val1 = call i32 @llvm.amdgcn.workitem.id.y()
+  store volatile i32 %val1, i32 addrspace(1)* undef
+  %val2 = call i32 @llvm.amdgcn.workitem.id.z()
+  store volatile i32 %val2, i32 addrspace(1)* undef
+
+  store volatile i32 %arg0, i32 addrspace(1)* undef
+  store volatile i32 %arg1, i32 addrspace(1)* undef
+  store volatile i32 %arg2, i32 addrspace(1)* undef
+  store volatile i32 %arg3, i32 addrspace(1)* undef
+  store volatile i32 %arg4, i32 addrspace(1)* undef
+  store volatile i32 %arg5, i32 addrspace(1)* undef
+  store volatile i32 %arg6, i32 addrspace(1)* undef
+  store volatile i32 %arg7, i32 addrspace(1)* undef
+
+  store volatile i32 %arg8, i32 addrspace(1)* undef
+  store volatile i32 %arg9, i32 addrspace(1)* undef
+  store volatile i32 %arg10, i32 addrspace(1)* undef
+  store volatile i32 %arg11, i32 addrspace(1)* undef
+  store volatile i32 %arg12, i32 addrspace(1)* undef
+  store volatile i32 %arg13, i32 addrspace(1)* undef
+  store volatile i32 %arg14, i32 addrspace(1)* undef
+  store volatile i32 %arg15, i32 addrspace(1)* undef
+
+  store volatile i32 %arg16, i32 addrspace(1)* undef
+  store volatile i32 %arg17, i32 addrspace(1)* undef
+  store volatile i32 %arg18, i32 addrspace(1)* undef
+  store volatile i32 %arg19, i32 addrspace(1)* undef
+  store volatile i32 %arg20, i32 addrspace(1)* undef
+  store volatile i32 %arg21, i32 addrspace(1)* undef
+  store volatile i32 %arg22, i32 addrspace(1)* undef
+  store volatile i32 %arg23, i32 addrspace(1)* undef
+
+  store volatile i32 %arg24, i32 addrspace(1)* undef
+  store volatile i32 %arg25, i32 addrspace(1)* undef
+  store volatile i32 %arg26, i32 addrspace(1)* undef
+  store volatile i32 %arg27, i32 addrspace(1)* undef
+  store volatile i32 %arg28, i32 addrspace(1)* undef
+  store volatile i32 %arg29, i32 addrspace(1)* undef
+  store volatile i32 %arg30, i32 addrspace(1)* undef
+  store volatile i32 %arg31, i32 addrspace(1)* undef
+
+  ret void
+}
+
+; frame[0] = ID { Z, Y, X }
+
+; GCN-LABEL: {{^}}kern_call_too_many_args_use_workitem_id_xyz:
+
+; GCN-DAG: s_mov_b32 s32, 0
+
+; UNPACKED-TID-DAG: v_lshlrev_b32_e32 v1, 10, v1
+; UNPACKED-TID-DAG: v_or_b32_e32 v0, v0, v1
+; UNPACKED-TID-DAG: v_lshlrev_b32_e32 v2, 20, v2
+; UNPACKED-TID-DAG: v_or_b32_e32 v0, v0, v2
+; PACKED-TID-NOT: v0
+; PACKED-TID-NOT: v1
+; PACKED-TID-NOT: v2
+; GCN: buffer_store_dword v0, off, s[0:3], s32{{$}}
+; GCN: s_swappc_b64
+
+; GCN: .amdhsa_system_vgpr_workitem_id 2
+define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_xyz() #1 {
+  call void @too_many_args_use_workitem_id_xyz(
+    i32 10, i32 20, i32 30, i32 40,
+    i32 50, i32 60, i32 70, i32 80,
+    i32 90, i32 100, i32 110, i32 120,
+    i32 130, i32 140, i32 150, i32 160,
+    i32 170, i32 180, i32 190, i32 200,
+    i32 210, i32 220, i32 230, i32 240,
+    i32 250, i32 260, i32 270, i32 280,
+    i32 290, i32 300, i32 310, i32 320)
+  ret void
+}
+
+; workitem ID X in register, yz on stack
+; v31 = workitem ID X
+; frame[0] = workitem { Z, Y, X }
+
+; GCN-LABEL: {{^}}too_many_args_use_workitem_id_x_stack_yz:
+; GCN-DAG: v_and_b32_e32 [[IDX:v[0-9]+]], 0x3ff, v31
+; GCN-DAG: {{flat|global}}_store_dword v[0:1], [[IDX]]
+; GCN-DAG: v_bfe_u32 [[IDY:v[0-9]+]], v31, 10, 10
+; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, [[IDY]]
+; GCN-DAG: v_bfe_u32 [[IDZ:v[0-9]+]], v31, 20, 10
+; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, [[IDZ]]
+
+; GCN-COUNT-31: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}
+; GCN-NEXT: s_waitcnt
+; GCN: s_setpc_b64
+; GCN: ScratchSize: 0
+define void @too_many_args_use_workitem_id_x_stack_yz(
+  i32 %arg0, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %arg6, i32 %arg7,
+  i32 %arg8, i32 %arg9, i32 %arg10, i32 %arg11, i32 %arg12, i32 %arg13, i32 %arg14, i32 %arg15,
+  i32 %arg16, i32 %arg17, i32 %arg18, i32 %arg19, i32 %arg20, i32 %arg21, i32 %arg22, i32 %arg23,
+  i32 %arg24, i32 %arg25, i32 %arg26, i32 %arg27, i32 %arg28, i32 %arg29, i32 %arg30) #1 {
+  %val0 = call i32 @llvm.amdgcn.workitem.id.x()
+  store volatile i32 %val0, i32 addrspace(1)* undef
+  %val1 = call i32 @llvm.amdgcn.workitem.id.y()
+  store volatile i32 %val1, i32 addrspace(1)* undef
+  %val2 = call i32 @llvm.amdgcn.workitem.id.z()
+  store volatile i32 %val2, i32 addrspace(1)* undef
+
+  store volatile i32 %arg0, i32 addrspace(1)* undef
+  store volatile i32 %arg1, i32 addrspace(1)* undef
+  store volatile i32 %arg2, i32 addrspace(1)* undef
+  store volatile i32 %arg3, i32 addrspace(1)* undef
+  store volatile i32 %arg4, i32 addrspace(1)* undef
+  store volatile i32 %arg5, i32 addrspace(1)* undef
+  store volatile i32 %arg6, i32 addrspace(1)* undef
+  store volatile i32 %arg7, i32 addrspace(1)* undef
+
+  store volatile i32 %arg8, i32 addrspace(1)* undef
+  store volatile i32 %arg9, i32 addrspace(1)* undef
+  store volatile i32 %arg10, i32 addrspace(1)* undef
+  store volatile i32 %arg11, i32 addrspace(1)* undef
+  store volatile i32 %arg12, i32 addrspace(1)* undef
+  store volatile i32 %arg13, i32 addrspace(1)* undef
+  store volatile i32 %arg14, i32 addrspace(1)* undef
+  store volatile i32 %arg15, i32 addrspace(1)* undef
+
+  store volatile i32 %arg16, i32 addrspace(1)* undef
+  store volatile i32 %arg17, i32 addrspace(1)* undef
+  store volatile i32 %arg18, i32 addrspace(1)* undef
+  store volatile i32 %arg19, i32 addrspace(1)* undef
+  store volatile i32 %arg20, i32 addrspace(1)* undef
+  store volatile i32 %arg21, i32 addrspace(1)* undef
+  store volatile i32 %arg22, i32 addrspace(1)* undef
+  store volatile i32 %arg23, i32 addrspace(1)* undef
+
+  store volatile i32 %arg24, i32 addrspace(1)* undef
+  store volatile i32 %arg25, i32 addrspace(1)* undef
+  store volatile i32 %arg26, i32 addrspace(1)* undef
+  store volatile i32 %arg27, i32 addrspace(1)* undef
+  store volatile i32 %arg28, i32 addrspace(1)* undef
+  store volatile i32 %arg29, i32 addrspace(1)* undef
+  store volatile i32 %arg30, i32 addrspace(1)* undef
+
+  ret void
+}
+
+; GCN-LABEL: {{^}}kern_call_too_many_args_use_workitem_id_x_stack_yz:
+
+; GCN-NOT: v0
+; UNPACKED-TID-DAG: v_lshlrev_b32_e32 v1, 10, v1
+; UNPACKED-TID-DAG: v_or_b32_e32 v0, v0, v1
+; UNPACKED-TID-DAG: v_lshlrev_b32_e32 v2, 20, v2
+; UNPACKED-TID-DAG: v_or_b32_e32 v31, v0, v2
+; PACKED-TID: v_mov_b32_e32 v31, v0
+
+; GCN: s_mov_b32 s32, 0
+; GCN: s_swappc_b64
+
+; GCN: .amdhsa_system_vgpr_workitem_id 2
+define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_x_stack_yz() #1 {
+  call void @too_many_args_use_workitem_id_x_stack_yz(
+    i32 10, i32 20, i32 30, i32 40,
+    i32 50, i32 60, i32 70, i32 80,
+    i32 90, i32 100, i32 110, i32 120,
+    i32 130, i32 140, i32 150, i32 160,
+    i32 170, i32 180, i32 190, i32 200,
+    i32 210, i32 220, i32 230, i32 240,
+    i32 250, i32 260, i32 270, i32 280,
+    i32 290, i32 300, i32 310)
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+declare i32 @llvm.amdgcn.workitem.id.y() #0
+declare i32 @llvm.amdgcn.workitem.id.z() #0
+
+attributes #0 = { nounwind readnone speculatable "amdgpu-flat-work-group-size"="1,512" }
+attributes #1 = { nounwind noinline "amdgpu-flat-work-group-size"="1,512" }

diff  --git a/llvm/test/CodeGen/AMDGPU/clamp-omod-special-case.mir b/llvm/test/CodeGen/AMDGPU/clamp-omod-special-case.mir
index 58f69a8106f1..64327f7e6d0c 100644
--- a/llvm/test/CodeGen/AMDGPU/clamp-omod-special-case.mir
+++ b/llvm/test/CodeGen/AMDGPU/clamp-omod-special-case.mir
@@ -55,10 +55,10 @@ body:             |
     %26 = V_LSHL_B64_e64 killed %25, 2, implicit $exec
     %16 = REG_SEQUENCE killed %4, 17, %12, 18
     %18 = COPY %26
-    %17 = BUFFER_LOAD_DWORD_ADDR64 %26, killed %13, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    %17 = BUFFER_LOAD_DWORD_ADDR64 %26, killed %13, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     %20 = V_ADD_F32_e64 0, killed %17, 0, 1065353216, 0, 0, implicit $mode, implicit $exec
     %21 = V_MAX_F32_e64 0, killed %20, 0, killed %20, 0, 0, implicit $mode, implicit $exec
-    BUFFER_STORE_DWORD_ADDR64 killed %21, %26, killed %16, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    BUFFER_STORE_DWORD_ADDR64 killed %21, %26, killed %16, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     S_ENDPGM 0
 
 ...
@@ -117,10 +117,10 @@ body:             |
     %26 = V_LSHL_B64_e64 killed %25, 2, implicit $exec
     %16 = REG_SEQUENCE killed %4, 17, %12, 18
     %18 = COPY %26
-    %17 = BUFFER_LOAD_DWORD_ADDR64 %26, killed %13, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    %17 = BUFFER_LOAD_DWORD_ADDR64 %26, killed %13, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     %20 = V_ADD_F32_e64 0, killed %17, 0, 1065353216, 0, 0, implicit $mode, implicit $exec
     %21 = V_MAX_F32_e64 0, killed %20, 0, killed %20, 1, 3, implicit $mode, implicit $exec
-    BUFFER_STORE_DWORD_ADDR64 killed %21, %26, killed %16, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    BUFFER_STORE_DWORD_ADDR64 killed %21, %26, killed %16, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     S_ENDPGM 0
 ...
 ---
@@ -180,10 +180,10 @@ body:             |
     %26 = V_LSHL_B64_e64 killed %25, 2, implicit $exec
     %16 = REG_SEQUENCE killed %4, 17, %12, 18
     %18 = COPY %26
-    %17 = BUFFER_LOAD_DWORD_ADDR64 %26, killed %13, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    %17 = BUFFER_LOAD_DWORD_ADDR64 %26, killed %13, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     %20 = V_ADD_F32_e64 0, killed %17, 0, 1065353216, 0, 0, implicit $mode, implicit $exec
     %21 = V_MUL_F32_e64 0, killed %20, 0, 1056964608, 0, 3, implicit $mode, implicit $exec
-    BUFFER_STORE_DWORD_ADDR64 killed %21, %26, killed %16, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    BUFFER_STORE_DWORD_ADDR64 killed %21, %26, killed %16, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     S_ENDPGM 0
 
 ...
@@ -245,10 +245,10 @@ body:             |
     %26 = V_LSHL_B64_e64 killed %25, 2, implicit $exec
     %16 = REG_SEQUENCE killed %4, 17, %12, 18
     %18 = COPY %26
-    %17 = BUFFER_LOAD_DWORD_ADDR64 %26, killed %13, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    %17 = BUFFER_LOAD_DWORD_ADDR64 %26, killed %13, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     %20 = V_ADD_F32_e64 0, killed %17, 0, 1065353216, 0, 0, implicit $mode, implicit $exec
     %21 = V_MUL_F32_e64 0, killed %20, 0, 1056964608, 1, 0, implicit $mode, implicit $exec
-    BUFFER_STORE_DWORD_ADDR64 killed %21, %26, killed %16, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    BUFFER_STORE_DWORD_ADDR64 killed %21, %26, killed %16, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     S_ENDPGM 0
 
 ...
@@ -322,10 +322,10 @@ body:             |
     %26 = V_LSHL_B64_e64 killed %25, 2, implicit $exec
     %16 = REG_SEQUENCE killed %4, 17, %12, 18
     %18 = COPY %26
-    %17 = BUFFER_LOAD_DWORD_ADDR64 %26, killed %13, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    %17 = BUFFER_LOAD_DWORD_ADDR64 %26, killed %13, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     %20 = V_ADD_F32_e64 0, killed %17, 0, 1065353216, 0, 0, implicit $mode, implicit $exec
     %21 = V_ADD_F32_e64 0, killed %20, 0, killed %20, 0, 3, implicit $mode, implicit $exec
-    BUFFER_STORE_DWORD_ADDR64 killed %21, %26, killed %16, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    BUFFER_STORE_DWORD_ADDR64 killed %21, %26, killed %16, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     S_ENDPGM 0
 
 ...
@@ -387,10 +387,10 @@ body:             |
     %26 = V_LSHL_B64_e64 killed %25, 2, implicit $exec
     %16 = REG_SEQUENCE killed %4, 17, %12, 18
     %18 = COPY %26
-    %17 = BUFFER_LOAD_DWORD_ADDR64 %26, killed %13, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    %17 = BUFFER_LOAD_DWORD_ADDR64 %26, killed %13, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     %20 = V_ADD_F32_e64 0, killed %17, 0, 1065353216, 0, 0, implicit $mode, implicit $exec
     %21 = V_ADD_F32_e64 0, killed %20, 0, killed %20, 1, 0, implicit $mode, implicit $exec
-    BUFFER_STORE_DWORD_ADDR64 killed %21, %26, killed %16, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    BUFFER_STORE_DWORD_ADDR64 killed %21, %26, killed %16, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     S_ENDPGM 0
 
 ...

diff  --git a/llvm/test/CodeGen/AMDGPU/cluster-flat-loads-postra.mir b/llvm/test/CodeGen/AMDGPU/cluster-flat-loads-postra.mir
index 64023cdd525f..d9fc00c522ab 100644
--- a/llvm/test/CodeGen/AMDGPU/cluster-flat-loads-postra.mir
+++ b/llvm/test/CodeGen/AMDGPU/cluster-flat-loads-postra.mir
@@ -17,15 +17,15 @@ body:             |
 
     $vgpr0_vgpr1 = IMPLICIT_DEF
     $vgpr4_vgpr5 = IMPLICIT_DEF
-    $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4)
-    $vgpr4 = FLAT_LOAD_DWORD $vgpr4_vgpr5, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4)
+    $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4)
+    $vgpr4 = FLAT_LOAD_DWORD $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4)
     $vgpr2 = IMPLICIT_DEF
     $vgpr3 = IMPLICIT_DEF
     $vgpr6 = IMPLICIT_DEF
     $vgpr0 = V_ADD_CO_U32_e32 16, $vgpr2, implicit-def $vcc, implicit $exec
     $vgpr1 = V_ADDC_U32_e32 $vgpr3, killed $vgpr6, implicit-def dead $vcc, implicit $vcc, implicit $exec
-    FLAT_STORE_DWORD $vgpr2_vgpr3, killed $vgpr0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4)
-    FLAT_STORE_DWORD $vgpr0_vgpr1, killed $vgpr4, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4)
+    FLAT_STORE_DWORD $vgpr2_vgpr3, killed $vgpr0, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4)
+    FLAT_STORE_DWORD $vgpr0_vgpr1, killed $vgpr4, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4)
     S_ENDPGM 0
 
 ...

diff  --git a/llvm/test/CodeGen/AMDGPU/cluster-flat-loads.mir b/llvm/test/CodeGen/AMDGPU/cluster-flat-loads.mir
index 80a201bbfdd0..1e307518c956 100644
--- a/llvm/test/CodeGen/AMDGPU/cluster-flat-loads.mir
+++ b/llvm/test/CodeGen/AMDGPU/cluster-flat-loads.mir
@@ -14,7 +14,7 @@ registers:
 body:             |
   bb.0:
     %0 = IMPLICIT_DEF
-    %1 = FLAT_LOAD_DWORD %0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4)
+    %1 = FLAT_LOAD_DWORD %0, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4)
     %2 = V_ADD_F32_e64 0, killed %1, 0, 1, 0, 0, implicit $mode, implicit $exec
-    %3 = FLAT_LOAD_DWORD %0, 4, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4)
+    %3 = FLAT_LOAD_DWORD %0, 4, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4)
 ...

diff  --git a/llvm/test/CodeGen/AMDGPU/coalesce-vgpr-alignment.ll b/llvm/test/CodeGen/AMDGPU/coalesce-vgpr-alignment.ll
new file mode 100644
index 000000000000..ccac61d08f8d
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/coalesce-vgpr-alignment.ll
@@ -0,0 +1,38 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
+
+; Check that register coalescer does not create an odd subreg when register tuples
+; must be aligned.
+
+; GCN-LABEL: {{^}}test_odd_int4:
+; GCN:     global_load_dwordx4 v[{{[0-9]*[02468]:[0-9]*[13579]}}], v{{[0-9]+}}, s[{{[0-9:]+}}]
+; GCN-DAG: v_mov_b32_e32 v{{[0-9]*}}[[LO:[02468]]], v{{[0-9]+}}
+; GCN-DAG: v_mov_b32_e32 v{{[0-9]*}}[[HI:[13579]]], v{{[0-9]+}}
+; GCN:     global_store_dwordx2 v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]], s[{{[0-9:]+}}]
+
+define amdgpu_kernel void @test_odd_int4(<4 x i32> addrspace(1)* %arg, <2 x i32> addrspace(1)* %arg1) {
+bb:
+  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %gep1 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg, i32 %lid
+  %load = load <4 x i32>, <4 x i32> addrspace(1)* %gep1, align 16
+  %shuffle = shufflevector <4 x i32> %load, <4 x i32> undef, <2 x i32> <i32 1, i32 3>
+  %gep2 = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %arg1, i32 %lid
+  store <2 x i32> %shuffle, <2 x i32> addrspace(1)* %gep2, align 8
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_vector_creation:
+; GCN:     global_load_dwordx2 v[{{[0-9]*[02468]}}:{{[0-9]+}}],
+; GCN-DAG: v_mov_b32_e32 v{{[0-9]*}}[[LO:[02468]]], v{{[0-9]+}}
+; GCN-DAG: v_mov_b32_e32 v{{[0-9]*}}[[HI:[13579]]], v{{[0-9]+}}
+; GCN:     global_store_dwordx4 v[{{[0-9]*[02468]:[0-9]*[13579]}}], v[{{[0-9]*[02468]:[0-9]*[13579]}}]
+define amdgpu_kernel void @test_vector_creation() {
+entry:
+  %tmp231 = load <4 x i16>, <4 x i16> addrspace(1)* undef, align 2
+  %vext466 = shufflevector <4 x i16> %tmp231, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+  %vecinit467 = shufflevector <8 x i16> undef, <8 x i16> %vext466, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 10, i32 11, i32 undef, i32 undef>
+  %vecinit471 = shufflevector <8 x i16> %vecinit467, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+  store <8 x i16> %vecinit471, <8 x i16> addrspace(1)* undef, align 16
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x()

diff  --git a/llvm/test/CodeGen/AMDGPU/coalescer-extend-pruned-subrange.mir b/llvm/test/CodeGen/AMDGPU/coalescer-extend-pruned-subrange.mir
index 0a0928a51813..62100ae33eaa 100644
--- a/llvm/test/CodeGen/AMDGPU/coalescer-extend-pruned-subrange.mir
+++ b/llvm/test/CodeGen/AMDGPU/coalescer-extend-pruned-subrange.mir
@@ -30,7 +30,7 @@ body: |
     %14:vgpr_32 = V_AND_B32_e32 1, %13, implicit $exec
     %15:sreg_64_xexec = V_CMP_EQ_U32_e64 0, %14, implicit $exec
     %16:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, %15, implicit $exec
-    BUFFER_STORE_DWORD_OFFEN_exact %16, undef %17:vgpr_32, undef %18:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into constant-pool, align 1, addrspace 4)
+    BUFFER_STORE_DWORD_OFFEN_exact %16, undef %17:vgpr_32, undef %18:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into constant-pool, align 1, addrspace 4)
     S_ENDPGM 0
 
   bb.2:
@@ -78,7 +78,7 @@ body: |
 
   bb.8:
     successors: %bb.10
-    %31:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN undef %32:vgpr_32, undef %33:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from constant-pool, align 1, addrspace 4)
+    %31:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN undef %32:vgpr_32, undef %33:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from constant-pool, align 1, addrspace 4)
     %34:sreg_64_xexec = V_CMP_NE_U32_e64 0, %31, implicit $exec
     %35:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, -1, %34, implicit $exec
     %28:vgpr_32 = COPY %35

diff  --git a/llvm/test/CodeGen/AMDGPU/coalescer-subranges-another-copymi-not-live.mir b/llvm/test/CodeGen/AMDGPU/coalescer-subranges-another-copymi-not-live.mir
index f46199ec11b9..c8685717038a 100644
--- a/llvm/test/CodeGen/AMDGPU/coalescer-subranges-another-copymi-not-live.mir
+++ b/llvm/test/CodeGen/AMDGPU/coalescer-subranges-another-copymi-not-live.mir
@@ -83,7 +83,7 @@ body:             |
 
   bb.9:
     successors: %bb.10(0x80000000)
-    %19:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN killed %18, undef %20:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from constant-pool, align 1, addrspace 4)
+    %19:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN killed %18, undef %20:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from constant-pool, align 1, addrspace 4)
     %21:sreg_64 = V_CMP_NE_U32_e64 target-flags(amdgpu-gotprel) 0, killed %19.sub0, implicit $exec
     %22:sreg_64 = COPY $exec, implicit-def $exec
     %23:sreg_64 = S_AND_B64 %22, %21, implicit-def dead $scc
@@ -125,7 +125,7 @@ body:             |
     %27.sub5:sgpr_256 = COPY %26
     %27.sub6:sgpr_256 = COPY %26
     %27.sub7:sgpr_256 = COPY killed %26
-    %28:vgpr_32 = IMAGE_LOAD_V1_V4 killed %25, killed %27, 2, -1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from constant-pool, addrspace 4)
+    %28:vgpr_32 = IMAGE_LOAD_V1_V4 killed %25, killed %27, 2, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from constant-pool, addrspace 4)
     %29:vgpr_32 = nofpexcept V_ADD_F32_e32 0, killed %28, implicit $mode, implicit $exec
     $m0 = S_MOV_B32 -1
     DS_WRITE_B32 undef %30:vgpr_32, killed %29, 0, 0, implicit $m0, implicit $exec :: (store 4 into `i32 addrspace(3)* undef`, addrspace 3)

diff  --git a/llvm/test/CodeGen/AMDGPU/coalescer-subranges-another-prune-error.mir b/llvm/test/CodeGen/AMDGPU/coalescer-subranges-another-prune-error.mir
index afaba646e167..95e38af688c4 100644
--- a/llvm/test/CodeGen/AMDGPU/coalescer-subranges-another-prune-error.mir
+++ b/llvm/test/CodeGen/AMDGPU/coalescer-subranges-another-prune-error.mir
@@ -68,7 +68,7 @@ body:             |
     %23:vreg_128 = COPY killed %17
     %24:sreg_64 = COPY killed %16
     %25:vgpr_32 = V_OR_B32_e32 %22, %11, implicit $exec
-    %26:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN killed %25, undef %27:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from constant-pool, align 1, addrspace 4)
+    %26:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN killed %25, undef %27:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from constant-pool, align 1, addrspace 4)
     %28:vgpr_32 = V_LSHRREV_B32_e32 30, killed %26.sub0, implicit $exec
     %29:vreg_128 = COPY killed %21
     %29.sub0:vreg_128 = COPY %1
@@ -257,7 +257,7 @@ body:             |
     %109.sub5:sgpr_256 = COPY %108
     %109.sub6:sgpr_256 = COPY %108
     %109.sub7:sgpr_256 = COPY killed %108
-    %110:vgpr_32 = IMAGE_SAMPLE_V1_V2 killed %107, killed %109, undef %111:sgpr_128, 8, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from constant-pool, addrspace 4)
+    %110:vgpr_32 = IMAGE_SAMPLE_V1_V2 killed %107, killed %109, undef %111:sgpr_128, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from constant-pool, addrspace 4)
     %112:vgpr_32 = nofpexcept V_MUL_F32_e32 0, killed %110, implicit $mode, implicit $exec
     %113:vgpr_32 = nofpexcept V_MUL_F32_e32 0, killed %112, implicit $mode, implicit $exec
     %114:vgpr_32 = nofpexcept V_MAD_F32_e64 0, killed %113, 0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec

diff  --git a/llvm/test/CodeGen/AMDGPU/coalescer-subreg-join.mir b/llvm/test/CodeGen/AMDGPU/coalescer-subreg-join.mir
index 34c83f9bae49..5d432ab157a2 100644
--- a/llvm/test/CodeGen/AMDGPU/coalescer-subreg-join.mir
+++ b/llvm/test/CodeGen/AMDGPU/coalescer-subreg-join.mir
@@ -61,7 +61,7 @@ body:             |
     %11.sub6 = COPY %1
     %11.sub7 = COPY %1
     %11.sub8 = COPY %1
-    dead %18 = IMAGE_SAMPLE_C_D_O_V1_V16 %11, %3, %4, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (load 4)
+    dead %18 = IMAGE_SAMPLE_C_D_O_V1_V16 %11, %3, %4, 1, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (load 4)
     %20.sub1 = COPY %2
     %20.sub2 = COPY %2
     %20.sub3 = COPY %2
@@ -70,6 +70,6 @@ body:             |
     %20.sub6 = COPY %2
     %20.sub7 = COPY %2
     %20.sub8 = COPY %2
-    dead %27 = IMAGE_SAMPLE_C_D_O_V1_V16 %20, %5, %6, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (load 4)
+    dead %27 = IMAGE_SAMPLE_C_D_O_V1_V16 %20, %5, %6, 1, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (load 4)
 
 ...

diff  --git a/llvm/test/CodeGen/AMDGPU/coalescer-subregjoin-fullcopy.mir b/llvm/test/CodeGen/AMDGPU/coalescer-subregjoin-fullcopy.mir
index 2928d2ebe3e8..e50b0c835735 100644
--- a/llvm/test/CodeGen/AMDGPU/coalescer-subregjoin-fullcopy.mir
+++ b/llvm/test/CodeGen/AMDGPU/coalescer-subregjoin-fullcopy.mir
@@ -11,7 +11,7 @@
 #
 # GCN-LABEL: bb.6:
 # GCN: successors: %bb.7(0x{{[0-9]+}}), %bb.18(0x{{[0-9]+}})
-# GCN: %{{[0-9]+}}:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %{{[0-9]+}}, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+# GCN: %{{[0-9]+}}:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %{{[0-9]+}}, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
 #
 
 --- |
@@ -69,7 +69,7 @@ body: |
     %10:sreg_64 = COPY killed %5
     undef %11.sub2:sgpr_128 = COPY %4
     %11.sub3:sgpr_128 = COPY %3
-    %12:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET killed %11, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    %12:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET killed %11, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     undef %13.sub1:vreg_128 = COPY %9.sub1
     %13.sub2:vreg_128 = COPY %9.sub2
     %14:sreg_64 = nofpexcept V_CMP_GT_F32_e64 0, target-flags(amdgpu-rel32-lo) 0, 0, killed %12.sub3, 0, implicit $mode, implicit $exec
@@ -161,7 +161,7 @@ body: |
   bb.18:
     successors: %bb.7(0x80000000)
     dead %59:vgpr_32 = nofpexcept V_FMA_F32_e64 0, killed %9.sub2, 0, undef %60:vgpr_32, 0, undef %61:vgpr_32, 0, 0, implicit $mode, implicit $exec
-    dead %62:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN undef %63:vgpr_32, undef %64:sgpr_128, undef %65:sreg_32, 0, 0, 0, 0, 0, 0, implicit $exec
+    dead %62:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN undef %63:vgpr_32, undef %64:sgpr_128, undef %65:sreg_32, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     undef %66.sub1:vreg_128 = COPY %13.sub1
     %66.sub2:vreg_128 = COPY %13.sub2
     %67:sreg_64 = nofpexcept V_CMP_NGT_F32_e64 0, 0, 0, undef %68:vgpr_32, 0, implicit $mode, implicit $exec

diff  --git a/llvm/test/CodeGen/AMDGPU/coalescer-with-subregs-bad-identical.mir b/llvm/test/CodeGen/AMDGPU/coalescer-with-subregs-bad-identical.mir
index f0fdda9f087d..0eeb9d4dcc93 100644
--- a/llvm/test/CodeGen/AMDGPU/coalescer-with-subregs-bad-identical.mir
+++ b/llvm/test/CodeGen/AMDGPU/coalescer-with-subregs-bad-identical.mir
@@ -47,7 +47,7 @@ body: |
     %4.sub5:sgpr_256 = COPY %1
     %4.sub6:sgpr_256 = COPY %1
     %4.sub7:sgpr_256 = COPY killed %1
-    %5:vgpr_32 = IMAGE_LOAD_V1_V4 killed %3, killed %4, 1, -1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from constant-pool, addrspace 4)
+    %5:vgpr_32 = IMAGE_LOAD_V1_V4 killed %3, killed %4, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from constant-pool, addrspace 4)
     %6:vgpr_32 = nofpexcept V_MAD_F32_e64 0, killed %5, 0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
     %7:vgpr_32 = nofpexcept V_RCP_F32_e32 killed %6, implicit $mode, implicit $exec
     %8:vgpr_32 = nofpexcept V_MUL_F32_e32 0, killed %7, implicit $mode, implicit $exec
@@ -148,7 +148,7 @@ body: |
     %43:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM undef %44:sgpr_128, 12, 0, 0 :: (dereferenceable invariant load 4)
     %45:vgpr_32 = V_MUL_LO_I32_e64 killed %42, killed %43, implicit $exec
     %46:vgpr_32 = V_LSHLREV_B32_e32 2, killed %45, implicit $exec
-    %47:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN killed %46, undef %48:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from constant-pool, align 1, addrspace 4)
+    %47:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN killed %46, undef %48:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from constant-pool, align 1, addrspace 4)
     %49:sreg_64 = V_CMP_NE_U32_e64 0, killed %47, implicit $exec
     %50:sreg_64 = COPY $exec, implicit-def $exec
     %51:sreg_64 = S_AND_B64 %50, %49, implicit-def dead $scc

diff  --git a/llvm/test/CodeGen/AMDGPU/collapse-endcf2.mir b/llvm/test/CodeGen/AMDGPU/collapse-endcf2.mir
index 666dfd74c500..3569085bdd07 100644
--- a/llvm/test/CodeGen/AMDGPU/collapse-endcf2.mir
+++ b/llvm/test/CodeGen/AMDGPU/collapse-endcf2.mir
@@ -42,7 +42,7 @@ body:             |
   ; GCN:   %8.sub1:vreg_64, dead %10:sreg_64_xexec = V_ADDC_U32_e64 0, [[COPY3]], %9, 0, implicit $exec
   ; GCN:   %5.sub3:sgpr_128 = S_MOV_B32 61440
   ; GCN:   %5.sub2:sgpr_128 = S_MOV_B32 0
-  ; GCN:   BUFFER_STORE_DWORD_ADDR64 %6.sub1, %6, %5, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
+  ; GCN:   BUFFER_STORE_DWORD_ADDR64 %6.sub1, %6, %5, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
   ; GCN:   [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_NE_U32_e64 2, [[COPY1]], implicit $exec
   ; GCN:   [[COPY4:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec
   ; GCN:   [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY4]], [[V_CMP_NE_U32_e64_]], implicit-def dead $scc
@@ -54,7 +54,7 @@ body:             |
   ; GCN:   %5.sub0:sgpr_128 = COPY %5.sub2
   ; GCN:   %5.sub1:sgpr_128 = COPY %5.sub2
   ; GCN:   [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
-  ; GCN:   BUFFER_STORE_DWORD_ADDR64 [[V_MOV_B32_e32_]], %8, %5, 0, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
+  ; GCN:   BUFFER_STORE_DWORD_ADDR64 [[V_MOV_B32_e32_]], %8, %5, 0, 4, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
   ; GCN: bb.3:
   ; GCN:   successors: %bb.4(0x80000000)
   ; GCN:   $exec = S_OR_B64 $exec, [[COPY4]], implicit-def $scc
@@ -91,7 +91,7 @@ body:             |
     %8.sub1:vreg_64, dead %10:sreg_64_xexec = V_ADDC_U32_e64 0, %7, %9, 0, implicit $exec
     %5.sub3:sgpr_128 = S_MOV_B32 61440
     %5.sub2:sgpr_128 = S_MOV_B32 0
-    BUFFER_STORE_DWORD_ADDR64 %6.sub1, %6, %5, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
+    BUFFER_STORE_DWORD_ADDR64 %6.sub1, %6, %5, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
     %11:sreg_64 = V_CMP_NE_U32_e64 2, %0, implicit $exec
     %12:sreg_64 = COPY $exec, implicit-def $exec
     %13:sreg_64 = S_AND_B64 %12, %11, implicit-def dead $scc
@@ -103,7 +103,7 @@ body:             |
     %5.sub0:sgpr_128 = COPY %5.sub2
     %5.sub1:sgpr_128 = COPY %5.sub2
     %14:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
-    BUFFER_STORE_DWORD_ADDR64 %14, %8, %5, 0, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
+    BUFFER_STORE_DWORD_ADDR64 %14, %8, %5, 0, 4, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
 
   bb.3:
     $exec = S_OR_B64 $exec, %12,  implicit-def $scc

diff  --git a/llvm/test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir b/llvm/test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir
index 10f21e3efce0..154ea71b7113 100644
--- a/llvm/test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir
+++ b/llvm/test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir
@@ -37,7 +37,7 @@ body:             |
     %8:sreg_32_xm0 = S_MOV_B32 9999
     %9:sreg_32_xm0 = S_AND_B32 killed %7, killed %8, implicit-def dead $scc
     %10:vgpr_32 = COPY %9
-    BUFFER_STORE_DWORD_OFFSET killed %10, killed %6, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    BUFFER_STORE_DWORD_OFFSET killed %10, killed %6, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     S_ENDPGM 0
 
 ...
@@ -85,19 +85,19 @@ body:             |
     %16:vgpr_32 = V_MOV_B32_e32 63, implicit $exec
 
     %9:vgpr_32 = V_AND_B32_e64 %8, %6, implicit $exec
-    FLAT_STORE_DWORD %19, %9, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    FLAT_STORE_DWORD %19, %9, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
 
     %10:vgpr_32 = V_AND_B32_e64 %6, %8, implicit $exec
-    FLAT_STORE_DWORD %19, %10, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    FLAT_STORE_DWORD %19, %10, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
 
     %11:vgpr_32 = V_AND_B32_e32 %8, %6, implicit $exec
-    FLAT_STORE_DWORD %19, %11, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    FLAT_STORE_DWORD %19, %11, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
 
     %12:vgpr_32 = V_AND_B32_e64 %8, %8, implicit $exec
-    FLAT_STORE_DWORD %19, %12, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    FLAT_STORE_DWORD %19, %12, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
 
     %13:vgpr_32 = V_AND_B32_e64 %16, %16, implicit $exec
-    FLAT_STORE_DWORD %19, %13, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    FLAT_STORE_DWORD %19, %13, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
 
     S_ENDPGM 0
 
@@ -116,7 +116,6 @@ body:             |
   bb.0:
     liveins: $sgpr0_sgpr1
 
-
     %0:sgpr_64 = COPY $sgpr0_sgpr1
     %4:sreg_64_xexec = S_LOAD_DWORDX2_IMM %0, 36, 0, 0
     %5:sreg_32_xm0_xexec = S_MOV_B32 1
@@ -127,7 +126,7 @@ body:             |
     %10:sgpr_128 = REG_SEQUENCE killed %7, %subreg.hi16, killed %6, %subreg.lo16, killed %9, %subreg.sub0, killed %8, %subreg.sub0_sub1
     %12:sreg_32_xm0 = S_LSHL_B32 killed %5, 12, implicit-def dead $scc
     %13:vgpr_32 = COPY %12
-    BUFFER_STORE_DWORD_OFFSET killed %13, killed %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    BUFFER_STORE_DWORD_OFFSET killed %13, killed %10, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     S_ENDPGM 0
 
 ...
@@ -154,7 +153,7 @@ body:             |
     %8:sgpr_128 = REG_SEQUENCE killed %5, %subreg.hi16, killed %4, %subreg.lo16, killed %7, %subreg.sub0, killed %6, %subreg.sub0_sub1
     %10:sreg_32_xm0 = S_ASHR_I32 killed %3, 12, implicit-def dead $scc
     %11:vgpr_32 = COPY %10
-    BUFFER_STORE_DWORD_OFFSET killed %11, killed %8, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    BUFFER_STORE_DWORD_OFFSET killed %11, killed %8, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     S_ENDPGM 0
 
 ...
@@ -222,34 +221,34 @@ body:             |
     %32:vgpr_32 = V_MOV_B32_e32 2, implicit $exec
 
     %11:vgpr_32 = V_ASHRREV_I32_e64 8, %10, implicit $exec
-    FLAT_STORE_DWORD %20, %11, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    FLAT_STORE_DWORD %20, %11, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
 
     %12:vgpr_32 = V_ASHRREV_I32_e64 %8, %10, implicit $exec
-    FLAT_STORE_DWORD %20, %12, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    FLAT_STORE_DWORD %20, %12, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
 
     %13:vgpr_32 = V_ASHR_I32_e64 %7, 3, implicit $exec
-    FLAT_STORE_DWORD %20, %13, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    FLAT_STORE_DWORD %20, %13, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
 
     %14:vgpr_32 = V_ASHR_I32_e64 7, %29, implicit $exec
-    FLAT_STORE_DWORD %20, %14, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    FLAT_STORE_DWORD %20, %14, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
 
     %15:vgpr_32 = V_ASHR_I32_e64 %27, %24, implicit $exec
-    FLAT_STORE_DWORD %20, %15, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    FLAT_STORE_DWORD %20, %15, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
 
     %22:vgpr_32 = V_ASHR_I32_e64 %6, 4, implicit $exec
-    FLAT_STORE_DWORD %20, %22, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    FLAT_STORE_DWORD %20, %22, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
 
     %23:vgpr_32 = V_ASHR_I32_e64 %6, %30, implicit $exec
-    FLAT_STORE_DWORD %20, %23, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    FLAT_STORE_DWORD %20, %23, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
 
     %25:vgpr_32 = V_ASHR_I32_e32 %31, %31, implicit $exec
-    FLAT_STORE_DWORD %20, %25, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    FLAT_STORE_DWORD %20, %25, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
 
     %26:vgpr_32 = V_ASHRREV_I32_e32 11, %10, implicit $exec
-    FLAT_STORE_DWORD %20, %26, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    FLAT_STORE_DWORD %20, %26, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
 
     %28:vgpr_32 = V_ASHR_I32_e32 %27, %32, implicit $exec
-    FLAT_STORE_DWORD %20, %28, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    FLAT_STORE_DWORD %20, %28, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
 
     S_ENDPGM 0
 
@@ -277,7 +276,7 @@ body:             |
     %8:sgpr_128 = REG_SEQUENCE killed %5, %subreg.hi16, killed %4, %subreg.lo16, killed %7, %subreg.sub0, killed %6, %subreg.sub0_sub1
     %10:sreg_32_xm0 = S_LSHR_B32 killed %3, 12, implicit-def dead $scc
     %11:vgpr_32 = COPY %10
-    BUFFER_STORE_DWORD_OFFSET killed %11, killed %8, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    BUFFER_STORE_DWORD_OFFSET killed %11, killed %8, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     S_ENDPGM 0
 
 ...
@@ -346,34 +345,34 @@ body:             |
     %32:vgpr_32 = V_MOV_B32_e32 2, implicit $exec
 
     %11:vgpr_32 = V_LSHRREV_B32_e64 8, %10, implicit $exec
-    FLAT_STORE_DWORD %20, %11, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    FLAT_STORE_DWORD %20, %11, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
 
     %12:vgpr_32 = V_LSHRREV_B32_e64 %8, %10, implicit $exec
-    FLAT_STORE_DWORD %20, %12, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    FLAT_STORE_DWORD %20, %12, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
 
     %13:vgpr_32 = V_LSHR_B32_e64 %7, 3, implicit $exec
-    FLAT_STORE_DWORD %20, %13, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    FLAT_STORE_DWORD %20, %13, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
 
     %14:vgpr_32 = V_LSHR_B32_e64 7, %29, implicit $exec
-    FLAT_STORE_DWORD %20, %14, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    FLAT_STORE_DWORD %20, %14, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
 
     %15:vgpr_32 = V_LSHR_B32_e64 %27, %24, implicit $exec
-    FLAT_STORE_DWORD %20, %15, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    FLAT_STORE_DWORD %20, %15, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
 
     %22:vgpr_32 = V_LSHR_B32_e64 %6, 4, implicit $exec
-    FLAT_STORE_DWORD %20, %22, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    FLAT_STORE_DWORD %20, %22, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
 
     %23:vgpr_32 = V_LSHR_B32_e64 %6, %30, implicit $exec
-    FLAT_STORE_DWORD %20, %23, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    FLAT_STORE_DWORD %20, %23, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
 
     %25:vgpr_32 = V_LSHR_B32_e32 %31, %31, implicit $exec
-    FLAT_STORE_DWORD %20, %25, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    FLAT_STORE_DWORD %20, %25, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
 
     %26:vgpr_32 = V_LSHRREV_B32_e32 11, %10, implicit $exec
-    FLAT_STORE_DWORD %20, %26, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    FLAT_STORE_DWORD %20, %26, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
 
     %28:vgpr_32 = V_LSHR_B32_e32 %27, %32, implicit $exec
-    FLAT_STORE_DWORD %20, %28, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    FLAT_STORE_DWORD %20, %28, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
     S_ENDPGM 0
 
 ...
@@ -391,7 +390,7 @@ body:             |
   bb.0:
     %0:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
     %2:vgpr_32 = V_XOR_B32_e64 killed %0, undef %1:vgpr_32, implicit $exec
-    FLAT_STORE_DWORD undef %3:vreg_64, %2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    FLAT_STORE_DWORD undef %3:vreg_64, %2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
     S_ENDPGM 0
 
 ...
@@ -445,7 +444,7 @@ body:             |
     %8:sreg_32_xm0 = S_MOV_B32 9999
     %9:sreg_32_xm0 = S_ANDN2_B32 killed %7, killed %8, implicit-def dead $scc
     %10:vgpr_32 = COPY %9
-    BUFFER_STORE_DWORD_OFFSET killed %10, killed %6, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    BUFFER_STORE_DWORD_OFFSET killed %10, killed %6, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     S_ENDPGM 0
 
 ...
@@ -478,7 +477,7 @@ body:             |
     %8:sreg_32_xm0 = S_MOV_B32 9999
     %9:sreg_32_xm0 = S_OR_B32 killed %7, killed %8, implicit-def dead $scc
     %10:vgpr_32 = COPY %9
-    BUFFER_STORE_DWORD_OFFSET killed %10, killed %6, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    BUFFER_STORE_DWORD_OFFSET killed %10, killed %6, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     S_ENDPGM 0
 
 ...
@@ -530,15 +529,15 @@ body:             |
     %8:sreg_32_xm0 = S_MOV_B32 1234567
     %16:vgpr_32 = V_MOV_B32_e32 63, implicit $exec
     %9:vgpr_32 = V_OR_B32_e64 %8, %6, implicit $exec
-    FLAT_STORE_DWORD %19, %9, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    FLAT_STORE_DWORD %19, %9, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
     %10:vgpr_32 = V_OR_B32_e64 %6, %8, implicit $exec
-    FLAT_STORE_DWORD %19, %10, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    FLAT_STORE_DWORD %19, %10, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
     %11:vgpr_32 = V_OR_B32_e32 %8, %6, implicit $exec
-    FLAT_STORE_DWORD %19, %11, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    FLAT_STORE_DWORD %19, %11, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
     %12:vgpr_32 = V_OR_B32_e64 %8, %8, implicit $exec
-    FLAT_STORE_DWORD %19, %12, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    FLAT_STORE_DWORD %19, %12, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
     %13:vgpr_32 = V_OR_B32_e64 %16, %16, implicit $exec
-    FLAT_STORE_DWORD %19, %13, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    FLAT_STORE_DWORD %19, %13, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
     S_ENDPGM 0
 
 ...
@@ -571,7 +570,7 @@ body:             |
     %8:sreg_32_xm0 = S_MOV_B32 9999
     %9:sreg_32_xm0 = S_ORN2_B32 killed %7, killed %8, implicit-def dead $scc
     %10:vgpr_32 = COPY %9
-    BUFFER_STORE_DWORD_OFFSET killed %10, killed %6, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    BUFFER_STORE_DWORD_OFFSET killed %10, killed %6, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     S_ENDPGM 0
 
 ...
@@ -604,7 +603,7 @@ body:             |
     %8:sreg_32_xm0 = S_MOV_B32 9999
     %9:sreg_32_xm0 = S_NAND_B32 killed %7, killed %8, implicit-def dead $scc
     %10:vgpr_32 = COPY %9
-    BUFFER_STORE_DWORD_OFFSET killed %10, killed %6, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    BUFFER_STORE_DWORD_OFFSET killed %10, killed %6, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     S_ENDPGM 0
 
 ...
@@ -637,7 +636,7 @@ body:             |
     %8:sreg_32_xm0 = S_MOV_B32 9999
     %9:sreg_32_xm0 = S_NOR_B32 killed %7, killed %8, implicit-def dead $scc
     %10:vgpr_32 = COPY %9
-    BUFFER_STORE_DWORD_OFFSET killed %10, killed %6, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    BUFFER_STORE_DWORD_OFFSET killed %10, killed %6, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     S_ENDPGM 0
 
 ...
@@ -670,7 +669,7 @@ body:             |
     %8:sreg_32_xm0 = S_MOV_B32 9999
     %9:sreg_32_xm0 = S_XNOR_B32 killed %7, killed %8, implicit-def dead $scc
     %10:vgpr_32 = COPY %9
-    BUFFER_STORE_DWORD_OFFSET killed %10, killed %6, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    BUFFER_STORE_DWORD_OFFSET killed %10, killed %6, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     S_ENDPGM 0
 
 ...
@@ -739,25 +738,25 @@ body:             |
     %7:sreg_32_xm0 = S_MOV_B32 1
     %27:sreg_32_xm0 = S_MOV_B32 -4
     %11:vgpr_32 = V_LSHLREV_B32_e64 12, %10, implicit $exec
-    FLAT_STORE_DWORD %20, %11, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    FLAT_STORE_DWORD %20, %11, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
     %12:vgpr_32 = V_LSHLREV_B32_e64 %7, 12, implicit $exec
-    FLAT_STORE_DWORD %20, %12, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    FLAT_STORE_DWORD %20, %12, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
     %13:vgpr_32 = V_LSHL_B32_e64 %7, 12, implicit $exec
-    FLAT_STORE_DWORD %20, %13, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    FLAT_STORE_DWORD %20, %13, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
     %14:vgpr_32 = V_LSHL_B32_e64 12, %7, implicit $exec
-    FLAT_STORE_DWORD %20, %14, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    FLAT_STORE_DWORD %20, %14, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
     %15:vgpr_32 = V_LSHL_B32_e64 12, %24, implicit $exec
-    FLAT_STORE_DWORD %20, %15, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    FLAT_STORE_DWORD %20, %15, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
     %22:vgpr_32 = V_LSHL_B32_e64 %6, 12, implicit $exec
-    FLAT_STORE_DWORD %20, %22, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    FLAT_STORE_DWORD %20, %22, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
     %23:vgpr_32 = V_LSHL_B32_e64 %6, 32, implicit $exec
-    FLAT_STORE_DWORD %20, %23, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    FLAT_STORE_DWORD %20, %23, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
     %25:vgpr_32 = V_LSHL_B32_e32 %6, %6, implicit $exec
-    FLAT_STORE_DWORD %20, %25, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    FLAT_STORE_DWORD %20, %25, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
     %26:vgpr_32 = V_LSHLREV_B32_e32 11, %24, implicit $exec
-    FLAT_STORE_DWORD %20, %26, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    FLAT_STORE_DWORD %20, %26, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
     %28:vgpr_32 = V_LSHL_B32_e32 %27, %6, implicit $exec
-    FLAT_STORE_DWORD %20, %28, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    FLAT_STORE_DWORD %20, %28, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
     S_ENDPGM 0
 
 ...

diff  --git a/llvm/test/CodeGen/AMDGPU/copy_phys_vgpr64.mir b/llvm/test/CodeGen/AMDGPU/copy_phys_vgpr64.mir
new file mode 100644
index 000000000000..462a0127a821
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/copy_phys_vgpr64.mir
@@ -0,0 +1,160 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -march=amdgcn -mcpu=gfx908 -run-pass postrapseudos -verify-machineinstrs %s -o - | FileCheck -check-prefix=GFX908 %s
+# RUN: llc -march=amdgcn -mcpu=gfx90a -run-pass postrapseudos -verify-machineinstrs %s -o - | FileCheck -check-prefix=GFX90A %s
+
+---
+name: copy_v64_to_v64
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr2_vgpr3
+    ; GFX908-LABEL: name: copy_v64_to_v64
+    ; GFX908: liveins: $vgpr2_vgpr3
+    ; GFX908: $vgpr0 = V_MOV_B32_e32 $vgpr2, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr2_vgpr3
+    ; GFX908: $vgpr1 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit killed $vgpr2_vgpr3, implicit $exec
+    ; GFX90A-LABEL: name: copy_v64_to_v64
+    ; GFX90A: liveins: $vgpr2_vgpr3
+    ; GFX90A: $vgpr0_vgpr1 = V_PK_MOV_B32 8, $vgpr2_vgpr3, 12, $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit killed $vgpr2_vgpr3, implicit $exec
+    $vgpr0_vgpr1 = COPY killed $vgpr2_vgpr3, implicit $exec
+...
+
+---
+name: copy_s64_to_v64
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $sgpr2_sgpr3
+    ; GFX908-LABEL: name: copy_s64_to_v64
+    ; GFX908: liveins: $sgpr2_sgpr3
+    ; GFX908: $vgpr0 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr2_sgpr3
+    ; GFX908: $vgpr1 = V_MOV_B32_e32 $sgpr3, implicit $exec, implicit killed $sgpr2_sgpr3, implicit $exec
+    ; GFX90A-LABEL: name: copy_s64_to_v64
+    ; GFX90A: liveins: $sgpr2_sgpr3
+    ; GFX90A: $vgpr0_vgpr1 = V_PK_MOV_B32 8, $sgpr2_sgpr3, 12, $sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit killed $sgpr2_sgpr3, implicit $exec
+    $vgpr0_vgpr1 = COPY killed $sgpr2_sgpr3, implicit $exec
+...
+
+---
+name: copy_a64_to_v64
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $agpr2_agpr3
+    ; GFX908-LABEL: name: copy_a64_to_v64
+    ; GFX908: liveins: $agpr2_agpr3
+    ; GFX908: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $agpr2_agpr3
+    ; GFX908: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit killed $agpr2_agpr3, implicit $exec
+    ; GFX90A-LABEL: name: copy_a64_to_v64
+    ; GFX90A: liveins: $agpr2_agpr3
+    ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $agpr2_agpr3
+    ; GFX90A: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit killed $agpr2_agpr3, implicit $exec
+    $vgpr0_vgpr1 = COPY killed $agpr2_agpr3, implicit $exec
+...
+
+---
+name: copy_v128_to_v128_fwd
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr2_vgpr3_vgpr4_vgpr5
+    ; GFX908-LABEL: name: copy_v128_to_v128_fwd
+    ; GFX908: liveins: $vgpr2_vgpr3_vgpr4_vgpr5
+    ; GFX908: $vgpr0 = V_MOV_B32_e32 $vgpr2, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr2_vgpr3_vgpr4_vgpr5
+    ; GFX908: $vgpr1 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit $vgpr2_vgpr3_vgpr4_vgpr5
+    ; GFX908: $vgpr2 = V_MOV_B32_e32 $vgpr4, implicit $exec, implicit $vgpr2_vgpr3_vgpr4_vgpr5
+    ; GFX908: $vgpr3 = V_MOV_B32_e32 $vgpr5, implicit $exec, implicit $vgpr2_vgpr3_vgpr4_vgpr5, implicit $exec
+    ; GFX90A-LABEL: name: copy_v128_to_v128_fwd
+    ; GFX90A: liveins: $vgpr2_vgpr3_vgpr4_vgpr5
+    ; GFX90A: $vgpr0_vgpr1 = V_PK_MOV_B32 8, $vgpr2_vgpr3, 12, $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr2_vgpr3_vgpr4_vgpr5, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3
+    ; GFX90A: $vgpr2_vgpr3 = V_PK_MOV_B32 8, $vgpr4_vgpr5, 12, $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr2_vgpr3_vgpr4_vgpr5, implicit $exec
+    $vgpr0_vgpr1_vgpr2_vgpr3 = COPY killed $vgpr2_vgpr3_vgpr4_vgpr5, implicit $exec
+...
+
+---
+name: copy_v128_to_v128_back
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1_vgpr2_vgpr3
+    ; GFX908-LABEL: name: copy_v128_to_v128_back
+    ; GFX908: liveins: $vgpr0_vgpr1_vgpr2_vgpr3
+    ; GFX908: $vgpr5 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit-def $vgpr2_vgpr3_vgpr4_vgpr5, implicit $vgpr0_vgpr1_vgpr2_vgpr3
+    ; GFX908: $vgpr4 = V_MOV_B32_e32 $vgpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3
+    ; GFX908: $vgpr3 = V_MOV_B32_e32 $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3
+    ; GFX908: $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec
+    ; GFX90A-LABEL: name: copy_v128_to_v128_back
+    ; GFX90A: liveins: $vgpr0_vgpr1_vgpr2_vgpr3
+    ; GFX90A: $vgpr4_vgpr5 = V_PK_MOV_B32 8, $vgpr2_vgpr3, 12, $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3, implicit-def $vgpr2_vgpr3_vgpr4_vgpr5
+    ; GFX90A: $vgpr2_vgpr3 = V_PK_MOV_B32 8, $vgpr0_vgpr1, 12, $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec
+    $vgpr2_vgpr3_vgpr4_vgpr5 = COPY killed $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec
+...
+
+---
+name: copy_v96_to_v96
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr4_vgpr5_vgpr6
+    ; GFX908-LABEL: name: copy_v96_to_v96
+    ; GFX908: liveins: $vgpr4_vgpr5_vgpr6
+    ; GFX908: $vgpr0 = V_MOV_B32_e32 $vgpr4, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2, implicit $vgpr4_vgpr5_vgpr6
+    ; GFX908: $vgpr1 = V_MOV_B32_e32 $vgpr5, implicit $exec, implicit $vgpr4_vgpr5_vgpr6
+    ; GFX908: $vgpr2 = V_MOV_B32_e32 $vgpr6, implicit $exec, implicit killed $vgpr4_vgpr5_vgpr6, implicit $exec
+    ; GFX90A-LABEL: name: copy_v96_to_v96
+    ; GFX90A: liveins: $vgpr4_vgpr5_vgpr6
+    ; GFX90A: $vgpr0 = V_MOV_B32_e32 $vgpr4, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2, implicit $vgpr4_vgpr5_vgpr6
+    ; GFX90A: $vgpr1 = V_MOV_B32_e32 $vgpr5, implicit $exec, implicit $vgpr4_vgpr5_vgpr6
+    ; GFX90A: $vgpr2 = V_MOV_B32_e32 $vgpr6, implicit $exec, implicit killed $vgpr4_vgpr5_vgpr6, implicit $exec
+    $vgpr0_vgpr1_vgpr2 = COPY killed $vgpr4_vgpr5_vgpr6, implicit $exec
+...
+
+---
+name: copy_v64_to_v64_undef_sub0
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr3
+    ; GFX908-LABEL: name: copy_v64_to_v64_undef_sub0
+    ; GFX908: liveins: $vgpr3
+    ; GFX908: $vgpr0 = V_MOV_B32_e32 $vgpr2, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr2_vgpr3
+    ; GFX908: $vgpr1 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit killed $vgpr2_vgpr3, implicit $exec
+    ; GFX90A-LABEL: name: copy_v64_to_v64_undef_sub0
+    ; GFX90A: liveins: $vgpr3
+    ; GFX90A: $vgpr0_vgpr1 = V_PK_MOV_B32 8, $vgpr2_vgpr3, 12, $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit killed $vgpr2_vgpr3, implicit $exec
+    $vgpr0_vgpr1 = COPY killed $vgpr2_vgpr3, implicit $exec
+...
+
+---
+name: copy_v64_to_v64_undef_sub1
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr2
+    ; GFX908-LABEL: name: copy_v64_to_v64_undef_sub1
+    ; GFX908: liveins: $vgpr2
+    ; GFX908: $vgpr0 = V_MOV_B32_e32 $vgpr2, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr2_vgpr3
+    ; GFX908: $vgpr1 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit killed $vgpr2_vgpr3, implicit $exec
+    ; GFX90A-LABEL: name: copy_v64_to_v64_undef_sub1
+    ; GFX90A: liveins: $vgpr2
+    ; GFX90A: $vgpr0_vgpr1 = V_PK_MOV_B32 8, $vgpr2_vgpr3, 12, $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit killed $vgpr2_vgpr3, implicit $exec
+    $vgpr0_vgpr1 = COPY killed $vgpr2_vgpr3, implicit $exec
+...
+
+---
+name: copy_s128_to_v128_killed
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $sgpr4_sgpr5_sgpr6_sgpr7
+    ; GFX908-LABEL: name: copy_s128_to_v128_killed
+    ; GFX908: liveins: $sgpr4_sgpr5_sgpr6_sgpr7
+    ; GFX908: $vgpr0 = V_MOV_B32_e32 $sgpr4, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr4_sgpr5_sgpr6_sgpr7
+    ; GFX908: $vgpr1 = V_MOV_B32_e32 $sgpr5, implicit $exec, implicit $sgpr4_sgpr5_sgpr6_sgpr7
+    ; GFX908: $vgpr2 = V_MOV_B32_e32 $sgpr6, implicit $exec, implicit $sgpr4_sgpr5_sgpr6_sgpr7
+    ; GFX908: $vgpr3 = V_MOV_B32_e32 $sgpr7, implicit $exec, implicit killed $sgpr4_sgpr5_sgpr6_sgpr7
+    ; GFX90A-LABEL: name: copy_s128_to_v128_killed
+    ; GFX90A: liveins: $sgpr4_sgpr5_sgpr6_sgpr7
+    ; GFX90A: $vgpr0_vgpr1 = V_PK_MOV_B32 8, $sgpr4_sgpr5, 12, $sgpr4_sgpr5, 0, 0, 0, 0, 0, implicit $exec, implicit $sgpr4_sgpr5_sgpr6_sgpr7, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3
+    ; GFX90A: $vgpr2_vgpr3 = V_PK_MOV_B32 8, $sgpr6_sgpr7, 12, $sgpr6_sgpr7, 0, 0, 0, 0, 0, implicit $exec, implicit killed $sgpr4_sgpr5_sgpr6_sgpr7
+    $vgpr0_vgpr1_vgpr2_vgpr3 = COPY killed $sgpr4_sgpr5_sgpr6_sgpr7
+...

diff  --git a/llvm/test/CodeGen/AMDGPU/couldnt-join-subrange-3.mir b/llvm/test/CodeGen/AMDGPU/couldnt-join-subrange-3.mir
index 9880a8ffc531..0a8b3d89c817 100644
--- a/llvm/test/CodeGen/AMDGPU/couldnt-join-subrange-3.mir
+++ b/llvm/test/CodeGen/AMDGPU/couldnt-join-subrange-3.mir
@@ -37,7 +37,7 @@ body:             |
   ; GCN:   S_BRANCH %bb.3
   ; GCN: bb.3:
   ; GCN:   successors: %bb.4(0x80000000)
-  ; GCN:   dead %16:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY]].sub3, undef %17:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from constant-pool, align 1, addrspace 4)
+  ; GCN:   dead %16:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY]].sub3, undef %17:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from constant-pool, align 1, addrspace 4)
   ; GCN:   dead %18:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
   ; GCN:   [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 $exec, -1, implicit-def dead $scc
   ; GCN:   dead %20:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
@@ -89,7 +89,7 @@ body:             |
     S_BRANCH %bb.3
 
   bb.3:
-    dead %22:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN killed %53.sub3, undef %24:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from constant-pool, align 1, addrspace 4)
+    dead %22:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN killed %53.sub3, undef %24:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from constant-pool, align 1, addrspace 4)
     dead %60:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
     %36:sreg_64 = S_AND_B64 $exec, -1, implicit-def dead $scc
     dead %67:vgpr_32 = V_MOV_B32_e32 0, implicit $exec

diff  --git a/llvm/test/CodeGen/AMDGPU/dbg-value-ends-sched-region.mir b/llvm/test/CodeGen/AMDGPU/dbg-value-ends-sched-region.mir
index 9c0f34ec4c53..a4c8fdfa77df 100644
--- a/llvm/test/CodeGen/AMDGPU/dbg-value-ends-sched-region.mir
+++ b/llvm/test/CodeGen/AMDGPU/dbg-value-ends-sched-region.mir
@@ -66,9 +66,9 @@ body:             |
   ; CHECK:   dead %16:vgpr_32 = COPY %11.sub0
   ; CHECK:   undef %17.sub0:vreg_64, %18:sreg_64_xexec = V_ADD_CO_U32_e64 [[DEF4]].sub0, [[DEF6]].sub0, 0, implicit $exec
   ; CHECK:   dead undef %17.sub1:vreg_64, dead %19:sreg_64_xexec = V_ADDC_U32_e64 [[DEF4]].sub1, [[DEF6]].sub1, %18, 0, implicit $exec
-  ; CHECK:   [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[DEF1]], 0, 0, 0, 0, implicit $exec :: (load 8, addrspace 1)
+  ; CHECK:   [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[DEF1]], 0, 0, 0, 0, 0, implicit $exec :: (load 8, addrspace 1)
   ; CHECK:   dead %20:sreg_64 = V_CMP_GT_I32_e64 4, [[DEF7]], implicit $exec
-  ; CHECK:   GLOBAL_STORE_DWORDX2 [[COPY]], [[DEF8]], 288, 0, 0, 0, implicit $exec :: (store 8, addrspace 1)
+  ; CHECK:   GLOBAL_STORE_DWORDX2 [[COPY]], [[DEF8]], 288, 0, 0, 0, 0, implicit $exec :: (store 8, addrspace 1)
   ; CHECK: bb.2:
   ; CHECK:   successors: %bb.3(0x80000000)
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY2]]
@@ -79,8 +79,8 @@ body:             |
   ; CHECK: bb.4:
   ; CHECK:   successors: %bb.5(0x80000000)
   ; CHECK:   dead %21:sreg_64 = COPY $exec
-  ; CHECK:   dead %22:vreg_128 = GLOBAL_LOAD_DWORDX4 [[COPY1]], 0, 0, 0, 0, implicit $exec :: (load 16, addrspace 1)
-  ; CHECK:   DBG_VALUE %22,
+  ; CHECK:   dead %22:vreg_128 = GLOBAL_LOAD_DWORDX4 [[COPY1]], 0, 0, 0, 0, 0, implicit $exec :: (load 16, addrspace 1)
+  ; CHECK:   DBG_VALUE %22, $noreg, <0x{{[0-9a-f]+}}>, !DIExpression(DW_OP_constu, 1, DW_OP_swap, DW_OP_xderef), debug-location !DILocation(line: 0, scope: <0x{{[0-9a-f]+}}>)
   ; CHECK: bb.5:
   ; CHECK:   successors: %bb.3(0x40000000), %bb.1(0x40000000)
   ; CHECK:   S_CBRANCH_EXECZ %bb.3, implicit $exec
@@ -109,9 +109,9 @@ body:             |
     dead %16:vgpr_32 = COPY %11.sub0
     undef %17.sub0:vreg_64, %18:sreg_64_xexec = V_ADD_CO_U32_e64 %6.sub0, %8.sub0, 0, implicit $exec
     dead %17.sub1:vreg_64, dead %19:sreg_64_xexec = V_ADDC_U32_e64 %6.sub1, %8.sub1, %18, 0, implicit $exec
-    %6:vreg_64 = GLOBAL_LOAD_DWORDX2 %3, 0, 0, 0, 0, implicit $exec :: (load 8, addrspace 1)
+    %6:vreg_64 = GLOBAL_LOAD_DWORDX2 %3, 0, 0, 0, 0, 0, implicit $exec :: (load 8, addrspace 1)
     dead %20:sreg_64 = V_CMP_GT_I32_e64 4, %9, implicit $exec
-    GLOBAL_STORE_DWORDX2 %0, %10, 288, 0, 0, 0, implicit $exec :: (store 8, addrspace 1)
+    GLOBAL_STORE_DWORDX2 %0, %10, 288, 0, 0, 0, 0, implicit $exec :: (store 8, addrspace 1)
 
   bb.2:
     %5:vgpr_32 = COPY %13
@@ -122,7 +122,7 @@ body:             |
 
   bb.4:
     dead %21:sreg_64 = COPY $exec
-    %22:vreg_128 = GLOBAL_LOAD_DWORDX4 %1, 0, 0, 0, 0, implicit $exec :: (load 16, addrspace 1)
+    %22:vreg_128 = GLOBAL_LOAD_DWORDX4 %1, 0, 0, 0, 0, 0, implicit $exec :: (load 16, addrspace 1)
     DBG_VALUE %22, $noreg, !16, !DIExpression(DW_OP_constu, 1, DW_OP_swap, DW_OP_xderef), debug-location !21
 
   bb.5:

diff  --git a/llvm/test/CodeGen/AMDGPU/dead-lane.mir b/llvm/test/CodeGen/AMDGPU/dead-lane.mir
index 8e95009e72c6..b91076112a0e 100644
--- a/llvm/test/CodeGen/AMDGPU/dead-lane.mir
+++ b/llvm/test/CodeGen/AMDGPU/dead-lane.mir
@@ -12,7 +12,7 @@ body:             |
     %1:vgpr_32 = nofpexcept V_MAC_F32_e32 undef %0:vgpr_32, undef %0:vgpr_32, undef %0:vgpr_32, implicit $mode, implicit $exec
     %2:vgpr_32 = nofpexcept V_MAC_F32_e32 undef %0:vgpr_32, undef %0:vgpr_32, undef %0:vgpr_32, implicit $mode, implicit $exec
     %3:vreg_64 = REG_SEQUENCE %1:vgpr_32, %subreg.sub0, %2:vgpr_32, %subreg.sub1
-    FLAT_STORE_DWORD undef %4:vreg_64, %3.sub0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    FLAT_STORE_DWORD undef %4:vreg_64, %3.sub0, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
     S_ENDPGM 0
 
 ...

diff  --git a/llvm/test/CodeGen/AMDGPU/dead_copy.mir b/llvm/test/CodeGen/AMDGPU/dead_copy.mir
index a8b56864e85f..ebfd720a9dcf 100644
--- a/llvm/test/CodeGen/AMDGPU/dead_copy.mir
+++ b/llvm/test/CodeGen/AMDGPU/dead_copy.mir
@@ -23,5 +23,5 @@ body:    |
     $vgpr10 = COPY killed $sgpr14, implicit $exec
     $vgpr11 = COPY killed $sgpr15, implicit $exec
 
-    FLAT_STORE_DWORDX4 $vgpr10_vgpr11, $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    FLAT_STORE_DWORDX4 $vgpr10_vgpr11, $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
 ...

diff  --git a/llvm/test/CodeGen/AMDGPU/debug-value-scheduler-crash.mir b/llvm/test/CodeGen/AMDGPU/debug-value-scheduler-crash.mir
index 4d7c4a0ea6f3..e1d0b38465c5 100644
--- a/llvm/test/CodeGen/AMDGPU/debug-value-scheduler-crash.mir
+++ b/llvm/test/CodeGen/AMDGPU/debug-value-scheduler-crash.mir
@@ -71,7 +71,7 @@ body:             |
   ; CHECK:   dead %26:vgpr_32 = nofpexcept V_MAD_F32_e64 0, %25, 0, [[DEF4]], 0, [[DEF1]], 0, 0, implicit $mode, implicit $exec
   ; CHECK:   dead %27:vgpr_32 = nofpexcept V_MAD_F32_e64 0, %25, 0, [[DEF5]], 0, [[DEF2]], 0, 0, implicit $mode, implicit $exec
   ; CHECK:   dead %28:vgpr_32 = nofpexcept V_MAD_F32_e64 0, %25, 0, [[DEF6]], 0, [[DEF3]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK:   GLOBAL_STORE_DWORD [[DEF]], [[DEF10]], 0, 0, 0, 0, implicit $exec
+  ; CHECK:   GLOBAL_STORE_DWORD [[DEF]], [[DEF10]], 0, 0, 0, 0, 0, implicit $exec
   ; CHECK:   S_ENDPGM 0
   bb.0:
     successors: %bb.1
@@ -129,7 +129,7 @@ body:             |
     %26:vgpr_32 = nofpexcept V_MAD_F32_e64 0, %25, 0, %4, 0, %1, 0, 0, implicit $mode, implicit $exec
     %27:vgpr_32 = nofpexcept V_MAD_F32_e64 0, %25, 0, %5, 0, %2, 0, 0, implicit $mode, implicit $exec
     %28:vgpr_32 = nofpexcept V_MAD_F32_e64 0, %25, 0, %6, 0, %3, 0, 0, implicit $mode, implicit $exec
-    GLOBAL_STORE_DWORD %0, %11, 0, 0, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD %0, %11, 0, 0, 0, 0, 0, implicit $exec
     S_ENDPGM 0
 
 ...

diff  --git a/llvm/test/CodeGen/AMDGPU/dpp64_combine.ll b/llvm/test/CodeGen/AMDGPU/dpp64_combine.ll
new file mode 100644
index 000000000000..c8968d670877
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/dpp64_combine.ll
@@ -0,0 +1,74 @@
+; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,DPP64,GFX90A
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,DPP32,GFX10
+
+; GCN-LABEL: {{^}}dpp64_ceil:
+; GCN:           global_load_dwordx2 [[V:v\[[0-9:]+\]]],
+; DPP64:         v_ceil_f64_dpp [[V]], [[V]] row_newbcast:1 row_mask:0xf bank_mask:0xf bound_ctrl:0{{$}}
+; DPP32-COUNT-2: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} row_share:1 row_mask:0xf bank_mask:0xf bound_ctrl:0{{$}}
+define amdgpu_kernel void @dpp64_ceil(i64 addrspace(1)* %arg, i64 %in1) {
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr inbounds i64, i64 addrspace(1)* %arg, i32 %id
+  %load = load i64, i64 addrspace(1)* %gep
+  %tmp0 = call i64 @llvm.amdgcn.update.dpp.i64(i64 %in1, i64 %load, i32 337, i32 15, i32 15, i1 1) #0
+  %tmp1 = bitcast i64 %tmp0 to double
+  %round = tail call double @llvm.ceil.f64(double %tmp1)
+  %tmp2 = bitcast double %round to i64
+  store i64 %tmp2, i64 addrspace(1)* %gep
+  ret void
+}
+
+; GCN-LABEL: {{^}}dpp64_rcp:
+; GCN:           global_load_dwordx2 [[V:v\[[0-9:]+\]]],
+; DPP64:         v_rcp_f64_dpp [[V]], [[V]] row_newbcast:1 row_mask:0xf bank_mask:0xf bound_ctrl:0{{$}}
+; DPP32-COUNT-2: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} row_share:1 row_mask:0xf bank_mask:0xf bound_ctrl:0{{$}}
+define amdgpu_kernel void @dpp64_rcp(i64 addrspace(1)* %arg, i64 %in1) {
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr inbounds i64, i64 addrspace(1)* %arg, i32 %id
+  %load = load i64, i64 addrspace(1)* %gep
+  %tmp0 = call i64 @llvm.amdgcn.update.dpp.i64(i64 %in1, i64 %load, i32 337, i32 15, i32 15, i1 1) #0
+  %tmp1 = bitcast i64 %tmp0 to double
+  %rcp = call double @llvm.amdgcn.rcp.f64(double %tmp1)
+  %tmp2 = bitcast double %rcp to i64
+  store i64 %tmp2, i64 addrspace(1)* %gep
+  ret void
+}
+
+; GCN-LABEL: {{^}}dpp64_rcp_unsupported_ctl:
+; GCN-COUNT-2: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} quad_perm:[1,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:0{{$}}
+; GCN:         v_rcp_f64_e32
+define amdgpu_kernel void @dpp64_rcp_unsupported_ctl(i64 addrspace(1)* %arg, i64 %in1) {
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr inbounds i64, i64 addrspace(1)* %arg, i32 %id
+  %load = load i64, i64 addrspace(1)* %gep
+  %tmp0 = call i64 @llvm.amdgcn.update.dpp.i64(i64 %in1, i64 %load, i32 1, i32 15, i32 15, i1 1) #0
+  %tmp1 = bitcast i64 %tmp0 to double
+  %rcp = fdiv fast double 1.0, %tmp1
+  %tmp2 = bitcast double %rcp to i64
+  store i64 %tmp2, i64 addrspace(1)* %gep
+  ret void
+}
+
+; GCN-LABEL: {{^}}dpp64_div:
+; GCN:            global_load_dwordx2 [[V:v\[[0-9:]+\]]],
+; GFX90A-COUNT-2: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} row_newbcast:1 row_mask:0xf bank_mask:0xf bound_ctrl:0{{$}}
+; GFX10-COUNT-2:  v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} row_share:1 row_mask:0xf bank_mask:0xf bound_ctrl:0{{$}}
+; GCN:            v_div_scale_f64
+; GCN:            v_rcp_f64_e32
+define amdgpu_kernel void @dpp64_div(i64 addrspace(1)* %arg, i64 %in1) {
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr inbounds i64, i64 addrspace(1)* %arg, i32 %id
+  %load = load i64, i64 addrspace(1)* %gep
+  %tmp0 = call i64 @llvm.amdgcn.update.dpp.i64(i64 %in1, i64 %load, i32 337, i32 15, i32 15, i1 1) #0
+  %tmp1 = bitcast i64 %tmp0 to double
+  %rcp = fdiv double 15.0, %tmp1
+  %tmp2 = bitcast double %rcp to i64
+  store i64 %tmp2, i64 addrspace(1)* %gep
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x()
+declare i64 @llvm.amdgcn.update.dpp.i64(i64, i64, i32, i32, i32, i1) #0
+declare double @llvm.ceil.f64(double)
+declare double @llvm.amdgcn.rcp.f64(double)
+
+attributes #0 = { nounwind readnone convergent }

diff  --git a/llvm/test/CodeGen/AMDGPU/dpp64_combine.mir b/llvm/test/CodeGen/AMDGPU/dpp64_combine.mir
new file mode 100644
index 000000000000..4ccfdb97254b
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/dpp64_combine.mir
@@ -0,0 +1,51 @@
+# RUN: llc -march=amdgcn -mcpu=gfx90a -run-pass=gcn-dpp-combine -verify-machineinstrs -o - %s | FileCheck %s --check-prefix=GCN
+
+---
+# GCN-LABEL: name: dpp64_old_impdef
+# GCN: %3:vreg_64 = V_CEIL_F64_dpp %1, 0, %0, 337, 15, 15, 1, implicit $mode, implicit $exec
+---
+name: dpp64_old_impdef
+tracksRegLiveness: true
+body: |
+  bb.0:
+    %0:vreg_64 = IMPLICIT_DEF
+    %1:vreg_64 = IMPLICIT_DEF
+    %2:vreg_64 = V_MOV_B64_DPP_PSEUDO %1:vreg_64, %0:vreg_64, 337, 15, 15, 1, implicit $exec
+    %3:vreg_64 = V_CEIL_F64_e32 %2, implicit $mode, implicit $exec
+...
+
+# GCN-LABEL: name: dpp64_old_undef
+# GCN: %3:vreg_64 = V_CEIL_F64_dpp undef %1:vreg_64, 0, undef %2:vreg_64, 337, 15, 15, 1, implicit $mode, implicit $exec
+---
+name: dpp64_old_undef
+tracksRegLiveness: true
+body: |
+  bb.0:
+    %2:vreg_64 = V_MOV_B64_DPP_PSEUDO undef %1:vreg_64, undef %0:vreg_64, 337, 15, 15, 1, implicit $exec
+    %3:vreg_64 = V_CEIL_F64_e32 %2, implicit $mode, implicit $exec
+...
+
+# GCN-LABEL: name: dpp64_old_is_0
+# GCN: %3:vreg_64 = V_CEIL_F64_dpp %4, 0, undef %2:vreg_64, 337, 15, 15, 1, implicit $mode, implicit $exec
+name: dpp64_old_is_0
+tracksRegLiveness: true
+body: |
+  bb.0:
+    %1:vreg_64 = V_MOV_B64_PSEUDO 0, implicit $exec
+    %2:vreg_64 = V_MOV_B64_DPP_PSEUDO undef %1, undef %0:vreg_64, 337, 15, 15, 1, implicit $exec
+    %3:vreg_64 = V_CEIL_F64_e32 %2, implicit $mode, implicit $exec
+...
+
+# DPP64 does not support all control values and must be split to become legal
+# GCN-LABEL: name: dpp64_illegal_ctrl
+# GCN: %4:vgpr_32 = V_MOV_B32_dpp undef %1.sub0:vreg_64, undef %2.sub0:vreg_64, 1, 15, 15, 1, implicit $exec
+# GCN: %5:vgpr_32 = V_MOV_B32_dpp undef %1.sub1:vreg_64, undef %2.sub1:vreg_64, 1, 15, 15, 1, implicit $exec
+# GCN: %0:vreg_64 = REG_SEQUENCE %4, %subreg.sub0, %5, %subreg.sub1
+# GCN: %3:vreg_64 = V_CEIL_F64_e32 %0, implicit $mode, implicit $exec
+name: dpp64_illegal_ctrl
+tracksRegLiveness: true
+body: |
+  bb.0:
+    %2:vreg_64 = V_MOV_B64_DPP_PSEUDO undef %1:vreg_64, undef %0:vreg_64, 1, 15, 15, 1, implicit $exec
+    %3:vreg_64 = V_CEIL_F64_e32 %2, implicit $mode, implicit $exec
+...

diff  --git a/llvm/test/CodeGen/AMDGPU/elf-header-flags-mach.ll b/llvm/test/CodeGen/AMDGPU/elf-header-flags-mach.ll
index 85b068bb212f..56423f7519d7 100644
--- a/llvm/test/CodeGen/AMDGPU/elf-header-flags-mach.ll
+++ b/llvm/test/CodeGen/AMDGPU/elf-header-flags-mach.ll
@@ -52,6 +52,7 @@
 ; RUN: llc -filetype=obj -march=amdgcn -mcpu=gfx906 < %s | llvm-readobj -file-headers - | FileCheck --check-prefixes=ALL,ARCH-GCN,GFX906 %s
 ; RUN: llc -filetype=obj -march=amdgcn -mcpu=gfx908 < %s | llvm-readobj -file-headers - | FileCheck --check-prefixes=ALL,ARCH-GCN,GFX908 %s
 ; RUN: llc -filetype=obj -march=amdgcn -mcpu=gfx909 < %s | llvm-readobj -file-headers - | FileCheck --check-prefixes=ALL,ARCH-GCN,GFX909 %s
+; RUN: llc -filetype=obj -march=amdgcn -mcpu=gfx90a < %s | llvm-readobj -file-headers - | FileCheck --check-prefixes=ALL,ARCH-GCN,GFX90A %s
 ; RUN: llc -filetype=obj -march=amdgcn -mcpu=gfx90c < %s | llvm-readobj -file-headers - | FileCheck --check-prefixes=ALL,ARCH-GCN,GFX90C %s
 ; RUN: llc -filetype=obj -march=amdgcn -mcpu=gfx1010 < %s | llvm-readobj -file-headers - | FileCheck --check-prefixes=ALL,ARCH-GCN,GFX1010 %s
 ; RUN: llc -filetype=obj -march=amdgcn -mcpu=gfx1011 < %s | llvm-readobj -file-headers - | FileCheck --check-prefixes=ALL,ARCH-GCN,GFX1011 %s
@@ -109,6 +110,7 @@
 ; GFX906:        EF_AMDGPU_MACH_AMDGCN_GFX906 (0x2F)
 ; GFX908:        EF_AMDGPU_MACH_AMDGCN_GFX908 (0x30)
 ; GFX909:        EF_AMDGPU_MACH_AMDGCN_GFX909 (0x31)
+; GFX90A:        EF_AMDGPU_MACH_AMDGCN_GFX90A (0x3F)
 ; GFX90C:        EF_AMDGPU_MACH_AMDGCN_GFX90C (0x32)
 ; GFX1010:       EF_AMDGPU_MACH_AMDGCN_GFX1010 (0x33)
 ; GFX1011:       EF_AMDGPU_MACH_AMDGCN_GFX1011 (0x34)

diff  --git a/llvm/test/CodeGen/AMDGPU/endpgm-dce.mir b/llvm/test/CodeGen/AMDGPU/endpgm-dce.mir
index 3c0c5715420e..4a1b03df8952 100644
--- a/llvm/test/CodeGen/AMDGPU/endpgm-dce.mir
+++ b/llvm/test/CodeGen/AMDGPU/endpgm-dce.mir
@@ -17,7 +17,7 @@ body:             |
     %0 = IMPLICIT_DEF
     %3 = IMPLICIT_DEF
     $sgpr0_sgpr1 = S_OR_B64 $exec, killed $vcc, implicit-def $scc
-    %1 = FLAT_LOAD_DWORD %0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4)
+    %1 = FLAT_LOAD_DWORD %0, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4)
     %2 = V_ADD_F32_e64 0, killed %1, 0, 1, 0, 0, implicit $mode, implicit $exec
     %4 = S_ADD_U32 %3, 1, implicit-def $scc
     S_ENDPGM 0
@@ -25,7 +25,7 @@ body:             |
 ---
 # GCN-LABEL: name: load_without_memoperand
 # GCN:      $sgpr0_sgpr1 = S_OR_B64 $exec, killed $vcc, implicit-def $scc
-# GCN-NEXT: dead %1:vgpr_32 = FLAT_LOAD_DWORD %0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+# GCN-NEXT: dead %1:vgpr_32 = FLAT_LOAD_DWORD %0, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
 # GCN-NEXT: S_ENDPGM 0
 name: load_without_memoperand
 tracksRegLiveness: true
@@ -41,7 +41,7 @@ body:             |
     %0 = IMPLICIT_DEF
     %3 = IMPLICIT_DEF
     $sgpr0_sgpr1 = S_OR_B64 $exec, killed $vcc, implicit-def $scc
-    %1 = FLAT_LOAD_DWORD %0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    %1 = FLAT_LOAD_DWORD %0, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
     %2 = V_ADD_F32_e64 0, killed %1, 0, 1, 0, 0, implicit $mode, implicit $exec
     %4 = S_ADD_U32 %3, 1, implicit-def $scc
     S_ENDPGM 0
@@ -49,7 +49,7 @@ body:             |
 ---
 # GCN-LABEL: name: load_volatile
 # GCN:      $sgpr0_sgpr1 = S_OR_B64 $exec, killed $vcc, implicit-def $scc
-# GCN-NEXT: dead %1:vgpr_32 = FLAT_LOAD_DWORD %0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load 4)
+# GCN-NEXT: dead %1:vgpr_32 = FLAT_LOAD_DWORD %0, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load 4)
 # GCN-NEXT: S_ENDPGM 0
 name: load_volatile
 tracksRegLiveness: true
@@ -65,7 +65,7 @@ body:             |
     %0 = IMPLICIT_DEF
     %3 = IMPLICIT_DEF
     $sgpr0_sgpr1 = S_OR_B64 $exec, killed $vcc, implicit-def $scc
-    %1 = FLAT_LOAD_DWORD %0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load 4)
+    %1 = FLAT_LOAD_DWORD %0, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load 4)
     %2 = V_ADD_F32_e64 0, killed %1, 0, 1, 0, 0, implicit $mode, implicit $exec
     %4 = S_ADD_U32 %3, 1, implicit-def $scc
     S_ENDPGM 0
@@ -73,7 +73,7 @@ body:             |
 ---
 # GCN-LABEL: name: store
 # GCN:      $sgpr0_sgpr1 = S_OR_B64 $exec, killed $vcc, implicit-def $scc
-# GCN-NEXT: FLAT_STORE_DWORD %0, %1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4)
+# GCN-NEXT: FLAT_STORE_DWORD %0, %1, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4)
 # GCN-NEXT: S_ENDPGM 0
 name: store
 tracksRegLiveness: true
@@ -86,7 +86,7 @@ body:             |
     %0 = IMPLICIT_DEF
     %1 = IMPLICIT_DEF
     $sgpr0_sgpr1 = S_OR_B64 $exec, killed $vcc, implicit-def $scc
-    FLAT_STORE_DWORD %0, %1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4)
+    FLAT_STORE_DWORD %0, %1, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4)
     S_ENDPGM 0
 ...
 ---

diff  --git a/llvm/test/CodeGen/AMDGPU/expand-si-indirect.mir b/llvm/test/CodeGen/AMDGPU/expand-si-indirect.mir
index f25bb852f31f..2b8435405d86 100644
--- a/llvm/test/CodeGen/AMDGPU/expand-si-indirect.mir
+++ b/llvm/test/CodeGen/AMDGPU/expand-si-indirect.mir
@@ -53,49 +53,49 @@ body:             |
     %28:vgpr_32 = COPY %23.sub13
     %29:vgpr_32 = COPY %23.sub12
     %30:vreg_128 = REG_SEQUENCE killed %29, %subreg.sub0, killed %28, %subreg.sub1, killed %27, %subreg.sub2, killed %26, %subreg.sub3
-    GLOBAL_STORE_DWORDX4_SADDR %1, killed %30, %2, 48, 0, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORDX4_SADDR %1, killed %30, %2, 48, 0, 0, 0, 0, implicit $exec
     %31:vgpr_32 = COPY %23.sub11
     %32:vgpr_32 = COPY %23.sub10
     %33:vgpr_32 = COPY %23.sub9
     %34:vgpr_32 = COPY %23.sub8
     %35:vreg_128 = REG_SEQUENCE killed %34, %subreg.sub0, killed %33, %subreg.sub1, killed %32, %subreg.sub2, killed %31, %subreg.sub3
-    GLOBAL_STORE_DWORDX4_SADDR %1, killed %35, %2, 32, 0, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORDX4_SADDR %1, killed %35, %2, 32, 0, 0, 0, 0, implicit $exec
     %36:vgpr_32 = COPY %23.sub7
     %37:vgpr_32 = COPY %23.sub6
     %38:vgpr_32 = COPY %23.sub5
     %39:vgpr_32 = COPY %23.sub4
     %40:vreg_128 = REG_SEQUENCE killed %39, %subreg.sub0, killed %38, %subreg.sub1, killed %37, %subreg.sub2, killed %36, %subreg.sub3
-    GLOBAL_STORE_DWORDX4_SADDR %1, killed %40, %2, 16, 0, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORDX4_SADDR %1, killed %40, %2, 16, 0, 0, 0, 0, implicit $exec
     %41:vgpr_32 = COPY %23.sub3
     %42:vgpr_32 = COPY %23.sub2
     %43:vgpr_32 = COPY %23.sub1
     %44:vgpr_32 = COPY killed %23.sub0
     %45:vreg_128 = REG_SEQUENCE killed %44, %subreg.sub0, killed %43, %subreg.sub1, killed %42, %subreg.sub2, killed %41, %subreg.sub3
-    GLOBAL_STORE_DWORDX4_SADDR %1, killed %45, %2, 0, 0, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORDX4_SADDR %1, killed %45, %2, 0, 0, 0, 0, 0, implicit $exec
     %46:vgpr_32 = COPY %25.sub15
     %47:vgpr_32 = COPY %25.sub14
     %48:vgpr_32 = COPY %25.sub13
     %49:vgpr_32 = COPY %25.sub12
     %50:vreg_128 = REG_SEQUENCE killed %49, %subreg.sub0, killed %48, %subreg.sub1, killed %47, %subreg.sub2, killed %46, %subreg.sub3
-    GLOBAL_STORE_DWORDX4_SADDR %1, killed %50, %2, 112, 0, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORDX4_SADDR %1, killed %50, %2, 112, 0, 0, 0, 0, implicit $exec
     %51:vgpr_32 = COPY %25.sub11
     %52:vgpr_32 = COPY %25.sub10
     %53:vgpr_32 = COPY %25.sub9
     %54:vgpr_32 = COPY %25.sub8
     %55:vreg_128 = REG_SEQUENCE killed %54, %subreg.sub0, killed %53, %subreg.sub1, killed %52, %subreg.sub2, killed %51, %subreg.sub3
-    GLOBAL_STORE_DWORDX4_SADDR %1, killed %55, %2, 96, 0, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORDX4_SADDR %1, killed %55, %2, 96, 0, 0, 0, 0, implicit $exec
     %56:vgpr_32 = COPY %25.sub7
     %57:vgpr_32 = COPY %25.sub6
     %58:vgpr_32 = COPY %25.sub5
     %59:vgpr_32 = COPY %25.sub4
     %60:vreg_128 = REG_SEQUENCE killed %59, %subreg.sub0, killed %58, %subreg.sub1, killed %57, %subreg.sub2, killed %56, %subreg.sub3
-    GLOBAL_STORE_DWORDX4_SADDR %1, killed %60, %2, 80, 0, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORDX4_SADDR %1, killed %60, %2, 80, 0, 0, 0, 0, implicit $exec
     %61:vgpr_32 = COPY %25.sub3
     %62:vgpr_32 = COPY %25.sub2
     %63:vgpr_32 = COPY %25.sub1
     %64:vgpr_32 = COPY killed %25.sub0
     %65:vreg_128 = REG_SEQUENCE killed %64, %subreg.sub0, killed %63, %subreg.sub1, killed %62, %subreg.sub2, killed %61, %subreg.sub3
-    GLOBAL_STORE_DWORDX4_SADDR killed %1, killed %65, killed %2, 64, 0, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORDX4_SADDR killed %1, killed %65, killed %2, 64, 0, 0, 0, 0, implicit $exec
     S_ENDPGM 0
 
 ...

diff  --git a/llvm/test/CodeGen/AMDGPU/extract_subvector_vec4_vec3.ll b/llvm/test/CodeGen/AMDGPU/extract_subvector_vec4_vec3.ll
index 36ec2746a789..efc67993b55f 100644
--- a/llvm/test/CodeGen/AMDGPU/extract_subvector_vec4_vec3.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract_subvector_vec4_vec3.ll
@@ -12,7 +12,7 @@ define amdgpu_hs void @main([0 x i8] addrspace(6)* inreg %arg) {
   ; GCN:   [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
   ; GCN:   [[COPY:%[0-9]+]]:vgpr_32 = COPY [[DEF]]
   ; GCN:   [[DEF1:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF
-  ; GCN:   [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY]], [[DEF1]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource", align 1, addrspace 4)
+  ; GCN:   [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY]], [[DEF1]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource", align 1, addrspace 4)
   ; GCN:   [[COPY1:%[0-9]+]]:sgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub2
   ; GCN:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub1
   ; GCN:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub0
@@ -21,7 +21,7 @@ define amdgpu_hs void @main([0 x i8] addrspace(6)* inreg %arg) {
   ; GCN:   [[DEF2:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
   ; GCN:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[DEF2]]
   ; GCN:   [[DEF3:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF
-  ; GCN:   BUFFER_STORE_DWORDX3_OFFEN_exact killed [[COPY4]], [[COPY5]], [[DEF3]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12 into custom "BufferResource", align 1, addrspace 4)
+  ; GCN:   BUFFER_STORE_DWORDX3_OFFEN_exact killed [[COPY4]], [[COPY5]], [[DEF3]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12 into custom "BufferResource", align 1, addrspace 4)
   ; GCN:   S_ENDPGM 0
 main_body:
   %tmp25 = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> undef, i32 undef, i32 0, i32 0)

diff  --git a/llvm/test/CodeGen/AMDGPU/fastregalloc-self-loop-heuristic.mir b/llvm/test/CodeGen/AMDGPU/fastregalloc-self-loop-heuristic.mir
index 7fa8a98e69dd..83cfb1278acd 100644
--- a/llvm/test/CodeGen/AMDGPU/fastregalloc-self-loop-heuristic.mir
+++ b/llvm/test/CodeGen/AMDGPU/fastregalloc-self-loop-heuristic.mir
@@ -17,8 +17,8 @@ body:             |
   ; GCN: bb.1:
   ; GCN:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
   ; GCN:   $vgpr0_vgpr1 = SI_SPILL_V64_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load 8 from %stack.0, align 4, addrspace 5)
-  ; GCN:   renamable $vgpr2 = GLOBAL_LOAD_DWORD renamable $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec
-  ; GCN:   GLOBAL_STORE_DWORD renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec
+  ; GCN:   renamable $vgpr2 = GLOBAL_LOAD_DWORD renamable $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $exec
+  ; GCN:   GLOBAL_STORE_DWORD renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, 0, implicit $exec
   ; GCN:   S_CBRANCH_EXECZ %bb.1, implicit $exec
   ; GCN: bb.2:
   ; GCN:   S_ENDPGM 0
@@ -27,8 +27,8 @@ body:             |
     %0:vreg_64 = COPY $vgpr0_vgpr1
 
   bb.1:
-    %1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 0, 0, 0, implicit $exec
-    GLOBAL_STORE_DWORD %0, %1, 0, 0, 0, 0, implicit $exec
+    %1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 0, 0, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD %0, %1, 0, 0, 0, 0, 0, implicit $exec
     S_CBRANCH_EXECZ %bb.1, implicit $exec
 
   bb.2:
@@ -52,11 +52,11 @@ body:             |
   ; GCN: bb.1:
   ; GCN:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
   ; GCN:   $vgpr0_vgpr1 = SI_SPILL_V64_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load 8 from %stack.0, align 4, addrspace 5)
-  ; GCN:   renamable $vgpr2 = GLOBAL_LOAD_DWORD renamable $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec
-  ; GCN:   GLOBAL_STORE_DWORD renamable $vgpr0_vgpr1, renamable $vgpr2, 0, 0, 0, 0, implicit $exec
-  ; GCN:   renamable $vgpr2 = GLOBAL_LOAD_DWORD renamable $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec
+  ; GCN:   renamable $vgpr2 = GLOBAL_LOAD_DWORD renamable $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $exec
+  ; GCN:   GLOBAL_STORE_DWORD renamable $vgpr0_vgpr1, renamable $vgpr2, 0, 0, 0, 0, 0, implicit $exec
+  ; GCN:   renamable $vgpr2 = GLOBAL_LOAD_DWORD renamable $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $exec
   ; GCN:   SI_SPILL_V32_SAVE $vgpr2, %stack.1, $sgpr32, 0, implicit $exec :: (store 4 into %stack.1, addrspace 5)
-  ; GCN:   GLOBAL_STORE_DWORD renamable $vgpr0_vgpr1, renamable $vgpr2, 0, 0, 0, 0, implicit $exec
+  ; GCN:   GLOBAL_STORE_DWORD renamable $vgpr0_vgpr1, renamable $vgpr2, 0, 0, 0, 0, 0, implicit $exec
   ; GCN:   S_CBRANCH_EXECZ %bb.1, implicit $exec
   ; GCN: bb.2:
   ; GCN:   S_ENDPGM 0
@@ -65,10 +65,10 @@ body:             |
     %0:vreg_64 = COPY $vgpr0_vgpr1
 
   bb.1:
-    %1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 0, 0, 0, implicit $exec
-    GLOBAL_STORE_DWORD %0, %1, 0, 0, 0, 0, implicit $exec
-    %1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 0, 0, 0, implicit $exec
-    GLOBAL_STORE_DWORD %0, %1, 0, 0, 0, 0, implicit $exec
+    %1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 0, 0, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD %0, %1, 0, 0, 0, 0, 0, implicit $exec
+    %1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 0, 0, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD %0, %1, 0, 0, 0, 0, 0, implicit $exec
     S_CBRANCH_EXECZ %bb.1, implicit $exec
 
   bb.2:
@@ -96,7 +96,7 @@ body:             |
   ; GCN:   $vgpr0_vgpr1 = SI_SPILL_V64_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load 8 from %stack.0, align 4, addrspace 5)
   ; GCN:   renamable $vgpr2 = V_ADD_U32_e32 1, undef $vgpr0, implicit $exec
   ; GCN:   SI_SPILL_V32_SAVE $vgpr2, %stack.1, $sgpr32, 0, implicit $exec :: (store 4 into %stack.1, addrspace 5)
-  ; GCN:   GLOBAL_STORE_DWORD renamable $vgpr0_vgpr1, renamable $vgpr2, 0, 0, 0, 0, implicit $exec
+  ; GCN:   GLOBAL_STORE_DWORD renamable $vgpr0_vgpr1, renamable $vgpr2, 0, 0, 0, 0, 0, implicit $exec
   ; GCN:   S_CBRANCH_EXECZ %bb.1, implicit $exec
   ; GCN: bb.2:
   ; GCN:   S_ENDPGM 0
@@ -106,7 +106,7 @@ body:             |
 
   bb.1:
     %1:vgpr_32 = V_ADD_U32_e32 1, undef %1, implicit $exec
-    GLOBAL_STORE_DWORD %0, %1, 0, 0, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD %0, %1, 0, 0, 0, 0, 0, implicit $exec
     S_CBRANCH_EXECZ %bb.1, implicit $exec
 
   bb.2:
@@ -130,7 +130,7 @@ body:             |
   ; GCN: bb.1:
   ; GCN:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
   ; GCN:   $vgpr0_vgpr1 = SI_SPILL_V64_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load 8 from %stack.0, align 4, addrspace 5)
-  ; GCN:   GLOBAL_STORE_DWORD renamable $vgpr0_vgpr1, undef renamable $vgpr0, 0, 0, 0, 0, implicit $exec
+  ; GCN:   GLOBAL_STORE_DWORD renamable $vgpr0_vgpr1, undef renamable $vgpr0, 0, 0, 0, 0, 0, implicit $exec
   ; GCN:   renamable $vgpr0 = V_ADD_U32_e64 1, 1, 0, implicit $exec
   ; GCN:   SI_SPILL_V32_SAVE killed $vgpr0, %stack.1, $sgpr32, 0, implicit $exec :: (store 4 into %stack.1, addrspace 5)
   ; GCN:   S_CBRANCH_EXECZ %bb.1, implicit $exec
@@ -141,7 +141,7 @@ body:             |
     %0:vreg_64 = COPY $vgpr0_vgpr1
 
   bb.1:
-    GLOBAL_STORE_DWORD %0, undef %1, 0, 0, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD %0, undef %1, 0, 0, 0, 0, 0, implicit $exec
     %1:vgpr_32 = V_ADD_U32_e64 1, 1, 0, implicit $exec
     S_CBRANCH_EXECZ %bb.1, implicit $exec
 
@@ -166,8 +166,8 @@ body:             |
   ; GCN: bb.1:
   ; GCN:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
   ; GCN:   $vgpr0_vgpr1 = SI_SPILL_V64_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load 8 from %stack.0, align 4, addrspace 5)
-  ; GCN:   undef renamable $vgpr3 = GLOBAL_LOAD_DWORD renamable $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit-def dead $vgpr2_vgpr3
-  ; GCN:   GLOBAL_STORE_DWORD renamable $vgpr0_vgpr1, undef renamable $vgpr1, 0, 0, 0, 0, implicit $exec
+  ; GCN:   undef renamable $vgpr3 = GLOBAL_LOAD_DWORD renamable $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $exec, implicit-def dead $vgpr2_vgpr3
+  ; GCN:   GLOBAL_STORE_DWORD renamable $vgpr0_vgpr1, undef renamable $vgpr1, 0, 0, 0, 0, 0, implicit $exec
   ; GCN:   S_CBRANCH_EXECZ %bb.1, implicit $exec
   ; GCN: bb.2:
   ; GCN:   S_ENDPGM 0
@@ -176,8 +176,8 @@ body:             |
     %0:vreg_64 = COPY $vgpr0_vgpr1
 
   bb.1:
-    undef %1.sub1:vreg_64 = GLOBAL_LOAD_DWORD %0, 0, 0, 0, 0, implicit $exec
-    GLOBAL_STORE_DWORD %0, undef %1.sub1, 0, 0, 0, 0, implicit $exec
+    undef %1.sub1:vreg_64 = GLOBAL_LOAD_DWORD %0, 0, 0, 0, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD %0, undef %1.sub1, 0, 0, 0, 0, 0, implicit $exec
     S_CBRANCH_EXECZ %bb.1, implicit $exec
 
   bb.2:

diff  --git a/llvm/test/CodeGen/AMDGPU/flat-load-clustering.mir b/llvm/test/CodeGen/AMDGPU/flat-load-clustering.mir
index cfa623fa1ebf..c9bf75ef681f 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-load-clustering.mir
+++ b/llvm/test/CodeGen/AMDGPU/flat-load-clustering.mir
@@ -61,17 +61,17 @@ body:             |
     undef %12.sub0 = V_ADD_CO_U32_e32 %4.sub0, %7, implicit-def $vcc, implicit $exec
     %11 = COPY %4.sub1
     %12.sub1 = V_ADDC_U32_e32 %11, %2, implicit-def dead $vcc, implicit killed $vcc, implicit $exec
-    %5 = FLAT_LOAD_DWORD %12, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %ir.gep1)
+    %5 = FLAT_LOAD_DWORD %12, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %ir.gep1)
     undef %9.sub0 = V_ADD_CO_U32_e32 %3.sub0, %7, implicit-def $vcc, implicit $exec
     %8 = COPY %3.sub1
     %9.sub1 = V_ADDC_U32_e32 %8, %2, implicit-def dead $vcc, implicit killed $vcc, implicit $exec
     undef %13.sub0 = V_ADD_CO_U32_e32 16, %12.sub0, implicit-def $vcc, implicit $exec
     %13.sub1 = V_ADDC_U32_e32 %12.sub1, %2, implicit-def dead $vcc, implicit killed $vcc, implicit $exec
-    %6 = FLAT_LOAD_DWORD %13, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %ir.gep34)
+    %6 = FLAT_LOAD_DWORD %13, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %ir.gep34)
     undef %10.sub0 = V_ADD_CO_U32_e32 16, %9.sub0, implicit-def $vcc, implicit $exec
     %10.sub1 = V_ADDC_U32_e32 %9.sub1, %2, implicit-def dead $vcc, implicit killed $vcc, implicit $exec
-    FLAT_STORE_DWORD %9, %5, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into %ir.gep2)
-    FLAT_STORE_DWORD %10, %6, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into %ir.gep4)
+    FLAT_STORE_DWORD %9, %5, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into %ir.gep2)
+    FLAT_STORE_DWORD %10, %6, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into %ir.gep4)
     S_ENDPGM 0
 
 ...

diff  --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-fold-fi.mir b/llvm/test/CodeGen/AMDGPU/flat-scratch-fold-fi.mir
index 37cec99ae0ac..5a76164a6706 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-scratch-fold-fi.mir
+++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-fold-fi.mir
@@ -9,10 +9,10 @@ body:             |
   bb.0.entry:
     ; GCN-LABEL: name: test_fold_fi_scratch_load_vgpr
     ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
-    ; GCN: [[SCRATCH_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_DWORD_SADDR %stack.0, 4, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0, addrspace 5)
+    ; GCN: [[SCRATCH_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_DWORD_SADDR %stack.0, 4, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0, addrspace 5)
     ; GCN: S_ENDPGM 0
     %0:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
-    %1:vgpr_32 = SCRATCH_LOAD_DWORD %0:vgpr_32, 4, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0, addrspace 5)
+    %1:vgpr_32 = SCRATCH_LOAD_DWORD %0:vgpr_32, 4, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0, addrspace 5)
     S_ENDPGM 0
 
 ...
@@ -25,10 +25,10 @@ body:             |
   bb.0.entry:
     ; GCN-LABEL: name: test_fold_fi_scratch_load_sgpr
     ; GCN: [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 %stack.0
-    ; GCN: [[SCRATCH_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_DWORD_SADDR %stack.0, 4, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0, addrspace 5)
+    ; GCN: [[SCRATCH_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_DWORD_SADDR %stack.0, 4, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0, addrspace 5)
     ; GCN: S_ENDPGM 0
     %0:sgpr_32 = S_MOV_B32 %stack.0
-    %1:vgpr_32 = SCRATCH_LOAD_DWORD_SADDR %0:sgpr_32, 4, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0, addrspace 5)
+    %1:vgpr_32 = SCRATCH_LOAD_DWORD_SADDR %0:sgpr_32, 4, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0, addrspace 5)
     S_ENDPGM 0
 
 ...
@@ -42,11 +42,11 @@ body:             |
     ; GCN-LABEL: name: test_fold_fi_scratch_store_vgpr
     ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
     ; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-    ; GCN: SCRATCH_STORE_DWORD_SADDR [[DEF]], %stack.0, 4, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into %stack.0, addrspace 5)
+    ; GCN: SCRATCH_STORE_DWORD_SADDR [[DEF]], %stack.0, 4, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into %stack.0, addrspace 5)
     ; GCN: S_ENDPGM 0
     %0:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
     %1:vgpr_32 = IMPLICIT_DEF
-    SCRATCH_STORE_DWORD %1:vgpr_32, %0:vgpr_32, 4, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into %stack.0, addrspace 5)
+    SCRATCH_STORE_DWORD %1:vgpr_32, %0:vgpr_32, 4, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into %stack.0, addrspace 5)
     S_ENDPGM 0
 
 ...
@@ -60,11 +60,11 @@ body:             |
     ; GCN-LABEL: name: test_no_fold_fi_scratch_store_vgpr
     ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
     ; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-    ; GCN: SCRATCH_STORE_DWORD [[V_MOV_B32_e32_]], [[DEF]], 4, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into %stack.0, addrspace 5)
+    ; GCN: SCRATCH_STORE_DWORD [[V_MOV_B32_e32_]], [[DEF]], 4, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into %stack.0, addrspace 5)
     ; GCN: S_ENDPGM 0
     %0:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
     %1:vgpr_32 = IMPLICIT_DEF
-    SCRATCH_STORE_DWORD %0:vgpr_32, %1:vgpr_32, 4, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into %stack.0, addrspace 5)
+    SCRATCH_STORE_DWORD %0:vgpr_32, %1:vgpr_32, 4, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into %stack.0, addrspace 5)
     S_ENDPGM 0
 
 ...
@@ -78,11 +78,11 @@ body:             |
     ; GCN-LABEL: name: test_fold_fi_scratch_store_sgpr
     ; GCN: [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 %stack.0
     ; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-    ; GCN: SCRATCH_STORE_DWORD_SADDR [[DEF]], %stack.0, 4, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into %stack.0, addrspace 5)
+    ; GCN: SCRATCH_STORE_DWORD_SADDR [[DEF]], %stack.0, 4, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into %stack.0, addrspace 5)
     ; GCN: S_ENDPGM 0
     %0:sgpr_32 = S_MOV_B32 %stack.0
     %1:vgpr_32 = IMPLICIT_DEF
-    SCRATCH_STORE_DWORD_SADDR %1:vgpr_32, %0:sgpr_32, 4, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into %stack.0, addrspace 5)
+    SCRATCH_STORE_DWORD_SADDR %1:vgpr_32, %0:sgpr_32, 4, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into %stack.0, addrspace 5)
     S_ENDPGM 0
 
 ...

diff  --git a/llvm/test/CodeGen/AMDGPU/fma.f64.ll b/llvm/test/CodeGen/AMDGPU/fma.f64.ll
index 4bc8f5ec4636..b98ee9c81e8a 100644
--- a/llvm/test/CodeGen/AMDGPU/fma.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/fma.f64.ll
@@ -1,5 +1,6 @@
 ; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 ; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=gfx90a -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GFX90A -check-prefix=FUNC %s
 
 declare double @llvm.fma.f64(double, double, double) nounwind readnone
 declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>) nounwind readnone
@@ -8,6 +9,7 @@ declare double @llvm.fabs.f64(double) nounwind readnone
 
 ; FUNC-LABEL: {{^}}fma_f64:
 ; SI: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
+; GFX90A: v_fmac_f64_e32 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
 define amdgpu_kernel void @fma_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
                      double addrspace(1)* %in2, double addrspace(1)* %in3) {
    %r0 = load double, double addrspace(1)* %in1
@@ -21,6 +23,8 @@ define amdgpu_kernel void @fma_f64(double addrspace(1)* %out, double addrspace(1
 ; FUNC-LABEL: {{^}}fma_v2f64:
 ; SI: v_fma_f64
 ; SI: v_fma_f64
+; GFX90A: v_fmac_f64_e32 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
+; GFX90A: v_fmac_f64_e32 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
 define amdgpu_kernel void @fma_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %in1,
                        <2 x double> addrspace(1)* %in2, <2 x double> addrspace(1)* %in3) {
    %r0 = load <2 x double>, <2 x double> addrspace(1)* %in1
@@ -36,6 +40,10 @@ define amdgpu_kernel void @fma_v2f64(<2 x double> addrspace(1)* %out, <2 x doubl
 ; SI: v_fma_f64
 ; SI: v_fma_f64
 ; SI: v_fma_f64
+; GFX90A: v_fmac_f64_e32 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
+; GFX90A: v_fmac_f64_e32 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
+; GFX90A: v_fmac_f64_e32 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
+; GFX90A: v_fmac_f64_e32 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
 define amdgpu_kernel void @fma_v4f64(<4 x double> addrspace(1)* %out, <4 x double> addrspace(1)* %in1,
                        <4 x double> addrspace(1)* %in2, <4 x double> addrspace(1)* %in3) {
    %r0 = load <4 x double>, <4 x double> addrspace(1)* %in1
@@ -48,6 +56,7 @@ define amdgpu_kernel void @fma_v4f64(<4 x double> addrspace(1)* %out, <4 x doubl
 
 ; FUNC-LABEL: {{^}}fma_f64_abs_src0:
 ; SI: v_fma_f64 {{v\[[0-9]+:[0-9]+\], |v\[[0-9]+:[0-9]+\]|, v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
+; GFX90A: v_fmac_f64_e64 {{v\[[0-9]+:[0-9]+\], |v\[[0-9]+:[0-9]+\]|, v\[[0-9]+:[0-9]+\]}}
 define amdgpu_kernel void @fma_f64_abs_src0(double addrspace(1)* %out, double addrspace(1)* %in1,
                      double addrspace(1)* %in2, double addrspace(1)* %in3) {
    %r0 = load double, double addrspace(1)* %in1
@@ -61,6 +70,7 @@ define amdgpu_kernel void @fma_f64_abs_src0(double addrspace(1)* %out, double ad
 
 ; FUNC-LABEL: {{^}}fma_f64_abs_src1:
 ; SI: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], \|v\[[0-9]+:[0-9]+\]\|, v\[[0-9]+:[0-9]+\]}}
+; GFX90A: v_fmac_f64_e64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], \|v\[[0-9]+:[0-9]+\]\|}}
 define amdgpu_kernel void @fma_f64_abs_src1(double addrspace(1)* %out, double addrspace(1)* %in1,
                      double addrspace(1)* %in2, double addrspace(1)* %in3) {
    %r0 = load double, double addrspace(1)* %in1
@@ -74,6 +84,7 @@ define amdgpu_kernel void @fma_f64_abs_src1(double addrspace(1)* %out, double ad
 
 ; FUNC-LABEL: {{^}}fma_f64_abs_src2:
 ; SI: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], \|v\[[0-9]+:[0-9]+\]\|}}
+; GFX90A: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], \|v\[[0-9]+:[0-9]+\]\|}}
 define amdgpu_kernel void @fma_f64_abs_src2(double addrspace(1)* %out, double addrspace(1)* %in1,
                      double addrspace(1)* %in2, double addrspace(1)* %in3) {
    %r0 = load double, double addrspace(1)* %in1
@@ -87,6 +98,7 @@ define amdgpu_kernel void @fma_f64_abs_src2(double addrspace(1)* %out, double ad
 
 ; FUNC-LABEL: {{^}}fma_f64_neg_src0:
 ; SI: v_fma_f64 {{v\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
+; GFX90A: v_fmac_f64_e64 {{v\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
 define amdgpu_kernel void @fma_f64_neg_src0(double addrspace(1)* %out, double addrspace(1)* %in1,
                      double addrspace(1)* %in2, double addrspace(1)* %in3) {
    %r0 = load double, double addrspace(1)* %in1
@@ -100,6 +112,7 @@ define amdgpu_kernel void @fma_f64_neg_src0(double addrspace(1)* %out, double ad
 
 ; FUNC-LABEL: {{^}}fma_f64_neg_src1:
 ; SI: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
+; GFX90A: v_fmac_f64_e64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}}
 define amdgpu_kernel void @fma_f64_neg_src1(double addrspace(1)* %out, double addrspace(1)* %in1,
                      double addrspace(1)* %in2, double addrspace(1)* %in3) {
    %r0 = load double, double addrspace(1)* %in1
@@ -113,6 +126,7 @@ define amdgpu_kernel void @fma_f64_neg_src1(double addrspace(1)* %out, double ad
 
 ; FUNC-LABEL: {{^}}fma_f64_neg_src2:
 ; SI: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}}
+; GFX90A: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}}
 define amdgpu_kernel void @fma_f64_neg_src2(double addrspace(1)* %out, double addrspace(1)* %in1,
                      double addrspace(1)* %in2, double addrspace(1)* %in3) {
    %r0 = load double, double addrspace(1)* %in1
@@ -126,6 +140,7 @@ define amdgpu_kernel void @fma_f64_neg_src2(double addrspace(1)* %out, double ad
 
 ; FUNC-LABEL: {{^}}fma_f64_abs_neg_src0:
 ; SI: v_fma_f64 {{v\[[0-9]+:[0-9]+\], -\|v\[[0-9]+:[0-9]+\]\|, v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
+; GFX90A: v_fmac_f64_e64 {{v\[[0-9]+:[0-9]+\], -\|v\[[0-9]+:[0-9]+\]\|, v\[[0-9]+:[0-9]+\]}}
 define amdgpu_kernel void @fma_f64_abs_neg_src0(double addrspace(1)* %out, double addrspace(1)* %in1,
                      double addrspace(1)* %in2, double addrspace(1)* %in3) {
    %r0 = load double, double addrspace(1)* %in1
@@ -140,6 +155,7 @@ define amdgpu_kernel void @fma_f64_abs_neg_src0(double addrspace(1)* %out, doubl
 
 ; FUNC-LABEL: {{^}}fma_f64_abs_neg_src1:
 ; SI: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], -\|v\[[0-9]+:[0-9]+\]\|, v\[[0-9]+:[0-9]+\]}}
+; GFX90A: v_fmac_f64_e64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], -\|v\[[0-9]+:[0-9]+\]\|}}
 define amdgpu_kernel void @fma_f64_abs_neg_src1(double addrspace(1)* %out, double addrspace(1)* %in1,
                      double addrspace(1)* %in2, double addrspace(1)* %in3) {
    %r0 = load double, double addrspace(1)* %in1
@@ -154,6 +170,7 @@ define amdgpu_kernel void @fma_f64_abs_neg_src1(double addrspace(1)* %out, doubl
 
 ; FUNC-LABEL: {{^}}fma_f64_abs_neg_src2:
 ; SI: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], -\|v\[[0-9]+:[0-9]+\]\|}}
+; GFX90A: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], -\|v\[[0-9]+:[0-9]+\]\|}}
 define amdgpu_kernel void @fma_f64_abs_neg_src2(double addrspace(1)* %out, double addrspace(1)* %in1,
                      double addrspace(1)* %in2, double addrspace(1)* %in3) {
    %r0 = load double, double addrspace(1)* %in1
@@ -168,6 +185,7 @@ define amdgpu_kernel void @fma_f64_abs_neg_src2(double addrspace(1)* %out, doubl
 
 ; FUNC-LABEL: {{^}}fma_f64_lit_src0:
 ; SI: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], 2.0, v\[[0-9]+:[0-9]+\]}}
+; GFX90A: v_fmac_f64_e32 {{v\[[0-9]+:[0-9]+\], 2.0, v\[[0-9]+:[0-9]+\]}}
 define amdgpu_kernel void @fma_f64_lit_src0(double addrspace(1)* %out,
                      double addrspace(1)* %in2, double addrspace(1)* %in3) {
    %r1 = load double, double addrspace(1)* %in2
@@ -179,6 +197,7 @@ define amdgpu_kernel void @fma_f64_lit_src0(double addrspace(1)* %out,
 
 ; FUNC-LABEL: {{^}}fma_f64_lit_src1:
 ; SI: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], 2.0, v\[[0-9]+:[0-9]+\]}}
+; GFX90A: v_fmac_f64_e32 {{v\[[0-9]+:[0-9]+\], 2.0, v\[[0-9]+:[0-9]+\]}}
 define amdgpu_kernel void @fma_f64_lit_src1(double addrspace(1)* %out, double addrspace(1)* %in1,
                      double addrspace(1)* %in3) {
    %r0 = load double, double addrspace(1)* %in1
@@ -190,6 +209,7 @@ define amdgpu_kernel void @fma_f64_lit_src1(double addrspace(1)* %out, double ad
 
 ; FUNC-LABEL: {{^}}fma_f64_lit_src2:
 ; SI: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], 2.0}}
+; GFX90A: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], 2.0}}
 define amdgpu_kernel void @fma_f64_lit_src2(double addrspace(1)* %out, double addrspace(1)* %in1,
                      double addrspace(1)* %in2) {
    %r0 = load double, double addrspace(1)* %in1

diff  --git a/llvm/test/CodeGen/AMDGPU/fold-fi-mubuf.mir b/llvm/test/CodeGen/AMDGPU/fold-fi-mubuf.mir
index 2e55cac20cbc..038f9532ac43 100644
--- a/llvm/test/CodeGen/AMDGPU/fold-fi-mubuf.mir
+++ b/llvm/test/CodeGen/AMDGPU/fold-fi-mubuf.mir
@@ -22,13 +22,13 @@ body:             |
     ; GCN: liveins: $sgpr12_sgpr13_sgpr14_sgpr15
     ; GCN: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr12_sgpr13_sgpr14_sgpr15
     ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
-    ; GCN: [[BUFFER_LOAD_DWORD_IDXEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN [[V_MOV_B32_e32_]], [[COPY]], 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GCN: [[BUFFER_LOAD_DWORD_IDXEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN [[V_MOV_B32_e32_]], [[COPY]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     ; GCN: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_IDXEN]]
     ; GCN: SI_RETURN_TO_EPILOG $vgpr0
     %0:sgpr_128 = COPY $sgpr12_sgpr13_sgpr14_sgpr15
     %1:sreg_32_xm0 = S_MOV_B32 0
     %2:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
-    %3:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN %2, %0, %1, 0, 0, 0, 0, 0, 0, implicit $exec
+    %3:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN %2, %0, %1, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     $vgpr0 = COPY %3
     SI_RETURN_TO_EPILOG $vgpr0
 
@@ -54,12 +54,12 @@ body:             |
     ; GCN: liveins: $sgpr12_sgpr13_sgpr14_sgpr15
     ; GCN: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr12_sgpr13_sgpr14_sgpr15
     ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
-    ; GCN: [[BUFFER_LOAD_DWORD_IDXEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN [[V_MOV_B32_e32_]], [[COPY]], 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GCN: [[BUFFER_LOAD_DWORD_IDXEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN [[V_MOV_B32_e32_]], [[COPY]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     ; GCN: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_IDXEN]]
     ; GCN: SI_RETURN_TO_EPILOG $vgpr0
     %0:sgpr_128 = COPY $sgpr12_sgpr13_sgpr14_sgpr15
     %2:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
-    %3:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN %2, %0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    %3:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN %2, %0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     $vgpr0 = COPY %3
     SI_RETURN_TO_EPILOG $vgpr0
 
@@ -83,16 +83,16 @@ body:             |
     ; GCN-LABEL: name: kernel_no_fold_fi_non_stack_soffset
     ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
     ; GCN: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 7, implicit $exec
-    ; GCN: BUFFER_STORE_DWORD_OFFEN [[V_MOV_B32_e32_1]], [[V_MOV_B32_e32_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec
-    ; GCN: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_MOV_B32_e32_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GCN: BUFFER_STORE_DWORD_OFFEN [[V_MOV_B32_e32_1]], [[V_MOV_B32_e32_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GCN: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_MOV_B32_e32_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     ; GCN: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
     ; GCN: S_ENDPGM 0, implicit $vgpr0
     %0:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
     %1:vgpr_32 = V_MOV_B32_e32 7, implicit $exec
     %2:sreg_32_xm0 = S_MOV_B32 0
 
-    BUFFER_STORE_DWORD_OFFEN %1:vgpr_32, %0:vgpr_32, $sgpr0_sgpr1_sgpr2_sgpr3, %2, 0, 0, 0, 0, 0, 0, implicit $exec
-    %3:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %0:vgpr_32, $sgpr0_sgpr1_sgpr2_sgpr3, %2, 0, 0, 0, 0, 0, 0, implicit $exec
+    BUFFER_STORE_DWORD_OFFEN %1:vgpr_32, %0:vgpr_32, $sgpr0_sgpr1_sgpr2_sgpr3, %2, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    %3:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %0:vgpr_32, $sgpr0_sgpr1_sgpr2_sgpr3, %2, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     $vgpr0 = COPY %3
     S_ENDPGM 0, implicit $vgpr0
 
@@ -115,15 +115,15 @@ body:             |
 
     ; GCN-LABEL: name: kernel_fold_fi_mubuf
     ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 7, implicit $exec
-    ; GCN: BUFFER_STORE_DWORD_OFFEN [[V_MOV_B32_e32_]], %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec
-    ; GCN: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GCN: BUFFER_STORE_DWORD_OFFEN [[V_MOV_B32_e32_]], %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GCN: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     ; GCN: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
     ; GCN: S_ENDPGM 0, implicit $vgpr0
     %0:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
     %1:vgpr_32 = V_MOV_B32_e32 7, implicit $exec
 
-    BUFFER_STORE_DWORD_OFFEN %1:vgpr_32, %0:vgpr_32, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec
-    %2:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %0:vgpr_32, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    BUFFER_STORE_DWORD_OFFEN %1:vgpr_32, %0:vgpr_32, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    %2:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %0:vgpr_32, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     $vgpr0 = COPY %2
     S_ENDPGM 0, implicit $vgpr0
 
@@ -152,13 +152,13 @@ body:             |
     ; GCN: liveins: $sgpr12_sgpr13_sgpr14_sgpr15
     ; GCN: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr12_sgpr13_sgpr14_sgpr15
     ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
-    ; GCN: [[BUFFER_LOAD_DWORD_IDXEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN [[V_MOV_B32_e32_]], [[COPY]], 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GCN: [[BUFFER_LOAD_DWORD_IDXEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN [[V_MOV_B32_e32_]], [[COPY]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     ; GCN: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_IDXEN]]
     ; GCN: SI_RETURN_TO_EPILOG $vgpr0
     %0:sgpr_128 = COPY $sgpr12_sgpr13_sgpr14_sgpr15
     %1:sreg_32_xm0 = S_MOV_B32 0
     %2:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
-    %3:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN %2, %0, %1, 0, 0, 0, 0, 0, 0, implicit $exec
+    %3:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN %2, %0, %1, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     $vgpr0 = COPY %3
     SI_RETURN_TO_EPILOG $vgpr0
 
@@ -185,12 +185,12 @@ body:             |
     ; GCN: liveins: $sgpr12_sgpr13_sgpr14_sgpr15
     ; GCN: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr12_sgpr13_sgpr14_sgpr15
     ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
-    ; GCN: [[BUFFER_LOAD_DWORD_IDXEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN [[V_MOV_B32_e32_]], [[COPY]], 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GCN: [[BUFFER_LOAD_DWORD_IDXEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN [[V_MOV_B32_e32_]], [[COPY]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     ; GCN: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_IDXEN]]
     ; GCN: SI_RETURN_TO_EPILOG $vgpr0
     %0:sgpr_128 = COPY $sgpr12_sgpr13_sgpr14_sgpr15
     %2:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
-    %3:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN %2, %0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    %3:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN %2, %0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     $vgpr0 = COPY %3
     SI_RETURN_TO_EPILOG $vgpr0
 
@@ -214,15 +214,15 @@ body:             |
 
     ; GCN-LABEL: name: function_no_fold_fi_non_stack_soffset
     ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 7, implicit $exec
-    ; GCN: BUFFER_STORE_DWORD_OFFEN [[V_MOV_B32_e32_]], %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec
-    ; GCN: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GCN: BUFFER_STORE_DWORD_OFFEN [[V_MOV_B32_e32_]], %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GCN: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     ; GCN: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
     ; GCN: S_ENDPGM 0, implicit $vgpr0
     %0:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
     %1:vgpr_32 = V_MOV_B32_e32 7, implicit $exec
 
-    BUFFER_STORE_DWORD_OFFEN %1:vgpr_32, %0:vgpr_32, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec
-    %2:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %0:vgpr_32, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    BUFFER_STORE_DWORD_OFFEN %1:vgpr_32, %0:vgpr_32, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    %2:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %0:vgpr_32, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     $vgpr0 = COPY %2
     S_ENDPGM 0, implicit $vgpr0
 
@@ -246,15 +246,15 @@ body:             |
 
     ; GCN-LABEL: name: function_fold_fi_mubuf_wave_relative
     ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 7, implicit $exec
-    ; GCN: BUFFER_STORE_DWORD_OFFEN [[V_MOV_B32_e32_]], %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec
-    ; GCN: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GCN: BUFFER_STORE_DWORD_OFFEN [[V_MOV_B32_e32_]], %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GCN: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     ; GCN: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
     ; GCN: S_ENDPGM 0, implicit $vgpr0
     %0:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
     %1:vgpr_32 = V_MOV_B32_e32 7, implicit $exec
 
-    BUFFER_STORE_DWORD_OFFEN %1:vgpr_32, %0:vgpr_32, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec
-    %2:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %0:vgpr_32, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    BUFFER_STORE_DWORD_OFFEN %1:vgpr_32, %0:vgpr_32, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    %2:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %0:vgpr_32, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     $vgpr0 = COPY %2
     S_ENDPGM 0, implicit $vgpr0
 
@@ -278,15 +278,15 @@ body:             |
 
     ; GCN-LABEL: name: function_fold_fi_mubuf_stack_relative
     ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 7, implicit $exec
-    ; GCN: BUFFER_STORE_DWORD_OFFEN [[V_MOV_B32_e32_]], %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec
-    ; GCN: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GCN: BUFFER_STORE_DWORD_OFFEN [[V_MOV_B32_e32_]], %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GCN: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     ; GCN: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
     ; GCN: S_ENDPGM 0, implicit $vgpr0
     %0:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
     %1:vgpr_32 = V_MOV_B32_e32 7, implicit $exec
 
-    BUFFER_STORE_DWORD_OFFEN %1:vgpr_32, %0:vgpr_32, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec
-    %2:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %0:vgpr_32, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    BUFFER_STORE_DWORD_OFFEN %1:vgpr_32, %0:vgpr_32, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    %2:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %0:vgpr_32, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     $vgpr0 = COPY %2
     S_ENDPGM 0, implicit $vgpr0
 

diff  --git a/llvm/test/CodeGen/AMDGPU/fold-imm-copy.mir b/llvm/test/CodeGen/AMDGPU/fold-imm-copy.mir
index e921248cc325..399b55eea3b4 100644
--- a/llvm/test/CodeGen/AMDGPU/fold-imm-copy.mir
+++ b/llvm/test/CodeGen/AMDGPU/fold-imm-copy.mir
@@ -17,7 +17,7 @@ body:             |
     %4:vgpr_32 = V_LSHLREV_B32_e64 killed %3, %0, implicit $exec
     %5:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
     %6:vreg_64 = REG_SEQUENCE killed %4, %subreg.sub0, killed %5, %subreg.sub1
-    %7:vgpr_32 = BUFFER_LOAD_DWORD_ADDR64 %6, %2, 0, 4, 0, 0, 0, 0, 0, implicit $exec
+    %7:vgpr_32 = BUFFER_LOAD_DWORD_ADDR64 %6, %2, 0, 4, 0, 0, 0, 0, 0, 0, implicit $exec
     %8:sreg_32_xm0 = S_MOV_B32 65535
     %9:vgpr_32 = COPY %8
     %10:vgpr_32 = V_AND_B32_e32 %7, %9, implicit $exec

diff  --git a/llvm/test/CodeGen/AMDGPU/fold-imm-f16-f32.mir b/llvm/test/CodeGen/AMDGPU/fold-imm-f16-f32.mir
index b81556c94cce..c2f50953dcbf 100644
--- a/llvm/test/CodeGen/AMDGPU/fold-imm-f16-f32.mir
+++ b/llvm/test/CodeGen/AMDGPU/fold-imm-f16-f32.mir
@@ -158,10 +158,10 @@ body:             |
     %8 = S_MOV_B32 61440
     %9 = S_MOV_B32 -1
     %10 = REG_SEQUENCE killed %7, 1, killed %5, 2, killed %9, 3, killed %8, 4
-    %11 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 2 from `half addrspace(1)* undef`)
+    %11 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 2 from `half addrspace(1)* undef`)
     %12 = V_MOV_B32_e32 1065353216, implicit $exec
     %13 = V_ADD_F16_e64 0, killed %11, 0, %12, 0, 0, implicit $mode, implicit $exec
-    BUFFER_STORE_SHORT_OFFSET killed %13, %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 2 into `half addrspace(1)* undef`)
+    BUFFER_STORE_SHORT_OFFSET killed %13, %10, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 2 into `half addrspace(1)* undef`)
     S_ENDPGM 0
 
 ...
@@ -222,13 +222,13 @@ body:             |
     %8 = S_MOV_B32 61440
     %9 = S_MOV_B32 -1
     %10 = REG_SEQUENCE killed %7, 1, killed %5, 2, killed %9, 3, killed %8, 4
-    %11 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 2 from `half addrspace(1)* undef`)
-    %12 = BUFFER_LOAD_DWORD_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `float addrspace(1)* undef`)
+    %11 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 2 from `half addrspace(1)* undef`)
+    %12 = BUFFER_LOAD_DWORD_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `float addrspace(1)* undef`)
     %13 = V_MOV_B32_e32 1065353216, implicit $exec
     %14 = V_ADD_F16_e64 0, killed %11, 0, %13, 0, 0, implicit $mode, implicit $exec
     %15 = V_ADD_F16_e64 0, killed %12, 0, killed %13, 0, 0, implicit $mode, implicit $exec
-    BUFFER_STORE_SHORT_OFFSET killed %14, %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 2 into `half addrspace(1)* undef`)
-    BUFFER_STORE_SHORT_OFFSET killed %15, %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 2 into `half addrspace(1)* undef`)
+    BUFFER_STORE_SHORT_OFFSET killed %14, %10, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 2 into `half addrspace(1)* undef`)
+    BUFFER_STORE_SHORT_OFFSET killed %15, %10, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 2 into `half addrspace(1)* undef`)
     S_ENDPGM 0
 
 ...
@@ -289,14 +289,14 @@ body:             |
     %8 = S_MOV_B32 61440
     %9 = S_MOV_B32 -1
     %10 = REG_SEQUENCE killed %7, 1, killed %5, 2, killed %9, 3, killed %8, 4
-    %11 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 2 from `half addrspace(1)* undef`)
-    %12 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 2 from `half addrspace(1)* undef`)
-    %13 = BUFFER_LOAD_DWORD_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `float addrspace(1)* undef`)
+    %11 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 2 from `half addrspace(1)* undef`)
+    %12 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 2 from `half addrspace(1)* undef`)
+    %13 = BUFFER_LOAD_DWORD_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `float addrspace(1)* undef`)
     %14 = V_MOV_B32_e32 1065353216, implicit $exec
     %15 = V_ADD_F16_e64 0, %11, 0, %14, 0, 0, implicit $mode, implicit $exec
     %16 = V_ADD_F32_e64 0, killed %13, 0, killed %14, 0, 0, implicit $mode, implicit $exec
-    BUFFER_STORE_SHORT_OFFSET killed %15, %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 2 into `half addrspace(1)* undef`)
-    BUFFER_STORE_DWORD_OFFSET killed %16, %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `float addrspace(1)* undef`)
+    BUFFER_STORE_SHORT_OFFSET killed %15, %10, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 2 into `half addrspace(1)* undef`)
+    BUFFER_STORE_DWORD_OFFSET killed %16, %10, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `float addrspace(1)* undef`)
     S_ENDPGM 0
 
 ...
@@ -360,16 +360,16 @@ body:             |
     %8 = S_MOV_B32 61440
     %9 = S_MOV_B32 -1
     %10 = REG_SEQUENCE killed %7, 1, killed %5, 2, killed %9, 3, killed %8, 4
-    %11 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 2 from `half addrspace(1)* undef`)
-    %12 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 2 from `half addrspace(1)* undef`)
-    %13 = BUFFER_LOAD_DWORD_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `float addrspace(1)* undef`)
+    %11 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 2 from `half addrspace(1)* undef`)
+    %12 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 2 from `half addrspace(1)* undef`)
+    %13 = BUFFER_LOAD_DWORD_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `float addrspace(1)* undef`)
     %14 = V_MOV_B32_e32 1065353216, implicit $exec
     %15 = V_ADD_F16_e64 0, %11, 0, %14, 0, 0, implicit $mode, implicit $exec
     %16 = V_ADD_F16_e64 0, %12, 0, %14, 0, 0, implicit $mode, implicit $exec
     %17 = V_ADD_F32_e64 0, killed %13, 0, killed %14, 0, 0, implicit $mode, implicit $exec
-    BUFFER_STORE_SHORT_OFFSET killed %15, %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 2 into `half addrspace(1)* undef`)
-    BUFFER_STORE_SHORT_OFFSET killed %16, %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 2 into `half addrspace(1)* undef`)
-    BUFFER_STORE_DWORD_OFFSET killed %17, %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `float addrspace(1)* undef`)
+    BUFFER_STORE_SHORT_OFFSET killed %15, %10, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 2 into `half addrspace(1)* undef`)
+    BUFFER_STORE_SHORT_OFFSET killed %16, %10, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 2 into `half addrspace(1)* undef`)
+    BUFFER_STORE_DWORD_OFFSET killed %17, %10, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `float addrspace(1)* undef`)
     S_ENDPGM 0
 
 ...
@@ -427,13 +427,13 @@ body:             |
     %8 = S_MOV_B32 61440
     %9 = S_MOV_B32 -1
     %10 = REG_SEQUENCE killed %7, 1, killed %5, 2, killed %9, 3, killed %8, 4
-    %11 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 2 from `half addrspace(1)* undef`)
-    %12 = BUFFER_LOAD_DWORD_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `float addrspace(1)* undef`)
+    %11 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 2 from `half addrspace(1)* undef`)
+    %12 = BUFFER_LOAD_DWORD_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `float addrspace(1)* undef`)
     %13 = V_MOV_B32_e32 1, implicit $exec
     %14 = V_ADD_F16_e64 0, killed %11, 0, %13, 0, 0, implicit $mode, implicit $exec
     %15 = V_ADD_F16_e64 0, killed %12, 0, killed %13, 0, 0, implicit $mode, implicit $exec
-    BUFFER_STORE_SHORT_OFFSET killed %14, %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 2 into `half addrspace(1)* undef`)
-    BUFFER_STORE_SHORT_OFFSET killed %15, %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 2 into `half addrspace(1)* undef`)
+    BUFFER_STORE_SHORT_OFFSET killed %14, %10, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 2 into `half addrspace(1)* undef`)
+    BUFFER_STORE_SHORT_OFFSET killed %15, %10, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 2 into `half addrspace(1)* undef`)
     S_ENDPGM 0
 
 ...
@@ -494,16 +494,16 @@ body:             |
     %8 = S_MOV_B32 61440
     %9 = S_MOV_B32 -1
     %10 = REG_SEQUENCE killed %7, 1, killed %5, 2, killed %9, 3, killed %8, 4
-    %11 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 2 from `half addrspace(1)* undef`)
-    %12 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 2 from `half addrspace(1)* undef`)
-    %13 = BUFFER_LOAD_DWORD_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `float addrspace(1)* undef`)
+    %11 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 2 from `half addrspace(1)* undef`)
+    %12 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 2 from `half addrspace(1)* undef`)
+    %13 = BUFFER_LOAD_DWORD_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `float addrspace(1)* undef`)
     %14 = V_MOV_B32_e32 -2, implicit $exec
     %15 = V_ADD_F16_e64 0, %11, 0, %14, 0, 0, implicit $mode, implicit $exec
     %16 = V_ADD_F16_e64 0, %12, 0, %14, 0, 0, implicit $mode, implicit $exec
     %17 = V_ADD_F32_e64 0, killed %13, 0, killed %14, 0, 0, implicit $mode, implicit $exec
-    BUFFER_STORE_SHORT_OFFSET killed %15, %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 2 into `half addrspace(1)* undef`)
-    BUFFER_STORE_SHORT_OFFSET killed %16, %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 2 into `half addrspace(1)* undef`)
-    BUFFER_STORE_DWORD_OFFSET killed %17, %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `float addrspace(1)* undef`)
+    BUFFER_STORE_SHORT_OFFSET killed %15, %10, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 2 into `half addrspace(1)* undef`)
+    BUFFER_STORE_SHORT_OFFSET killed %16, %10, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 2 into `half addrspace(1)* undef`)
+    BUFFER_STORE_DWORD_OFFSET killed %17, %10, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `float addrspace(1)* undef`)
     S_ENDPGM 0
 
 ...
@@ -564,13 +564,13 @@ body:             |
     %8 = S_MOV_B32 61440
     %9 = S_MOV_B32 -1
     %10 = REG_SEQUENCE killed %7, 1, killed %5, 2, killed %9, 3, killed %8, 4
-    %11 = BUFFER_LOAD_DWORD_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `float addrspace(1)* undef`)
-    %12 = BUFFER_LOAD_DWORD_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `float addrspace(1)* undef`)
+    %11 = BUFFER_LOAD_DWORD_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `float addrspace(1)* undef`)
+    %12 = BUFFER_LOAD_DWORD_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `float addrspace(1)* undef`)
     %13 = V_MOV_B32_e32 15360, implicit $exec
     %14 = V_ADD_F32_e64 0, %11, 0, %13, 0, 0, implicit $mode, implicit $exec
     %15 = V_ADD_F32_e64 0, %12, 0, %13, 0, 0, implicit $mode, implicit $exec
-    BUFFER_STORE_DWORD_OFFSET killed %14, %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `float addrspace(1)* undef`)
-    BUFFER_STORE_DWORD_OFFSET killed %15, %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `float addrspace(1)* undef`)
+    BUFFER_STORE_DWORD_OFFSET killed %14, %10, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `float addrspace(1)* undef`)
+    BUFFER_STORE_DWORD_OFFSET killed %15, %10, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `float addrspace(1)* undef`)
     S_ENDPGM 0
 
 ...
@@ -631,13 +631,13 @@ body:             |
     %8 = S_MOV_B32 61440
     %9 = S_MOV_B32 -1
     %10 = REG_SEQUENCE killed %7, 1, killed %5, 2, killed %9, 3, killed %8, 4
-    %11 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 2 from `half addrspace(1)* undef`)
-    %12 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 2 from `half addrspace(1)* undef`)
+    %11 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 2 from `half addrspace(1)* undef`)
+    %12 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 2 from `half addrspace(1)* undef`)
     %13 = V_MOV_B32_e32 80886784, implicit $exec
     %14 = V_ADD_F16_e64 0, %11, 0, %13, 0, 0, implicit $mode, implicit $exec
     %15 = V_ADD_F16_e64 0, %12, 0, %13, 0, 0, implicit $mode, implicit $exec
-    BUFFER_STORE_SHORT_OFFSET killed %14, %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 2 into `half addrspace(1)* undef`)
-    BUFFER_STORE_SHORT_OFFSET killed %15, %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 2 into `half addrspace(1)* undef`)
+    BUFFER_STORE_SHORT_OFFSET killed %14, %10, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 2 into `half addrspace(1)* undef`)
+    BUFFER_STORE_SHORT_OFFSET killed %15, %10, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 2 into `half addrspace(1)* undef`)
     S_ENDPGM 0
 
 ...
@@ -697,13 +697,13 @@ body:             |
     %8 = S_MOV_B32 61440
     %9 = S_MOV_B32 -1
     %10 = REG_SEQUENCE killed %7, 1, killed %5, 2, killed %9, 3, killed %8, 4
-    %11 = BUFFER_LOAD_DWORD_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `float addrspace(1)* undef`)
-    %12 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 2 from `half addrspace(1)* undef`)
+    %11 = BUFFER_LOAD_DWORD_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `float addrspace(1)* undef`)
+    %12 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 2 from `half addrspace(1)* undef`)
     %13 = V_MOV_B32_e32 305413120, implicit $exec
     %14 = V_ADD_F32_e64 0, %11, 0, %13, 0, 0, implicit $mode, implicit $exec
     %15 = V_ADD_F16_e64 0, %12, 0, %13, 0, 0, implicit $mode, implicit $exec
-    BUFFER_STORE_DWORD_OFFSET killed %14, %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `float addrspace(1)* undef`)
-    BUFFER_STORE_SHORT_OFFSET killed %15, %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 2 into `half addrspace(1)* undef`)
+    BUFFER_STORE_DWORD_OFFSET killed %14, %10, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `float addrspace(1)* undef`)
+    BUFFER_STORE_SHORT_OFFSET killed %15, %10, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 2 into `half addrspace(1)* undef`)
     S_ENDPGM 0
 
 ...

diff  --git a/llvm/test/CodeGen/AMDGPU/fold-immediate-output-mods.mir b/llvm/test/CodeGen/AMDGPU/fold-immediate-output-mods.mir
index 08fbe48c364f..3745509a45e3 100644
--- a/llvm/test/CodeGen/AMDGPU/fold-immediate-output-mods.mir
+++ b/llvm/test/CodeGen/AMDGPU/fold-immediate-output-mods.mir
@@ -60,13 +60,13 @@ body:             |
     %17 = REG_SEQUENCE killed %6, 17, %13, 18
     %18 = REG_SEQUENCE killed %4, 17, %13, 18
     %20 = COPY %29
-    %19 = BUFFER_LOAD_DWORD_ADDR64 %20, killed %14, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    %19 = BUFFER_LOAD_DWORD_ADDR64 %20, killed %14, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     %22 = COPY %29
-    %21 = BUFFER_LOAD_DWORD_ADDR64 %22, killed %17, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    %21 = BUFFER_LOAD_DWORD_ADDR64 %22, killed %17, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     %23 = V_MOV_B32_e32 1090519040, implicit $exec
     %24 = nofpexcept V_MAC_F32_e64 0, killed %19, 0, killed %21, 0, %23, 1, 0, implicit $mode, implicit $exec
     %26 = COPY %29
-    BUFFER_STORE_DWORD_ADDR64 killed %24, %26, killed %18, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    BUFFER_STORE_DWORD_ADDR64 killed %24, %26, killed %18, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     S_ENDPGM 0
 
 ...
@@ -131,13 +131,13 @@ body:             |
     %17 = REG_SEQUENCE killed %6, 17, %13, 18
     %18 = REG_SEQUENCE killed %4, 17, %13, 18
     %20 = COPY %29
-    %19 = BUFFER_LOAD_DWORD_ADDR64 %20, killed %14, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    %19 = BUFFER_LOAD_DWORD_ADDR64 %20, killed %14, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     %22 = COPY %29
-    %21 = BUFFER_LOAD_DWORD_ADDR64 %22, killed %17, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    %21 = BUFFER_LOAD_DWORD_ADDR64 %22, killed %17, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     %23 = V_MOV_B32_e32 1090519040, implicit $exec
     %24 = nofpexcept V_MAC_F32_e64 0, killed %19, 0, killed %21, 0, %23, 0, 2, implicit $mode, implicit $exec
     %26 = COPY %29
-    BUFFER_STORE_DWORD_ADDR64 killed %24, %26, killed %18, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    BUFFER_STORE_DWORD_ADDR64 killed %24, %26, killed %18, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     S_ENDPGM 0
 
 ...
@@ -202,13 +202,13 @@ body:             |
     %17 = REG_SEQUENCE killed %6, 17, %13, 18
     %18 = REG_SEQUENCE killed %4, 17, %13, 18
     %20 = COPY %29
-    %19 = BUFFER_LOAD_DWORD_ADDR64 %20, killed %14, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    %19 = BUFFER_LOAD_DWORD_ADDR64 %20, killed %14, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     %22 = COPY %29
-    %21 = BUFFER_LOAD_DWORD_ADDR64 %22, killed %17, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    %21 = BUFFER_LOAD_DWORD_ADDR64 %22, killed %17, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     %23 = V_MOV_B32_e32 1090519040, implicit $exec
     %24 = nofpexcept V_MAD_F32_e64 0, killed %19, 0, killed %21, 0, %23, 1, 0, implicit $mode, implicit $exec
     %26 = COPY %29
-    BUFFER_STORE_DWORD_ADDR64 killed %24, %26, killed %18, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    BUFFER_STORE_DWORD_ADDR64 killed %24, %26, killed %18, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     S_ENDPGM 0
 
 ...
@@ -273,13 +273,13 @@ body:             |
     %17 = REG_SEQUENCE killed %6, 17, %13, 18
     %18 = REG_SEQUENCE killed %4, 17, %13, 18
     %20 = COPY %29
-    %19 = BUFFER_LOAD_DWORD_ADDR64 %20, killed %14, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    %19 = BUFFER_LOAD_DWORD_ADDR64 %20, killed %14, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     %22 = COPY %29
-    %21 = BUFFER_LOAD_DWORD_ADDR64 %22, killed %17, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    %21 = BUFFER_LOAD_DWORD_ADDR64 %22, killed %17, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     %23 = V_MOV_B32_e32 1090519040, implicit $exec
     %24 = nofpexcept V_MAD_F32_e64 0, killed %19, 0, killed %21, 0, %23, 0, 1, implicit $mode, implicit $exec
     %26 = COPY %29
-    BUFFER_STORE_DWORD_ADDR64 killed %24, %26, killed %18, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    BUFFER_STORE_DWORD_ADDR64 killed %24, %26, killed %18, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     S_ENDPGM 0
 
 ...

diff  --git a/llvm/test/CodeGen/AMDGPU/fold-multiple.mir b/llvm/test/CodeGen/AMDGPU/fold-multiple.mir
index 1134b9a84302..fd3b88118288 100644
--- a/llvm/test/CodeGen/AMDGPU/fold-multiple.mir
+++ b/llvm/test/CodeGen/AMDGPU/fold-multiple.mir
@@ -34,7 +34,7 @@ body:             |
     %3 = S_LSHL_B32 %1, killed %1, implicit-def dead $scc
     %4 = V_AND_B32_e64 killed %2, killed %3, implicit $exec
     %5 = IMPLICIT_DEF
-    BUFFER_STORE_DWORD_OFFSET killed %4, killed %5, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    BUFFER_STORE_DWORD_OFFSET killed %4, killed %5, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     S_ENDPGM 0
 
 ...

diff  --git a/llvm/test/CodeGen/AMDGPU/fp-atomic-to-s_denormmode.mir b/llvm/test/CodeGen/AMDGPU/fp-atomic-to-s_denormmode.mir
index d09aa9aa3d20..e582acae4203 100644
--- a/llvm/test/CodeGen/AMDGPU/fp-atomic-to-s_denormmode.mir
+++ b/llvm/test/CodeGen/AMDGPU/fp-atomic-to-s_denormmode.mir
@@ -8,7 +8,7 @@
 name:            flat_atomic_fcmpswap_to_s_denorm_mode
 body:            |
   bb.0:
-    FLAT_ATOMIC_FCMPSWAP undef %0:vreg_64, undef %1:vreg_64, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`)
+    FLAT_ATOMIC_FCMPSWAP undef %0:vreg_64, undef %1:vreg_64, 0, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`)
     S_DENORM_MODE 0, implicit-def $mode, implicit $mode
 ...
 
@@ -20,7 +20,7 @@ body:            |
 name:            flat_atomic_fcmpswap_x2_to_s_denorm_mode
 body:            |
   bb.0:
-    FLAT_ATOMIC_FCMPSWAP_X2 undef %0:vreg_64, undef %1:vreg_128, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`)
+    FLAT_ATOMIC_FCMPSWAP_X2 undef %0:vreg_64, undef %1:vreg_128, 0, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`)
     S_DENORM_MODE 0, implicit-def $mode, implicit $mode
 ...
 
@@ -32,7 +32,7 @@ body:            |
 name:            flat_atomic_fmax_to_s_denorm_mode
 body:            |
   bb.0:
-    FLAT_ATOMIC_FMAX undef %0:vreg_64, undef %1:vgpr_32, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`)
+    FLAT_ATOMIC_FMAX undef %0:vreg_64, undef %1:vgpr_32, 0, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`)
     S_DENORM_MODE 0, implicit-def $mode, implicit $mode
 ...
 
@@ -44,7 +44,7 @@ body:            |
 name:            flat_atomic_fmax_x2_to_s_denorm_mode
 body:            |
   bb.0:
-    FLAT_ATOMIC_FMAX_X2 undef %0:vreg_64, undef %1:vreg_64, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`)
+    FLAT_ATOMIC_FMAX_X2 undef %0:vreg_64, undef %1:vreg_64, 0, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`)
     S_DENORM_MODE 0, implicit-def $mode, implicit $mode
 ...
 
@@ -56,7 +56,7 @@ body:            |
 name:            flat_atomic_fmin_to_s_denorm_mode
 body:            |
   bb.0:
-    FLAT_ATOMIC_FMIN undef %0:vreg_64, undef %1:vgpr_32, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`)
+    FLAT_ATOMIC_FMIN undef %0:vreg_64, undef %1:vgpr_32, 0, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`)
     S_DENORM_MODE 0, implicit-def $mode, implicit $mode
 ...
 
@@ -68,7 +68,7 @@ body:            |
 name:            flat_atomic_fmin_x2_to_s_denorm_mode
 body:            |
   bb.0:
-    FLAT_ATOMIC_FMIN_X2 undef %0:vreg_64, undef %1:vreg_64, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`)
+    FLAT_ATOMIC_FMIN_X2 undef %0:vreg_64, undef %1:vreg_64, 0, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`)
     S_DENORM_MODE 0, implicit-def $mode, implicit $mode
 ...
 
@@ -80,7 +80,7 @@ body:            |
 name:            flat_atomic_fcmpswap_x2_rtn_to_s_denorm_mode
 body:            |
   bb.0:
-    %2:vreg_64 = FLAT_ATOMIC_FCMPSWAP_X2_RTN undef %0:vreg_64, undef %1:vreg_128, 0, -1, 0, implicit $exec, implicit $flat_scr :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`)
+    %2:vreg_64 = FLAT_ATOMIC_FCMPSWAP_X2_RTN undef %0:vreg_64, undef %1:vreg_128, 0, -1, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`)
     S_DENORM_MODE 0, implicit-def $mode, implicit $mode
 ...
 
@@ -92,7 +92,7 @@ body:            |
 name:            flat_atomic_fmax_rtn_to_s_denorm_mode
 body:            |
   bb.0:
-    %2:vgpr_32 = FLAT_ATOMIC_FMAX_RTN undef %0:vreg_64, undef %1:vgpr_32, 0, -1, 0, implicit $exec, implicit $flat_scr :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`)
+    %2:vgpr_32 = FLAT_ATOMIC_FMAX_RTN undef %0:vreg_64, undef %1:vgpr_32, 0, -1, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`)
     S_DENORM_MODE 0, implicit-def $mode, implicit $mode
 ...
 
@@ -104,7 +104,7 @@ body:            |
 name:            flat_atomic_fmax_x2_rtn_to_s_denorm_mode
 body:            |
   bb.0:
-    %2:vreg_64 = FLAT_ATOMIC_FMAX_X2_RTN undef %0:vreg_64, undef %1:vreg_64, 0, -1, 0, implicit $exec, implicit $flat_scr :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`)
+    %2:vreg_64 = FLAT_ATOMIC_FMAX_X2_RTN undef %0:vreg_64, undef %1:vreg_64, 0, -1, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`)
     S_DENORM_MODE 0, implicit-def $mode, implicit $mode
 ...
 
@@ -116,7 +116,7 @@ body:            |
 name:            flat_atomic_fmin_rtn_to_s_denorm_mode
 body:            |
   bb.0:
-    %2:vgpr_32 = FLAT_ATOMIC_FMIN_RTN undef %0:vreg_64, undef %1:vgpr_32, 0, -1, 0, implicit $exec, implicit $flat_scr :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`)
+    %2:vgpr_32 = FLAT_ATOMIC_FMIN_RTN undef %0:vreg_64, undef %1:vgpr_32, 0, -1, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`)
     S_DENORM_MODE 0, implicit-def $mode, implicit $mode
 ...
 
@@ -128,7 +128,7 @@ body:            |
 name:            flat_atomic_fmin_x2_rtn_to_s_denorm_mode
 body:            |
   bb.0:
-    %2:vreg_64 = FLAT_ATOMIC_FMIN_X2_RTN undef %0:vreg_64, undef %1:vreg_64, 0, -1, 0, implicit $exec, implicit $flat_scr :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`)
+    %2:vreg_64 = FLAT_ATOMIC_FMIN_X2_RTN undef %0:vreg_64, undef %1:vreg_64, 0, -1, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`)
     S_DENORM_MODE 0, implicit-def $mode, implicit $mode
 ...
 
@@ -140,7 +140,7 @@ body:            |
 name:            flat_atomic_fcmpswap_rtn_to_s_denorm_mode
 body:            |
   bb.0:
-    %2:vgpr_32 = FLAT_ATOMIC_FCMPSWAP_RTN undef %0:vreg_64, undef %1:vreg_64, 0, -1, 0, implicit $exec, implicit $flat_scr :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`)
+    %2:vgpr_32 = FLAT_ATOMIC_FCMPSWAP_RTN undef %0:vreg_64, undef %1:vreg_64, 0, -1, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`)
     S_DENORM_MODE 0, implicit-def $mode, implicit $mode
 ...
 
@@ -152,7 +152,7 @@ body:            |
 name:            global_atomic_fcmpswap_to_s_denorm_mode
 body:            |
   bb.0:
-    GLOBAL_ATOMIC_FCMPSWAP undef %0:vreg_64, undef %1:vgpr_32, 0, 0, implicit $exec :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`)
+    GLOBAL_ATOMIC_FCMPSWAP undef %0:vreg_64, undef %1:vgpr_32, 0, 0, 0, implicit $exec :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`)
     S_DENORM_MODE 0, implicit-def $mode, implicit $mode
 ...
 
@@ -164,7 +164,7 @@ body:            |
 name:            global_atomic_fcmpswap_x2_to_s_denorm_mode
 body:            |
   bb.0:
-    GLOBAL_ATOMIC_FCMPSWAP_X2 undef %0:vreg_64, undef %1:vreg_64, 0, 0, implicit $exec :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`)
+    GLOBAL_ATOMIC_FCMPSWAP_X2 undef %0:vreg_64, undef %1:vreg_64, 0, 0, 0, implicit $exec :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`)
     S_DENORM_MODE 0, implicit-def $mode, implicit $mode
 ...
 
@@ -176,7 +176,7 @@ body:            |
 name:            global_atomic_fmax_to_s_denorm_mode
 body:            |
   bb.0:
-    GLOBAL_ATOMIC_FMAX undef %0:vreg_64, undef %1:vgpr_32, 0, 0, implicit $exec :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`)
+    GLOBAL_ATOMIC_FMAX undef %0:vreg_64, undef %1:vgpr_32, 0, 0, 0, implicit $exec :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`)
     S_DENORM_MODE 0, implicit-def $mode, implicit $mode
 ...
 
@@ -188,7 +188,7 @@ body:            |
 name:            global_atomic_fmax_x2_to_s_denorm_mode
 body:            |
   bb.0:
-    GLOBAL_ATOMIC_FMAX_X2 undef %0:vreg_64, undef %1:vreg_64, 0, 0, implicit $exec :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`)
+    GLOBAL_ATOMIC_FMAX_X2 undef %0:vreg_64, undef %1:vreg_64, 0, 0, 0, implicit $exec :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`)
     S_DENORM_MODE 0, implicit-def $mode, implicit $mode
 ...
 
@@ -200,7 +200,7 @@ body:            |
 name:            global_atomic_fmin_to_s_denorm_mode
 body:            |
   bb.0:
-    GLOBAL_ATOMIC_FMIN undef %0:vreg_64, undef %1:vgpr_32, 0, 0, implicit $exec :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`)
+    GLOBAL_ATOMIC_FMIN undef %0:vreg_64, undef %1:vgpr_32, 0, 0, 0, implicit $exec :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`)
     S_DENORM_MODE 0, implicit-def $mode, implicit $mode
 ...
 
@@ -212,7 +212,7 @@ body:            |
 name:            global_atomic_fmin_x2_to_s_denorm_mode
 body:            |
   bb.0:
-    GLOBAL_ATOMIC_FMIN_X2 undef %0:vreg_64, undef %1:vreg_64, 0, 0, implicit $exec :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`)
+    GLOBAL_ATOMIC_FMIN_X2 undef %0:vreg_64, undef %1:vreg_64, 0, 0, 0, implicit $exec :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`)
     S_DENORM_MODE 0, implicit-def $mode, implicit $mode
 ...
 
@@ -224,7 +224,7 @@ body:            |
 name:            global_atomic_fcmpswap_rtn_to_s_denorm_mode
 body:            |
   bb.0:
-    %2:vgpr_32 = GLOBAL_ATOMIC_FCMPSWAP_RTN undef %0:vreg_64, undef %1:vgpr_32, 0, -1, 0, implicit $exec :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`)
+    %2:vgpr_32 = GLOBAL_ATOMIC_FCMPSWAP_RTN undef %0:vreg_64, undef %1:vgpr_32, 0, -1, 0, 0, implicit $exec :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`)
     S_DENORM_MODE 0, implicit-def $mode, implicit $mode
 ...
 
@@ -236,7 +236,7 @@ body:            |
 name:            global_atomic_fcmpswap_x2_rtn_to_s_denorm_mode
 body:            |
   bb.0:
-    %2:vreg_64 = GLOBAL_ATOMIC_FCMPSWAP_X2_RTN undef %0:vreg_64, undef %1:vreg_64, 0, -1, 0, implicit $exec :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`)
+    %2:vreg_64 = GLOBAL_ATOMIC_FCMPSWAP_X2_RTN undef %0:vreg_64, undef %1:vreg_64, 0, -1, 0, 0, implicit $exec :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`)
     S_DENORM_MODE 0, implicit-def $mode, implicit $mode
 ...
 
@@ -248,7 +248,7 @@ body:            |
 name:            global_atomic_fmax_rtn_to_s_denorm_mode
 body:            |
   bb.0:
-    %2:vgpr_32 = GLOBAL_ATOMIC_FMAX_RTN undef %0:vreg_64, undef %1:vgpr_32, 0, -1, 0, implicit $exec :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`)
+    %2:vgpr_32 = GLOBAL_ATOMIC_FMAX_RTN undef %0:vreg_64, undef %1:vgpr_32, 0, -1, 0, 0, implicit $exec :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`)
     S_DENORM_MODE 0, implicit-def $mode, implicit $mode
 ...
 
@@ -260,7 +260,7 @@ body:            |
 name:            global_atomic_fmax_x2_rtn_to_s_denorm_mode
 body:            |
   bb.0:
-    %2:vreg_64 = GLOBAL_ATOMIC_FMAX_X2_RTN undef %0:vreg_64, undef %1:vreg_64, 0, -1, 0, implicit $exec :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`)
+    %2:vreg_64 = GLOBAL_ATOMIC_FMAX_X2_RTN undef %0:vreg_64, undef %1:vreg_64, 0, -1, 0, 0, implicit $exec :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`)
     S_DENORM_MODE 0, implicit-def $mode, implicit $mode
 ...
 
@@ -272,7 +272,7 @@ body:            |
 name:            global_atomic_fmin_rtn_to_s_denorm_mode
 body:            |
   bb.0:
-    %2:vgpr_32 = GLOBAL_ATOMIC_FMIN_RTN undef %0:vreg_64, undef %1:vgpr_32, 0, -1, 0, implicit $exec :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`)
+    %2:vgpr_32 = GLOBAL_ATOMIC_FMIN_RTN undef %0:vreg_64, undef %1:vgpr_32, 0, -1, 0, 0, implicit $exec :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`)
     S_DENORM_MODE 0, implicit-def $mode, implicit $mode
 ...
 
@@ -284,7 +284,7 @@ body:            |
 name:            global_atomic_fmin_x2_rtn_to_s_denorm_mode
 body:            |
   bb.0:
-    %2:vreg_64 = GLOBAL_ATOMIC_FMIN_X2_RTN undef %0:vreg_64, undef %1:vreg_64, 0, -1, 0, implicit $exec :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`)
+    %2:vreg_64 = GLOBAL_ATOMIC_FMIN_X2_RTN undef %0:vreg_64, undef %1:vreg_64, 0, -1, 0, 0, implicit $exec :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`)
     S_DENORM_MODE 0, implicit-def $mode, implicit $mode
 ...
 
@@ -296,7 +296,7 @@ body:            |
 name:            global_atomic_fcmpswap_saddr_to_s_denorm_mode
 body:            |
   bb.0:
-    GLOBAL_ATOMIC_FCMPSWAP_SADDR undef %0:vgpr_32, undef %1:vgpr_32, undef %3:sgpr_64, 0, 0, implicit $exec :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`)
+    GLOBAL_ATOMIC_FCMPSWAP_SADDR undef %0:vgpr_32, undef %1:vgpr_32, undef %3:sgpr_64, 0, 0, 0, implicit $exec :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`)
     S_DENORM_MODE 0, implicit-def $mode, implicit $mode
 ...
 
@@ -308,7 +308,7 @@ body:            |
 name:            global_atomic_fcmpswap_x2_saddr_rtn_to_s_denorm_mode
 body:            |
   bb.0:
-    %2:vreg_64 = GLOBAL_ATOMIC_FCMPSWAP_X2_SADDR_RTN undef %0:vgpr_32, undef %1:vreg_64, undef %3:sgpr_64, 0, -1, 0, implicit $exec :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`)
+    %2:vreg_64 = GLOBAL_ATOMIC_FCMPSWAP_X2_SADDR_RTN undef %0:vgpr_32, undef %1:vreg_64, undef %3:sgpr_64, 0, -1, 0, 0, implicit $exec :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`)
     S_DENORM_MODE 0, implicit-def $mode, implicit $mode
 ...
 
@@ -320,7 +320,7 @@ body:            |
 name:            global_atomic_fmax_saddr_rtn_to_s_denorm_mode
 body:            |
   bb.0:
-    %2:vgpr_32 = GLOBAL_ATOMIC_FMAX_SADDR_RTN undef %0:vgpr_32, undef %1:vgpr_32, undef %3:sgpr_64, 0, -1, 0, implicit $exec :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`)
+    %2:vgpr_32 = GLOBAL_ATOMIC_FMAX_SADDR_RTN undef %0:vgpr_32, undef %1:vgpr_32, undef %3:sgpr_64, 0, -1, 0, 0, implicit $exec :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`)
     S_DENORM_MODE 0, implicit-def $mode, implicit $mode
 ...
 
@@ -332,7 +332,7 @@ body:            |
 name:            global_atomic_fmax_x2_saddr_rtn_to_s_denorm_mode
 body:            |
   bb.0:
-    %2:vreg_64 = GLOBAL_ATOMIC_FMAX_X2_SADDR_RTN undef %0:vgpr_32, undef %1:vreg_64, undef %3:sgpr_64, 0, -1, 0, implicit $exec :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`)
+    %2:vreg_64 = GLOBAL_ATOMIC_FMAX_X2_SADDR_RTN undef %0:vgpr_32, undef %1:vreg_64, undef %3:sgpr_64, 0, -1, 0, 0, implicit $exec :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`)
     S_DENORM_MODE 0, implicit-def $mode, implicit $mode
 ...
 
@@ -344,7 +344,7 @@ body:            |
 name:            global_atomic_fmin_saddr_rtn_to_s_denorm_mode
 body:            |
   bb.0:
-    %2:vgpr_32 = GLOBAL_ATOMIC_FMIN_SADDR_RTN undef %0:vgpr_32, undef %1:vgpr_32, undef %3:sgpr_64, 0, -1, 0, implicit $exec :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`)
+    %2:vgpr_32 = GLOBAL_ATOMIC_FMIN_SADDR_RTN undef %0:vgpr_32, undef %1:vgpr_32, undef %3:sgpr_64, 0, -1, 0, 0, implicit $exec :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`)
     S_DENORM_MODE 0, implicit-def $mode, implicit $mode
 ...
 
@@ -356,7 +356,7 @@ body:            |
 name:            global_atomic_fmin_x2_saddr_rtn_to_s_denorm_mode
 body:            |
   bb.0:
-    %2:vreg_64 = GLOBAL_ATOMIC_FMIN_X2_SADDR_RTN undef %0:vgpr_32, undef %1:vreg_64, undef %3:sgpr_64, 0, -1, 0, implicit $exec :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`)
+    %2:vreg_64 = GLOBAL_ATOMIC_FMIN_X2_SADDR_RTN undef %0:vgpr_32, undef %1:vreg_64, undef %3:sgpr_64, 0, -1, 0, 0, implicit $exec :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`)
     S_DENORM_MODE 0, implicit-def $mode, implicit $mode
 ...
 
@@ -368,7 +368,7 @@ body:            |
 name:            flat_fp_atomic_to_s_denorm_mode_waitcnt
 body:            |
   bb.0:
-    FLAT_ATOMIC_FMIN undef %0:vreg_64, undef %1:vgpr_32, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`)
+    FLAT_ATOMIC_FMIN undef %0:vreg_64, undef %1:vgpr_32, 0, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`)
     S_WAITCNT 0
     S_DENORM_MODE 0, implicit-def $mode, implicit $mode
 ...
@@ -381,7 +381,7 @@ body:            |
 name:            flat_fp_atomic_to_s_denorm_mode_valu
 body:            |
   bb.0:
-    FLAT_ATOMIC_FMIN undef %0:vreg_64, undef %1:vgpr_32, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`)
+    FLAT_ATOMIC_FMIN undef %0:vreg_64, undef %1:vgpr_32, 0, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`)
     %2:vgpr_32 = V_ADD_F32_e32 undef %1:vgpr_32, undef %1:vgpr_32, implicit $mode, implicit $exec
     S_DENORM_MODE 0, implicit-def $mode, implicit $mode
 ...

diff  --git a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll
new file mode 100644
index 000000000000..691bc44e8bb5
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll
@@ -0,0 +1,737 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -march=amdgcn -mcpu=gfx90a -verify-machineinstrs | FileCheck %s -check-prefix=GFX90A
+
+declare double @llvm.amdgcn.buffer.atomic.fadd.f64(double, <4 x i32>, i32, i32, i1)
+declare double @llvm.amdgcn.struct.buffer.atomic.fadd.f64(double, <4 x i32>, i32, i32, i32, i32 immarg)
+declare double @llvm.amdgcn.raw.buffer.atomic.fadd.f64(double, <4 x i32>, i32, i32, i32 immarg)
+declare double @llvm.amdgcn.struct.buffer.atomic.fmin.f64(double, <4 x i32>, i32, i32, i32, i32 immarg)
+declare double @llvm.amdgcn.raw.buffer.atomic.fmin.f64(double, <4 x i32>, i32, i32, i32 immarg)
+declare double @llvm.amdgcn.struct.buffer.atomic.fmax.f64(double, <4 x i32>, i32, i32, i32, i32 immarg)
+declare double @llvm.amdgcn.raw.buffer.atomic.fmax.f64(double, <4 x i32>, i32, i32, i32 immarg)
+declare double @llvm.amdgcn.global.atomic.fadd.f64.p1f64.f64(double addrspace(1)* %ptr, double %data)
+declare double @llvm.amdgcn.global.atomic.fmin.f64.p1f64.f64(double addrspace(1)* %ptr, double %data)
+declare double @llvm.amdgcn.global.atomic.fmax.f64.p1f64.f64(double addrspace(1)* %ptr, double %data)
+declare double @llvm.amdgcn.flat.atomic.fadd.f64.p0f64.f64(double* %ptr, double %data)
+declare double @llvm.amdgcn.flat.atomic.fmin.f64.p0f64.f64(double* %ptr, double %data)
+declare double @llvm.amdgcn.flat.atomic.fmax.f64.p0f64.f64(double* %ptr, double %data)
+declare double @llvm.amdgcn.ds.fadd.f64(double addrspace(3)* nocapture, double, i32, i32, i1)
+
+define amdgpu_kernel void @buffer_atomic_add_noret_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) {
+; GFX90A-LABEL: buffer_atomic_add_noret_f64:
+; GFX90A:       ; %bb.0: ; %main_body
+; GFX90A-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX90A-NEXT:    s_load_dword s8, s[0:1], 0x3c
+; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NEXT:    v_mov_b32_e32 v2, s8
+; GFX90A-NEXT:    buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 idxen
+; GFX90A-NEXT:    s_endpgm
+main_body:
+  %ret = call double @llvm.amdgcn.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
+  ret void
+}
+
+define amdgpu_ps void @buffer_atomic_add_rtn_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) {
+; GFX90A-LABEL: buffer_atomic_add_rtn_f64:
+; GFX90A:       ; %bb.0: ; %main_body
+; GFX90A-NEXT:    buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen glc
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    flat_store_dwordx2 v[0:1], v[0:1]
+; GFX90A-NEXT:    s_endpgm
+main_body:
+  %ret = call double @llvm.amdgcn.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
+  store double %ret, double* undef
+  ret void
+}
+
+define amdgpu_kernel void @buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> inreg %rsrc, double %data, i32 %vindex, double addrspace(1)* %out) {
+; GFX90A-LABEL: buffer_atomic_add_rtn_f64_off4_slc:
+; GFX90A:       ; %bb.0: ; %main_body
+; GFX90A-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX90A-NEXT:    s_load_dword s10, s[0:1], 0x3c
+; GFX90A-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x44
+; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NEXT:    v_mov_b32_e32 v3, s10
+; GFX90A-NEXT:    buffer_atomic_add_f64 v[0:1], v3, s[4:7], 0 idxen offset:4 glc slc
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT:    s_endpgm
+main_body:
+  %ret = call double @llvm.amdgcn.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i1 1)
+  store double %ret, double addrspace(1)* %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @raw_buffer_atomic_add_noret_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) {
+; GFX90A-LABEL: raw_buffer_atomic_add_noret_f64:
+; GFX90A:       ; %bb.0: ; %main_body
+; GFX90A-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX90A-NEXT:    s_load_dword s8, s[0:1], 0x3c
+; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NEXT:    v_mov_b32_e32 v2, s8
+; GFX90A-NEXT:    buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 offen
+; GFX90A-NEXT:    s_endpgm
+main_body:
+  %ret = call double @llvm.amdgcn.raw.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0)
+  ret void
+}
+
+define amdgpu_ps void @raw_buffer_atomic_add_rtn_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) {
+; GFX90A-LABEL: raw_buffer_atomic_add_rtn_f64:
+; GFX90A:       ; %bb.0: ; %main_body
+; GFX90A-NEXT:    buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 offen glc
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    flat_store_dwordx2 v[0:1], v[0:1]
+; GFX90A-NEXT:    s_endpgm
+main_body:
+  %ret = call double @llvm.amdgcn.raw.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0)
+  store double %ret, double* undef
+  ret void
+}
+
+define amdgpu_kernel void @raw_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> inreg %rsrc, double %data, i32 %vindex, double addrspace(1)* %out) {
+; GFX90A-LABEL: raw_buffer_atomic_add_rtn_f64_off4_slc:
+; GFX90A:       ; %bb.0: ; %main_body
+; GFX90A-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX90A-NEXT:    s_load_dword s10, s[0:1], 0x3c
+; GFX90A-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x44
+; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NEXT:    v_mov_b32_e32 v3, s10
+; GFX90A-NEXT:    buffer_atomic_add_f64 v[0:1], v3, s[4:7], 4 offen glc slc
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT:    s_endpgm
+main_body:
+  %ret = call double @llvm.amdgcn.raw.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 2)
+  store double %ret, double addrspace(1)* %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @struct_buffer_atomic_add_noret_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) {
+; GFX90A-LABEL: struct_buffer_atomic_add_noret_f64:
+; GFX90A:       ; %bb.0: ; %main_body
+; GFX90A-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX90A-NEXT:    s_load_dword s8, s[0:1], 0x3c
+; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NEXT:    v_mov_b32_e32 v2, s8
+; GFX90A-NEXT:    buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 idxen
+; GFX90A-NEXT:    s_endpgm
+main_body:
+  %ret = call double @llvm.amdgcn.struct.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0)
+  ret void
+}
+
+define amdgpu_ps void @struct_buffer_atomic_add_rtn_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) {
+; GFX90A-LABEL: struct_buffer_atomic_add_rtn_f64:
+; GFX90A:       ; %bb.0: ; %main_body
+; GFX90A-NEXT:    buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen glc
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    flat_store_dwordx2 v[0:1], v[0:1]
+; GFX90A-NEXT:    s_endpgm
+main_body:
+  %ret = call double @llvm.amdgcn.struct.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0)
+  store double %ret, double* undef
+  ret void
+}
+
+define amdgpu_kernel void @struct_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> inreg %rsrc, double %data, i32 %vindex, double addrspace(1)* %out) {
+; GFX90A-LABEL: struct_buffer_atomic_add_rtn_f64_off4_slc:
+; GFX90A:       ; %bb.0: ; %main_body
+; GFX90A-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX90A-NEXT:    s_load_dword s10, s[0:1], 0x3c
+; GFX90A-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x44
+; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NEXT:    v_mov_b32_e32 v3, s10
+; GFX90A-NEXT:    buffer_atomic_add_f64 v[0:1], v3, s[4:7], 0 idxen offset:4 glc slc
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT:    s_endpgm
+main_body:
+  %ret = call double @llvm.amdgcn.struct.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 0, i32 2)
+  store double %ret, double addrspace(1)* %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @raw_buffer_atomic_min_noret_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) {
+; GFX90A-LABEL: raw_buffer_atomic_min_noret_f64:
+; GFX90A:       ; %bb.0: ; %main_body
+; GFX90A-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX90A-NEXT:    s_load_dword s8, s[0:1], 0x3c
+; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NEXT:    v_mov_b32_e32 v2, s8
+; GFX90A-NEXT:    buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 offen
+; GFX90A-NEXT:    s_endpgm
+main_body:
+  %ret = call double @llvm.amdgcn.raw.buffer.atomic.fmin.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0)
+  ret void
+}
+
+define amdgpu_ps void @raw_buffer_atomic_min_rtn_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) {
+; GFX90A-LABEL: raw_buffer_atomic_min_rtn_f64:
+; GFX90A:       ; %bb.0: ; %main_body
+; GFX90A-NEXT:    buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 offen glc
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    flat_store_dwordx2 v[0:1], v[0:1]
+; GFX90A-NEXT:    s_endpgm
+main_body:
+  %ret = call double @llvm.amdgcn.raw.buffer.atomic.fmin.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0)
+  store double %ret, double* undef
+  ret void
+}
+
+define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> inreg %rsrc, double %data, i32 %vindex, double addrspace(1)* %out) {
+; GFX90A-LABEL: raw_buffer_atomic_min_rtn_f64_off4_slc:
+; GFX90A:       ; %bb.0: ; %main_body
+; GFX90A-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX90A-NEXT:    s_load_dword s10, s[0:1], 0x3c
+; GFX90A-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x44
+; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NEXT:    v_mov_b32_e32 v3, s10
+; GFX90A-NEXT:    buffer_atomic_min_f64 v[0:1], v3, s[4:7], 4 offen glc slc
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT:    s_endpgm
+main_body:
+  %ret = call double @llvm.amdgcn.raw.buffer.atomic.fmin.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 2)
+  store double %ret, double addrspace(1)* %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @struct_buffer_atomic_min_noret_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) {
+; GFX90A-LABEL: struct_buffer_atomic_min_noret_f64:
+; GFX90A:       ; %bb.0: ; %main_body
+; GFX90A-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX90A-NEXT:    s_load_dword s8, s[0:1], 0x3c
+; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NEXT:    v_mov_b32_e32 v2, s8
+; GFX90A-NEXT:    buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 idxen
+; GFX90A-NEXT:    s_endpgm
+main_body:
+  %ret = call double @llvm.amdgcn.struct.buffer.atomic.fmin.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0)
+  ret void
+}
+
+define amdgpu_ps void @struct_buffer_atomic_min_rtn_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) {
+; GFX90A-LABEL: struct_buffer_atomic_min_rtn_f64:
+; GFX90A:       ; %bb.0: ; %main_body
+; GFX90A-NEXT:    buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 idxen glc
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    flat_store_dwordx2 v[0:1], v[0:1]
+; GFX90A-NEXT:    s_endpgm
+main_body:
+  %ret = call double @llvm.amdgcn.struct.buffer.atomic.fmin.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0)
+  store double %ret, double* undef
+  ret void
+}
+
+define amdgpu_kernel void @struct_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> inreg %rsrc, double %data, i32 %vindex, double addrspace(1)* %out) {
+; GFX90A-LABEL: struct_buffer_atomic_min_rtn_f64_off4_slc:
+; GFX90A:       ; %bb.0: ; %main_body
+; GFX90A-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX90A-NEXT:    s_load_dword s10, s[0:1], 0x3c
+; GFX90A-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x44
+; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NEXT:    v_mov_b32_e32 v3, s10
+; GFX90A-NEXT:    buffer_atomic_min_f64 v[0:1], v3, s[4:7], 0 idxen offset:4 glc slc
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT:    s_endpgm
+main_body:
+  %ret = call double @llvm.amdgcn.struct.buffer.atomic.fmin.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 0, i32 2)
+  store double %ret, double addrspace(1)* %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @raw_buffer_atomic_max_noret_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) {
+; GFX90A-LABEL: raw_buffer_atomic_max_noret_f64:
+; GFX90A:       ; %bb.0: ; %main_body
+; GFX90A-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX90A-NEXT:    s_load_dword s8, s[0:1], 0x3c
+; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NEXT:    v_mov_b32_e32 v2, s8
+; GFX90A-NEXT:    buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 offen
+; GFX90A-NEXT:    s_endpgm
+main_body:
+  %ret = call double @llvm.amdgcn.raw.buffer.atomic.fmax.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0)
+  ret void
+}
+
+define amdgpu_ps void @raw_buffer_atomic_max_rtn_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) {
+; GFX90A-LABEL: raw_buffer_atomic_max_rtn_f64:
+; GFX90A:       ; %bb.0: ; %main_body
+; GFX90A-NEXT:    buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 offen glc
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    flat_store_dwordx2 v[0:1], v[0:1]
+; GFX90A-NEXT:    s_endpgm
+main_body:
+  %ret = call double @llvm.amdgcn.raw.buffer.atomic.fmax.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0)
+  store double %ret, double* undef
+  ret void
+}
+
+define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> inreg %rsrc, double %data, i32 %vindex, double addrspace(1)* %out) {
+; GFX90A-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc:
+; GFX90A:       ; %bb.0: ; %main_body
+; GFX90A-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX90A-NEXT:    s_load_dword s10, s[0:1], 0x3c
+; GFX90A-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x44
+; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NEXT:    v_mov_b32_e32 v3, s10
+; GFX90A-NEXT:    buffer_atomic_max_f64 v[0:1], v3, s[4:7], 4 offen glc slc
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT:    s_endpgm
+main_body:
+  %ret = call double @llvm.amdgcn.raw.buffer.atomic.fmax.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 2)
+  store double %ret, double addrspace(1)* %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @struct_buffer_atomic_max_noret_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) {
+; GFX90A-LABEL: struct_buffer_atomic_max_noret_f64:
+; GFX90A:       ; %bb.0: ; %main_body
+; GFX90A-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX90A-NEXT:    s_load_dword s8, s[0:1], 0x3c
+; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NEXT:    v_mov_b32_e32 v2, s8
+; GFX90A-NEXT:    buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 idxen
+; GFX90A-NEXT:    s_endpgm
+main_body:
+  %ret = call double @llvm.amdgcn.struct.buffer.atomic.fmax.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0)
+  ret void
+}
+
+define amdgpu_ps void @struct_buffer_atomic_max_rtn_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) {
+; GFX90A-LABEL: struct_buffer_atomic_max_rtn_f64:
+; GFX90A:       ; %bb.0: ; %main_body
+; GFX90A-NEXT:    buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen glc
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    flat_store_dwordx2 v[0:1], v[0:1]
+; GFX90A-NEXT:    s_endpgm
+main_body:
+  %ret = call double @llvm.amdgcn.struct.buffer.atomic.fmax.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0)
+  store double %ret, double* undef
+  ret void
+}
+
+define amdgpu_kernel void @struct_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> inreg %rsrc, double %data, i32 %vindex, double addrspace(1)* %out) {
+; GFX90A-LABEL: struct_buffer_atomic_max_rtn_f64_off4_slc:
+; GFX90A:       ; %bb.0: ; %main_body
+; GFX90A-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX90A-NEXT:    s_load_dword s10, s[0:1], 0x3c
+; GFX90A-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x44
+; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NEXT:    v_mov_b32_e32 v3, s10
+; GFX90A-NEXT:    buffer_atomic_max_f64 v[0:1], v3, s[4:7], 0 idxen offset:4 glc slc
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT:    s_endpgm
+main_body:
+  %ret = call double @llvm.amdgcn.struct.buffer.atomic.fmax.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 0, i32 2)
+  store double %ret, double addrspace(1)* %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @global_atomic_fadd_f64_noret(double addrspace(1)* %ptr, double %data) {
+; GFX90A-LABEL: global_atomic_fadd_f64_noret:
+; GFX90A:       ; %bb.0: ; %main_body
+; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:    v_mov_b32_e32 v0, s2
+; GFX90A-NEXT:    v_mov_b32_e32 v1, s3
+; GFX90A-NEXT:    global_atomic_add_f64 v2, v[0:1], s[0:1]
+; GFX90A-NEXT:    s_endpgm
+main_body:
+  %ret = call double @llvm.amdgcn.global.atomic.fadd.f64.p1f64.f64(double addrspace(1)* %ptr, double %data)
+  ret void
+}
+
+define amdgpu_kernel void @global_atomic_fmin_f64_noret(double addrspace(1)* %ptr, double %data) {
+; GFX90A-LABEL: global_atomic_fmin_f64_noret:
+; GFX90A:       ; %bb.0: ; %main_body
+; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:    v_mov_b32_e32 v0, s2
+; GFX90A-NEXT:    v_mov_b32_e32 v1, s3
+; GFX90A-NEXT:    global_atomic_min_f64 v2, v[0:1], s[0:1]
+; GFX90A-NEXT:    s_endpgm
+main_body:
+  %ret = call double @llvm.amdgcn.global.atomic.fmin.f64.p1f64.f64(double addrspace(1)* %ptr, double %data)
+  ret void
+}
+
+define amdgpu_kernel void @global_atomic_fmax_f64_noret(double addrspace(1)* %ptr, double %data) {
+; GFX90A-LABEL: global_atomic_fmax_f64_noret:
+; GFX90A:       ; %bb.0: ; %main_body
+; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:    v_mov_b32_e32 v0, s2
+; GFX90A-NEXT:    v_mov_b32_e32 v1, s3
+; GFX90A-NEXT:    global_atomic_max_f64 v2, v[0:1], s[0:1]
+; GFX90A-NEXT:    s_endpgm
+main_body:
+  %ret = call double @llvm.amdgcn.global.atomic.fmax.f64.p1f64.f64(double addrspace(1)* %ptr, double %data)
+  ret void
+}
+
+define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(double addrspace(1)* %ptr) #1 {
+; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat:
+; GFX90A:       ; %bb.0: ; %main_body
+; GFX90A-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v1, 0x40100000
+; GFX90A-NEXT:    buffer_wbl2
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    global_atomic_add_f64 v2, v[0:1], s[0:1] scc
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    buffer_invl2
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NEXT:    s_endpgm
+main_body:
+  %ret = atomicrmw fadd double addrspace(1)* %ptr, double 4.0 seq_cst
+  ret void
+}
+
+define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_flush(double addrspace(1)* %ptr) #0 {
+; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat_flush:
+; GFX90A:       ; %bb.0: ; %main_body
+; GFX90A-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX90A-NEXT:    s_mov_b64 s[2:3], 0
+; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NEXT:  BB25_1: ; %atomicrmw.start
+; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT:    v_mov_b32_e32 v6, 0
+; GFX90A-NEXT:    v_add_f64 v[2:3], v[0:1], 4.0
+; GFX90A-NEXT:    v_mov_b32_e32 v5, v1
+; GFX90A-NEXT:    v_mov_b32_e32 v4, v0
+; GFX90A-NEXT:    buffer_wbl2
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    global_atomic_cmpswap_x2 v[2:3], v6, v[2:5], s[0:1] glc scc
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    buffer_invl2
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NEXT:    v_cmp_eq_u64_e32 vcc, v[2:3], v[0:1]
+; GFX90A-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[0,1]
+; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX90A-NEXT:    s_cbranch_execnz BB25_1
+; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT:    s_endpgm
+main_body:
+  %ret = atomicrmw fadd double addrspace(1)* %ptr, double 4.0 seq_cst
+  ret void
+}
+
+define double @global_atomic_fadd_f64_rtn(double addrspace(1)* %ptr, double %data) {
+; GFX90A-LABEL: global_atomic_fadd_f64_rtn:
+; GFX90A:       ; %bb.0: ; %main_body
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off glc
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+main_body:
+  %ret = call double @llvm.amdgcn.global.atomic.fadd.f64.p1f64.f64(double addrspace(1)* %ptr, double %data)
+  ret double %ret
+}
+
+define double @global_atomic_fadd_f64_rtn_pat(double addrspace(1)* %ptr, double %data) #1 {
+; GFX90A-LABEL: global_atomic_fadd_f64_rtn_pat:
+; GFX90A:       ; %bb.0: ; %main_body
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v3, 0x40100000
+; GFX90A-NEXT:    buffer_wbl2
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off glc scc
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    buffer_invl2
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+main_body:
+  %ret = atomicrmw fadd double addrspace(1)* %ptr, double 4.0 seq_cst
+  ret double %ret
+}
+
+define double @global_atomic_fmax_f64_rtn(double addrspace(1)* %ptr, double %data) {
+; GFX90A-LABEL: global_atomic_fmax_f64_rtn:
+; GFX90A:       ; %bb.0: ; %main_body
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off glc
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+main_body:
+  %ret = call double @llvm.amdgcn.global.atomic.fmax.f64.p1f64.f64(double addrspace(1)* %ptr, double %data)
+  ret double %ret
+}
+
+define double @global_atomic_fmin_f64_rtn(double addrspace(1)* %ptr, double %data) {
+; GFX90A-LABEL: global_atomic_fmin_f64_rtn:
+; GFX90A:       ; %bb.0: ; %main_body
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off glc
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+main_body:
+  %ret = call double @llvm.amdgcn.global.atomic.fmin.f64.p1f64.f64(double addrspace(1)* %ptr, double %data)
+  ret double %ret
+}
+
+define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat(double* %ptr) #1 {
+; GFX90A-LABEL: flat_atomic_fadd_f64_noret_pat:
+; GFX90A:       ; %bb.0: ; %main_body
+; GFX90A-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v3, 0x40100000
+; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NEXT:    buffer_wbl2
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    flat_atomic_add_f64 v[0:1], v[2:3] scc
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    buffer_invl2
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NEXT:    s_endpgm
+main_body:
+  %ret = atomicrmw fadd double* %ptr, double 4.0 seq_cst
+  ret void
+}
+
+define double @flat_atomic_fadd_f64_rtn_pat(double* %ptr) #1 {
+; GFX90A-LABEL: flat_atomic_fadd_f64_rtn_pat:
+; GFX90A:       ; %bb.0: ; %main_body
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v3, 0x40100000
+; GFX90A-NEXT:    buffer_wbl2
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] glc scc
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    buffer_invl2
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+main_body:
+  %ret = atomicrmw fadd double* %ptr, double 4.0 seq_cst
+  ret double %ret
+}
+
+define amdgpu_kernel void @flat_atomic_fadd_f64_noret(double* %ptr, double %data) {
+; GFX90A-LABEL: flat_atomic_fadd_f64_noret:
+; GFX90A:       ; %bb.0: ; %main_body
+; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NEXT:    v_mov_b32_e32 v3, s3
+; GFX90A-NEXT:    flat_atomic_add_f64 v[0:1], v[2:3]
+; GFX90A-NEXT:    s_endpgm
+main_body:
+  %ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p0f64.f64(double* %ptr, double %data)
+  ret void
+}
+
+define double @flat_atomic_fadd_f64_rtn(double* %ptr, double %data) {
+; GFX90A-LABEL: flat_atomic_fadd_f64_rtn:
+; GFX90A:       ; %bb.0: ; %main_body
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] glc
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+main_body:
+  %ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p0f64.f64(double* %ptr, double %data)
+  ret double %ret
+}
+
+define amdgpu_kernel void @flat_atomic_fmin_f64_noret(double* %ptr, double %data) {
+; GFX90A-LABEL: flat_atomic_fmin_f64_noret:
+; GFX90A:       ; %bb.0: ; %main_body
+; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NEXT:    v_mov_b32_e32 v3, s3
+; GFX90A-NEXT:    flat_atomic_min_f64 v[0:1], v[2:3]
+; GFX90A-NEXT:    s_endpgm
+main_body:
+  %ret = call double @llvm.amdgcn.flat.atomic.fmin.f64.p0f64.f64(double* %ptr, double %data)
+  ret void
+}
+
+define double @flat_atomic_fmin_f64_rtn(double* %ptr, double %data) {
+; GFX90A-LABEL: flat_atomic_fmin_f64_rtn:
+; GFX90A:       ; %bb.0: ; %main_body
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    flat_atomic_min_f64 v[0:1], v[0:1], v[2:3] glc
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+main_body:
+  %ret = call double @llvm.amdgcn.flat.atomic.fmin.f64.p0f64.f64(double* %ptr, double %data)
+  ret double %ret
+}
+
+define amdgpu_kernel void @flat_atomic_fmax_f64_noret(double* %ptr, double %data) {
+; GFX90A-LABEL: flat_atomic_fmax_f64_noret:
+; GFX90A:       ; %bb.0: ; %main_body
+; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NEXT:    v_mov_b32_e32 v3, s3
+; GFX90A-NEXT:    flat_atomic_max_f64 v[0:1], v[2:3]
+; GFX90A-NEXT:    s_endpgm
+main_body:
+  %ret = call double @llvm.amdgcn.flat.atomic.fmax.f64.p0f64.f64(double* %ptr, double %data)
+  ret void
+}
+
+define double @flat_atomic_fmax_f64_rtn(double* %ptr, double %data) {
+; GFX90A-LABEL: flat_atomic_fmax_f64_rtn:
+; GFX90A:       ; %bb.0: ; %main_body
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    flat_atomic_max_f64 v[0:1], v[0:1], v[2:3] glc
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+main_body:
+  %ret = call double @llvm.amdgcn.flat.atomic.fmax.f64.p0f64.f64(double* %ptr, double %data)
+  ret double %ret
+}
+
+define amdgpu_kernel void @local_atomic_fadd_f64_noret(double addrspace(3)* %ptr, double %data) {
+; GFX90A-LABEL: local_atomic_fadd_f64_noret:
+; GFX90A:       ; %bb.0: ; %main_body
+; GFX90A-NEXT:    s_load_dword s4, s[0:1], 0x24
+; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
+; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NEXT:    ds_add_f64 v2, v[0:1]
+; GFX90A-NEXT:    s_endpgm
+main_body:
+  %ret = call double @llvm.amdgcn.ds.fadd.f64(double addrspace(3)* %ptr, double %data, i32 0, i32 0, i1 0)
+  ret void
+}
+
+define double @local_atomic_fadd_f64_rtn(double addrspace(3)* %ptr, double %data) {
+; GFX90A-LABEL: local_atomic_fadd_f64_rtn:
+; GFX90A:       ; %bb.0: ; %main_body
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    v_mov_b32_e32 v4, v1
+; GFX90A-NEXT:    v_mov_b32_e32 v5, v2
+; GFX90A-NEXT:    ds_add_rtn_f64 v[0:1], v0, v[4:5]
+; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+main_body:
+  %ret = call double @llvm.amdgcn.ds.fadd.f64(double addrspace(3)* %ptr, double %data, i32 0, i32 0, i1 0)
+  ret double %ret
+}
+
+define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat(double addrspace(3)* %ptr) #1 {
+; GFX90A-LABEL: local_atomic_fadd_f64_noret_pat:
+; GFX90A:       ; %bb.0: ; %main_body
+; GFX90A-NEXT:    s_load_dword s0, s[0:1], 0x24
+; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v1, 0x40100000
+; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:    v_mov_b32_e32 v2, s0
+; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:    ds_add_f64 v2, v[0:1]
+; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:    s_endpgm
+main_body:
+  %ret = atomicrmw fadd double addrspace(3)* %ptr, double 4.0 seq_cst
+  ret void
+}
+
+define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush(double addrspace(3)* %ptr) #0 {
+; GFX90A-LABEL: local_atomic_fadd_f64_noret_pat_flush:
+; GFX90A:       ; %bb.0: ; %main_body
+; GFX90A-NEXT:    s_load_dword s0, s[0:1], 0x24
+; GFX90A-NEXT:    s_mov_b64 s[2:3], 0
+; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NEXT:    ds_read_b64 v[0:1], v0
+; GFX90A-NEXT:  BB41_1: ; %atomicrmw.start
+; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:    v_add_f64 v[2:3], v[0:1], 4.0
+; GFX90A-NEXT:    v_mov_b32_e32 v4, s0
+; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:    ds_cmpst_rtn_b64 v[2:3], v4, v[0:1], v[2:3]
+; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:    v_cmp_eq_u64_e32 vcc, v[2:3], v[0:1]
+; GFX90A-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[0,1]
+; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX90A-NEXT:    s_cbranch_execnz BB41_1
+; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT:    s_endpgm
+main_body:
+  %ret = atomicrmw fadd double addrspace(3)* %ptr, double 4.0 seq_cst
+  ret void
+}
+
+define double @local_atomic_fadd_f64_rtn_pat(double addrspace(3)* %ptr, double %data) #1 {
+; GFX90A-LABEL: local_atomic_fadd_f64_rtn_pat:
+; GFX90A:       ; %bb.0: ; %main_body
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v3, 0x40100000
+; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:    ds_add_rtn_f64 v[0:1], v0, v[2:3]
+; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+main_body:
+  %ret = atomicrmw fadd double addrspace(3)* %ptr, double 4.0 seq_cst
+  ret double %ret
+}
+
+attributes #0 = { "denormal-fp-math"="preserve-sign,preserve-sign" "amdgpu-unsafe-fp-atomics"="true" }
+attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" }

diff  --git a/llvm/test/CodeGen/AMDGPU/gfx90a-enc.ll b/llvm/test/CodeGen/AMDGPU/gfx90a-enc.ll
new file mode 100644
index 000000000000..8a3119ada24a
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/gfx90a-enc.ll
@@ -0,0 +1,22 @@
+; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefixes=GFX9,GFX908 %s
+; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefixes=GFX9,GFX90A %s
+
+; GFX9-DAG:   buffer_load_format_xyzw v[{{[0-9:]+}}], v{{[0-9]+}}, s[{{[0-9:]+}}], 0 idxen ; encoding:
+; GFX9-DAG:   buffer_load_format_d16_xyzw v[{{[0-9:]+}}], v{{[0-9]+}}, s[{{[0-9:]+}}], 0 idxen ; encoding:
+; GFX908-DAG: v_mfma_i32_4x4x4i8 a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9]+}}, a[{{[0-9:]+}}] ; encoding: [{{0x..,0x0.,}}
+; GFX90A-DAG: v_mfma_i32_4x4x4i8 a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9]+}}, a[{{[0-9:]+}}] ; encoding: [{{0x..,0x8.,}}
+define amdgpu_kernel void @test(<4 x i32> %x)  {
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %r1 = tail call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %x, i32 %id, i32 0, i1 zeroext false, i1 zeroext false)
+  store volatile <4 x float> %r1, <4 x float>* undef
+  %r2 = tail call <4 x half> @llvm.amdgcn.buffer.load.format.v4f16(<4 x i32> %x, i32 %id, i32 0, i1 zeroext false, i1 zeroext false)
+  store volatile <4 x half> %r2, <4 x half>* undef
+  %r3 = tail call <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32 1, i32 2, <4 x i32> %x, i32 0, i32 0, i32 0)
+  store <4 x i32> %r3, <4 x i32>* undef
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x()
+declare <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32>, i32, i32, i1 immarg, i1 immarg)
+declare <4 x half> @llvm.amdgcn.buffer.load.format.v4f16(<4 x i32>, i32, i32, i1 immarg, i1 immarg)
+declare <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32, i32, <4 x i32>, i32, i32, i32)

diff  --git a/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll b/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll
index 3ad855833b02..d8fe585a03e8 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll
@@ -1,65 +1,346 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX900 %s
 ; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX908 %s
+; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX90A %s
 
-; GCN-LABEL: {{^}}global_atomic_fadd_ret_f32:
-; GCN: [[LOOP:BB[0-9]+_[0-9]+]]
-; GCN: v_add_f32_e32
-; GCN: global_atomic_cmpswap
-; GCN: s_andn2_b64 exec, exec,
-; GCN-NEXT: s_cbranch_execnz [[LOOP]]
 define amdgpu_kernel void @global_atomic_fadd_ret_f32(float addrspace(1)* %ptr) #0 {
+; GFX900-LABEL: global_atomic_fadd_ret_f32:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX900-NEXT:    s_mov_b64 s[2:3], 0
+; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX900-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX900-NEXT:    v_mov_b32_e32 v0, s4
+; GFX900-NEXT:  BB0_1: ; %atomicrmw.start
+; GFX900-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX900-NEXT:    v_mov_b32_e32 v1, v0
+; GFX900-NEXT:    v_mov_b32_e32 v2, 0
+; GFX900-NEXT:    v_add_f32_e32 v0, 4.0, v1
+; GFX900-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    buffer_wbinvl1_vol
+; GFX900-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX900-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX900-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX900-NEXT:    s_cbranch_execnz BB0_1
+; GFX900-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX900-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX900-NEXT:    global_store_dword v[0:1], v0, off
+; GFX900-NEXT:    s_endpgm
+;
+; GFX908-LABEL: global_atomic_fadd_ret_f32:
+; GFX908:       ; %bb.0:
+; GFX908-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX908-NEXT:    s_mov_b64 s[2:3], 0
+; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX908-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX908-NEXT:    v_mov_b32_e32 v0, s4
+; GFX908-NEXT:  BB0_1: ; %atomicrmw.start
+; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT:    v_mov_b32_e32 v1, v0
+; GFX908-NEXT:    v_mov_b32_e32 v2, 0
+; GFX908-NEXT:    v_add_f32_e32 v0, 4.0, v1
+; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    buffer_wbinvl1_vol
+; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX908-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX908-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX908-NEXT:    s_cbranch_execnz BB0_1
+; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX908-NEXT:    global_store_dword v[0:1], v0, off
+; GFX908-NEXT:    s_endpgm
+;
+; GFX90A-LABEL: global_atomic_fadd_ret_f32:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v1, 4.0
+; GFX90A-NEXT:    buffer_wbl2
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    global_atomic_add_f32 v0, v0, v1, s[0:1] glc scc
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    buffer_invl2
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NEXT:    global_store_dword v[0:1], v0, off
+; GFX90A-NEXT:    s_endpgm
   %result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 seq_cst
   store float %result, float addrspace(1)* undef
   ret void
 }
 
-; GCN-LABEL: {{^}}global_atomic_fadd_ret_f32_ieee:
-; GCN: [[LOOP:BB[0-9]+_[0-9]+]]
-; GCN: v_add_f32_e32
-; GCN: global_atomic_cmpswap
-; GCN: s_andn2_b64 exec, exec,
-; GCN-NEXT: s_cbranch_execnz [[LOOP]]
-define amdgpu_kernel void @global_atomic_fadd_ret_f32_ieee(float addrspace(1)* %ptr) {
+define amdgpu_kernel void @global_atomic_fadd_ret_f32_ieee(float addrspace(1)* %ptr) #2 {
+; GFX900-LABEL: global_atomic_fadd_ret_f32_ieee:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX900-NEXT:    s_mov_b64 s[2:3], 0
+; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX900-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX900-NEXT:    v_mov_b32_e32 v0, s4
+; GFX900-NEXT:  BB1_1: ; %atomicrmw.start
+; GFX900-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX900-NEXT:    v_mov_b32_e32 v1, v0
+; GFX900-NEXT:    v_mov_b32_e32 v2, 0
+; GFX900-NEXT:    v_add_f32_e32 v0, 4.0, v1
+; GFX900-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    buffer_wbinvl1_vol
+; GFX900-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX900-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX900-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX900-NEXT:    s_cbranch_execnz BB1_1
+; GFX900-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX900-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX900-NEXT:    global_store_dword v[0:1], v0, off
+; GFX900-NEXT:    s_endpgm
+;
+; GFX908-LABEL: global_atomic_fadd_ret_f32_ieee:
+; GFX908:       ; %bb.0:
+; GFX908-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX908-NEXT:    s_mov_b64 s[2:3], 0
+; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX908-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX908-NEXT:    v_mov_b32_e32 v0, s4
+; GFX908-NEXT:  BB1_1: ; %atomicrmw.start
+; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT:    v_mov_b32_e32 v1, v0
+; GFX908-NEXT:    v_mov_b32_e32 v2, 0
+; GFX908-NEXT:    v_add_f32_e32 v0, 4.0, v1
+; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    buffer_wbinvl1_vol
+; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX908-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX908-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX908-NEXT:    s_cbranch_execnz BB1_1
+; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX908-NEXT:    global_store_dword v[0:1], v0, off
+; GFX908-NEXT:    s_endpgm
+;
+; GFX90A-LABEL: global_atomic_fadd_ret_f32_ieee:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX90A-NEXT:    s_mov_b64 s[2:3], 0
+; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NEXT:  BB1_1: ; %atomicrmw.start
+; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NEXT:    v_add_f32_e32 v0, 4.0, v3
+; GFX90A-NEXT:    v_mov_b32_e32 v1, v3
+; GFX90A-NEXT:    buffer_wbl2
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc scc
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    buffer_invl2
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v3
+; GFX90A-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX90A-NEXT:    s_cbranch_execnz BB1_1
+; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX90A-NEXT:    global_store_dword v[0:1], v0, off
+; GFX90A-NEXT:    s_endpgm
   %result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 seq_cst
   store float %result, float addrspace(1)* undef
   ret void
 }
 
-; GCN-LABEL: {{^}}global_atomic_fadd_noret_f32:
-; GFX900: [[LOOP:BB[0-9]+_[0-9]+]]
-; GFX900: v_add_f32_e32
-; GFX900: global_atomic_cmpswap
-; GFX900: s_andn2_b64 exec, exec,
-; GFX900-NEXT: s_cbranch_execnz [[LOOP]]
-
-; GFX908-NOT: v_add_f32
-; GFX908:     global_atomic_add_f32 v{{[0-9]+}}, v{{[0-9]+}}, s
-; GFX908-NOT: s_cbranch_execnz
 define amdgpu_kernel void @global_atomic_fadd_noret_f32(float addrspace(1)* %ptr) #0 {
+; GFX900-LABEL: global_atomic_fadd_noret_f32:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX900-NEXT:    s_mov_b64 s[2:3], 0
+; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX900-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX900-NEXT:    v_mov_b32_e32 v1, s4
+; GFX900-NEXT:  BB2_1: ; %atomicrmw.start
+; GFX900-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX900-NEXT:    v_mov_b32_e32 v2, 0
+; GFX900-NEXT:    v_add_f32_e32 v0, 4.0, v1
+; GFX900-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    buffer_wbinvl1_vol
+; GFX900-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX900-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX900-NEXT:    v_mov_b32_e32 v1, v0
+; GFX900-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX900-NEXT:    s_cbranch_execnz BB2_1
+; GFX900-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX900-NEXT:    s_endpgm
+;
+; GFX908-LABEL: global_atomic_fadd_noret_f32:
+; GFX908:       ; %bb.0:
+; GFX908-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX908-NEXT:    v_mov_b32_e32 v0, 0
+; GFX908-NEXT:    v_mov_b32_e32 v1, 4.0
+; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    global_atomic_add_f32 v0, v1, s[0:1]
+; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    buffer_wbinvl1_vol
+; GFX908-NEXT:    s_endpgm
+;
+; GFX90A-LABEL: global_atomic_fadd_noret_f32:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v1, 4.0
+; GFX90A-NEXT:    buffer_wbl2
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    global_atomic_add_f32 v0, v1, s[0:1] scc
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    buffer_invl2
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NEXT:    s_endpgm
   %result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 seq_cst
   ret void
 }
 
-; GCN-LABEL: {{^}}global_atomic_fadd_noret_f32_ieee:
-; GCN: global_atomic_cmpswap
-define amdgpu_kernel void @global_atomic_fadd_noret_f32_ieee(float addrspace(1)* %ptr) {
+define amdgpu_kernel void @global_atomic_fadd_noret_f32_ieee(float addrspace(1)* %ptr) #2 {
+; GFX900-LABEL: global_atomic_fadd_noret_f32_ieee:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX900-NEXT:    s_mov_b64 s[2:3], 0
+; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX900-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX900-NEXT:    v_mov_b32_e32 v1, s4
+; GFX900-NEXT:  BB3_1: ; %atomicrmw.start
+; GFX900-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX900-NEXT:    v_mov_b32_e32 v2, 0
+; GFX900-NEXT:    v_add_f32_e32 v0, 4.0, v1
+; GFX900-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    buffer_wbinvl1_vol
+; GFX900-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX900-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX900-NEXT:    v_mov_b32_e32 v1, v0
+; GFX900-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX900-NEXT:    s_cbranch_execnz BB3_1
+; GFX900-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX900-NEXT:    s_endpgm
+;
+; GFX908-LABEL: global_atomic_fadd_noret_f32_ieee:
+; GFX908:       ; %bb.0:
+; GFX908-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX908-NEXT:    s_mov_b64 s[2:3], 0
+; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX908-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX908-NEXT:    v_mov_b32_e32 v1, s4
+; GFX908-NEXT:  BB3_1: ; %atomicrmw.start
+; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT:    v_mov_b32_e32 v2, 0
+; GFX908-NEXT:    v_add_f32_e32 v0, 4.0, v1
+; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    buffer_wbinvl1_vol
+; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX908-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX908-NEXT:    v_mov_b32_e32 v1, v0
+; GFX908-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX908-NEXT:    s_cbranch_execnz BB3_1
+; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT:    s_endpgm
+;
+; GFX90A-LABEL: global_atomic_fadd_noret_f32_ieee:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX90A-NEXT:    s_mov_b64 s[2:3], 0
+; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NEXT:  BB3_1: ; %atomicrmw.start
+; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT:    v_mov_b32_e32 v1, 0
+; GFX90A-NEXT:    v_add_f32_e32 v2, 4.0, v0
+; GFX90A-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NEXT:    buffer_wbl2
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    global_atomic_cmpswap v1, v1, v[2:3], s[0:1] glc scc
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    buffer_invl2
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v0
+; GFX90A-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX90A-NEXT:    v_mov_b32_e32 v0, v1
+; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX90A-NEXT:    s_cbranch_execnz BB3_1
+; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT:    s_endpgm
   %result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 seq_cst
   ret void
 }
 
-; Make sure this artificially selects with an incorrect subtarget, but the feature set.
-; GCN-LABEL: {{^}}global_atomic_fadd_ret_f32_wrong_subtarget:
 define amdgpu_kernel void @global_atomic_fadd_ret_f32_wrong_subtarget(float addrspace(1)* %ptr) #1 {
+; GCN-LABEL: global_atomic_fadd_ret_f32_wrong_subtarget:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT:    s_mov_b64 s[2:3], 0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v0, s4
+; GCN-NEXT:  BB4_1: ; %atomicrmw.start
+; GCN-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GCN-NEXT:    v_mov_b32_e32 v1, v0
+; GCN-NEXT:    v_mov_b32_e32 v2, 0
+; GCN-NEXT:    v_add_f32_e32 v0, 4.0, v1
+; GCN-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_wbinvl1_vol
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GCN-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GCN-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GCN-NEXT:    s_cbranch_execnz BB4_1
+; GCN-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GCN-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GCN-NEXT:    global_store_dword v[0:1], v0, off
+; GCN-NEXT:    s_endpgm
   %result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 seq_cst
   store float %result, float addrspace(1)* undef
   ret void
 }
 
-; GCN-LABEL: {{^}}global_atomic_fadd_noret_f32_wrong_subtarget:
 define amdgpu_kernel void @global_atomic_fadd_noret_f32_wrong_subtarget(float addrspace(1)* %ptr) #1 {
+; GCN-LABEL: global_atomic_fadd_noret_f32_wrong_subtarget:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT:    v_mov_b32_e32 v0, 0
+; GCN-NEXT:    v_mov_b32_e32 v1, 4.0
+; GCN-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN-NEXT:    global_atomic_add_f32 v0, v1, s[0:1]
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_wbinvl1_vol
+; GCN-NEXT:    s_endpgm
   %result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 seq_cst
   ret void
 }
 
 attributes #0 = { "denormal-fp-math-f32"="preserve-sign,preserve-sign" "amdgpu-unsafe-fp-atomics"="true" }
-attributes #1 = { "denormal-fp-math-f32"="preserve-sign,preserve-sign" "target-cpu"="gfx803" "target-features"="+atomic-fadd-insts" }
+attributes #1 = { "denormal-fp-math-f32"="preserve-sign,preserve-sign" "target-cpu"="gfx803" "target-features"="+atomic-fadd-insts" "amdgpu-unsafe-fp-atomics"="true" }
+attributes #2 = { "amdgpu-unsafe-fp-atomics"="true" }

diff  --git a/llvm/test/CodeGen/AMDGPU/hard-clauses.mir b/llvm/test/CodeGen/AMDGPU/hard-clauses.mir
index e047ae9ba380..5d283f6bcc5d 100644
--- a/llvm/test/CodeGen/AMDGPU/hard-clauses.mir
+++ b/llvm/test/CodeGen/AMDGPU/hard-clauses.mir
@@ -44,168 +44,168 @@ body: |
     ; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $vgpr0
     ; CHECK: BUNDLE implicit-def $vgpr1, implicit-def $vgpr1_lo16, implicit-def $vgpr1_hi16, implicit-def $vgpr2, implicit-def $vgpr2_lo16, implicit-def $vgpr2_hi16, implicit-def $vgpr3, implicit-def $vgpr3_lo16, implicit-def $vgpr3_hi16, implicit-def $vgpr4, implicit-def $vgpr4_lo16, implicit-def $vgpr4_hi16, implicit-def $vgpr5, implicit-def $vgpr5_lo16, implicit-def $vgpr5_hi16, implicit-def $vgpr6, implicit-def $vgpr6_lo16, implicit-def $vgpr6_hi16, implicit-def $vgpr7, implicit-def $vgpr7_lo16, implicit-def $vgpr7_hi16, implicit-def $vgpr8, implicit-def $vgpr8_lo16, implicit-def $vgpr8_hi16, implicit-def $vgpr9, implicit-def $vgpr9_lo16, implicit-def $vgpr9_hi16, implicit-def $vgpr10, implicit-def $vgpr10_lo16, implicit-def $vgpr10_hi16, implicit-def $vgpr11, implicit-def $vgpr11_lo16, implicit-def $vgpr11_hi16, implicit-def $vgpr12, implicit-def $vgpr12_lo16, implicit-def $vgpr12_hi16, implicit-def $vgpr13, implicit-def $vgpr13_lo16, implicit-def $vgpr13_hi16, implicit-def $vgpr14, implicit-def $vgpr14_lo16, implicit-def $vgpr14_hi16, implicit-def $vgpr15, implicit-def $vgpr15_lo16, implicit-def $vgpr15_hi16, implicit-def $vgpr16, implicit-def $vgpr16_lo16, implicit-def $vgpr16_hi16, implicit-def $vgpr17, implicit-def $vgpr17_lo16, implicit-def $vgpr17_hi16, implicit-def $vgpr18, implicit-def $vgpr18_lo16, implicit-def $vgpr18_hi16, implicit-def $vgpr19, implicit-def $vgpr19_lo16, implicit-def $vgpr19_hi16, implicit-def $vgpr20, implicit-def $vgpr20_lo16, implicit-def $vgpr20_hi16, implicit-def $vgpr21, implicit-def $vgpr21_lo16, implicit-def $vgpr21_hi16, implicit-def $vgpr22, implicit-def $vgpr22_lo16, implicit-def $vgpr22_hi16, implicit-def $vgpr23, implicit-def $vgpr23_lo16, implicit-def $vgpr23_hi16, implicit-def $vgpr24, implicit-def $vgpr24_lo16, implicit-def $vgpr24_hi16, implicit-def $vgpr25, implicit-def $vgpr25_lo16, implicit-def $vgpr25_hi16, implicit-def $vgpr26, implicit-def $vgpr26_lo16, implicit-def $vgpr26_hi16, implicit-def $vgpr27, implicit-def $vgpr27_lo16, implicit-def $vgpr27_hi16, implicit-def $vgpr28, implicit-def $vgpr28_lo16, implicit-def $vgpr28_hi16, implicit-def $vgpr29, implicit-def $vgpr29_lo16, implicit-def $vgpr29_hi16, implicit-def $vgpr30, implicit-def $vgpr30_lo16, implicit-def $vgpr30_hi16, implicit-def $vgpr31, implicit-def $vgpr31_lo16, implicit-def $vgpr31_hi16, implicit-def $vgpr32, implicit-def $vgpr32_lo16, implicit-def $vgpr32_hi16, implicit-def $vgpr33, implicit-def $vgpr33_lo16, implicit-def $vgpr33_hi16, implicit-def $vgpr34, implicit-def $vgpr34_lo16, implicit-def $vgpr34_hi16, implicit-def $vgpr35, implicit-def $vgpr35_lo16, implicit-def $vgpr35_hi16, implicit-def $vgpr36, implicit-def $vgpr36_lo16, implicit-def $vgpr36_hi16, implicit-def $vgpr37, implicit-def $vgpr37_lo16, implicit-def $vgpr37_hi16, implicit-def $vgpr38, implicit-def $vgpr38_lo16, implicit-def $vgpr38_hi16, implicit-def $vgpr39, implicit-def $vgpr39_lo16, implicit-def $vgpr39_hi16, implicit-def $vgpr40, implicit-def $vgpr40_lo16, implicit-def $vgpr40_hi16, implicit-def $vgpr41, implicit-def $vgpr41_lo16, implicit-def $vgpr41_hi16, implicit-def $vgpr42, implicit-def $vgpr42_lo16, implicit-def $vgpr42_hi16, implicit-def $vgpr43, implicit-def $vgpr43_lo16, implicit-def $vgpr43_hi16, implicit-def $vgpr44, implicit-def $vgpr44_lo16, implicit-def $vgpr44_hi16, implicit-def $vgpr45, implicit-def $vgpr45_lo16, implicit-def $vgpr45_hi16, implicit-def $vgpr46, implicit-def $vgpr46_lo16, implicit-def $vgpr46_hi16, implicit-def $vgpr47, implicit-def $vgpr47_lo16, implicit-def $vgpr47_hi16, implicit-def $vgpr48, implicit-def $vgpr48_lo16, implicit-def $vgpr48_hi16, implicit-def $vgpr49, implicit-def $vgpr49_lo16, implicit-def $vgpr49_hi16, implicit-def $vgpr50, implicit-def $vgpr50_lo16, implicit-def $vgpr50_hi16, implicit-def $vgpr51, implicit-def $vgpr51_lo16, implicit-def $vgpr51_hi16, implicit-def $vgpr52, implicit-def $vgpr52_lo16, implicit-def $vgpr52_hi16, implicit-def $vgpr53, implicit-def $vgpr53_lo16, implicit-def $vgpr53_hi16, implicit-def $vgpr54, implicit-def $vgpr54_lo16, implicit-def $vgpr54_hi16, implicit-def $vgpr55, implicit-def $vgpr55_lo16, implicit-def $vgpr55_hi16, implicit-def $vgpr56, implicit-def $vgpr56_lo16, implicit-def $vgpr56_hi16, implicit-def $vgpr57, implicit-def $vgpr57_lo16, implicit-def $vgpr57_hi16, implicit-def $vgpr58, implicit-def $vgpr58_lo16, implicit-def $vgpr58_hi16, implicit-def $vgpr59, implicit-def $vgpr59_lo16, implicit-def $vgpr59_hi16, implicit-def $vgpr60, implicit-def $vgpr60_lo16, implicit-def $vgpr60_hi16, implicit-def $vgpr61, implicit-def $vgpr61_lo16, implicit-def $vgpr61_hi16, implicit-def $vgpr62, implicit-def $vgpr62_lo16, implicit-def $vgpr62_hi16, implicit-def $vgpr63, implicit-def $vgpr63_lo16, implicit-def $vgpr63_hi16, implicit-def $vgpr64, implicit-def $vgpr64_lo16, implicit-def $vgpr64_hi16, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec {
     ; CHECK:   S_CLAUSE 63
-    ; CHECK:   $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, 0, 0, 0, implicit $exec
-    ; CHECK:   $vgpr2 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 8, 0, 0, 0, 0, 0, implicit $exec
-    ; CHECK:   $vgpr3 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 12, 0, 0, 0, 0, 0, implicit $exec
-    ; CHECK:   $vgpr4 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 16, 0, 0, 0, 0, 0, implicit $exec
-    ; CHECK:   $vgpr5 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 20, 0, 0, 0, 0, 0, implicit $exec
-    ; CHECK:   $vgpr6 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 24, 0, 0, 0, 0, 0, implicit $exec
-    ; CHECK:   $vgpr7 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 28, 0, 0, 0, 0, 0, implicit $exec
-    ; CHECK:   $vgpr8 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 32, 0, 0, 0, 0, 0, implicit $exec
-    ; CHECK:   $vgpr9 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 36, 0, 0, 0, 0, 0, implicit $exec
-    ; CHECK:   $vgpr10 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 40, 0, 0, 0, 0, 0, implicit $exec
-    ; CHECK:   $vgpr11 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 44, 0, 0, 0, 0, 0, implicit $exec
-    ; CHECK:   $vgpr12 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 48, 0, 0, 0, 0, 0, implicit $exec
-    ; CHECK:   $vgpr13 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 52, 0, 0, 0, 0, 0, implicit $exec
-    ; CHECK:   $vgpr14 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 56, 0, 0, 0, 0, 0, implicit $exec
-    ; CHECK:   $vgpr15 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 60, 0, 0, 0, 0, 0, implicit $exec
-    ; CHECK:   $vgpr16 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 64, 0, 0, 0, 0, 0, implicit $exec
-    ; CHECK:   $vgpr17 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 68, 0, 0, 0, 0, 0, implicit $exec
-    ; CHECK:   $vgpr18 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 72, 0, 0, 0, 0, 0, implicit $exec
-    ; CHECK:   $vgpr19 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 76, 0, 0, 0, 0, 0, implicit $exec
-    ; CHECK:   $vgpr20 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 80, 0, 0, 0, 0, 0, implicit $exec
-    ; CHECK:   $vgpr21 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 84, 0, 0, 0, 0, 0, implicit $exec
-    ; CHECK:   $vgpr22 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 88, 0, 0, 0, 0, 0, implicit $exec
-    ; CHECK:   $vgpr23 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 92, 0, 0, 0, 0, 0, implicit $exec
-    ; CHECK:   $vgpr24 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 96, 0, 0, 0, 0, 0, implicit $exec
-    ; CHECK:   $vgpr25 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 100, 0, 0, 0, 0, 0, implicit $exec
-    ; CHECK:   $vgpr26 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 104, 0, 0, 0, 0, 0, implicit $exec
-    ; CHECK:   $vgpr27 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 108, 0, 0, 0, 0, 0, implicit $exec
-    ; CHECK:   $vgpr28 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 112, 0, 0, 0, 0, 0, implicit $exec
-    ; CHECK:   $vgpr29 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 116, 0, 0, 0, 0, 0, implicit $exec
-    ; CHECK:   $vgpr30 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 120, 0, 0, 0, 0, 0, implicit $exec
-    ; CHECK:   $vgpr31 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 124, 0, 0, 0, 0, 0, implicit $exec
-    ; CHECK:   $vgpr32 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 128, 0, 0, 0, 0, 0, implicit $exec
-    ; CHECK:   $vgpr33 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 132, 0, 0, 0, 0, 0, implicit $exec
-    ; CHECK:   $vgpr34 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 136, 0, 0, 0, 0, 0, implicit $exec
-    ; CHECK:   $vgpr35 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 140, 0, 0, 0, 0, 0, implicit $exec
-    ; CHECK:   $vgpr36 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 144, 0, 0, 0, 0, 0, implicit $exec
-    ; CHECK:   $vgpr37 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 148, 0, 0, 0, 0, 0, implicit $exec
-    ; CHECK:   $vgpr38 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 152, 0, 0, 0, 0, 0, implicit $exec
-    ; CHECK:   $vgpr39 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 156, 0, 0, 0, 0, 0, implicit $exec
-    ; CHECK:   $vgpr40 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 160, 0, 0, 0, 0, 0, implicit $exec
-    ; CHECK:   $vgpr41 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 164, 0, 0, 0, 0, 0, implicit $exec
-    ; CHECK:   $vgpr42 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 168, 0, 0, 0, 0, 0, implicit $exec
-    ; CHECK:   $vgpr43 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 172, 0, 0, 0, 0, 0, implicit $exec
-    ; CHECK:   $vgpr44 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 176, 0, 0, 0, 0, 0, implicit $exec
-    ; CHECK:   $vgpr45 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 180, 0, 0, 0, 0, 0, implicit $exec
-    ; CHECK:   $vgpr46 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 184, 0, 0, 0, 0, 0, implicit $exec
-    ; CHECK:   $vgpr47 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 188, 0, 0, 0, 0, 0, implicit $exec
-    ; CHECK:   $vgpr48 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 192, 0, 0, 0, 0, 0, implicit $exec
-    ; CHECK:   $vgpr49 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 196, 0, 0, 0, 0, 0, implicit $exec
-    ; CHECK:   $vgpr50 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 200, 0, 0, 0, 0, 0, implicit $exec
-    ; CHECK:   $vgpr51 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 204, 0, 0, 0, 0, 0, implicit $exec
-    ; CHECK:   $vgpr52 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 208, 0, 0, 0, 0, 0, implicit $exec
-    ; CHECK:   $vgpr53 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 212, 0, 0, 0, 0, 0, implicit $exec
-    ; CHECK:   $vgpr54 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 216, 0, 0, 0, 0, 0, implicit $exec
-    ; CHECK:   $vgpr55 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 220, 0, 0, 0, 0, 0, implicit $exec
-    ; CHECK:   $vgpr56 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 224, 0, 0, 0, 0, 0, implicit $exec
-    ; CHECK:   $vgpr57 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 228, 0, 0, 0, 0, 0, implicit $exec
-    ; CHECK:   $vgpr58 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 232, 0, 0, 0, 0, 0, implicit $exec
-    ; CHECK:   $vgpr59 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 236, 0, 0, 0, 0, 0, implicit $exec
-    ; CHECK:   $vgpr60 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 240, 0, 0, 0, 0, 0, implicit $exec
-    ; CHECK:   $vgpr61 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 244, 0, 0, 0, 0, 0, implicit $exec
-    ; CHECK:   $vgpr62 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 248, 0, 0, 0, 0, 0, implicit $exec
-    ; CHECK:   $vgpr63 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 252, 0, 0, 0, 0, 0, implicit $exec
-    ; CHECK:   $vgpr64 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 256, 0, 0, 0, 0, 0, implicit $exec
+    ; CHECK:   $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; CHECK:   $vgpr2 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 8, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; CHECK:   $vgpr3 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 12, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; CHECK:   $vgpr4 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 16, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; CHECK:   $vgpr5 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 20, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; CHECK:   $vgpr6 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 24, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; CHECK:   $vgpr7 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 28, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; CHECK:   $vgpr8 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 32, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; CHECK:   $vgpr9 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 36, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; CHECK:   $vgpr10 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 40, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; CHECK:   $vgpr11 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 44, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; CHECK:   $vgpr12 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 48, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; CHECK:   $vgpr13 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 52, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; CHECK:   $vgpr14 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 56, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; CHECK:   $vgpr15 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 60, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; CHECK:   $vgpr16 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 64, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; CHECK:   $vgpr17 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 68, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; CHECK:   $vgpr18 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 72, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; CHECK:   $vgpr19 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 76, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; CHECK:   $vgpr20 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 80, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; CHECK:   $vgpr21 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 84, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; CHECK:   $vgpr22 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 88, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; CHECK:   $vgpr23 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 92, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; CHECK:   $vgpr24 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 96, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; CHECK:   $vgpr25 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 100, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; CHECK:   $vgpr26 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 104, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; CHECK:   $vgpr27 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 108, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; CHECK:   $vgpr28 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 112, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; CHECK:   $vgpr29 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 116, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; CHECK:   $vgpr30 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 120, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; CHECK:   $vgpr31 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 124, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; CHECK:   $vgpr32 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 128, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; CHECK:   $vgpr33 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 132, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; CHECK:   $vgpr34 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 136, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; CHECK:   $vgpr35 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 140, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; CHECK:   $vgpr36 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 144, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; CHECK:   $vgpr37 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 148, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; CHECK:   $vgpr38 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 152, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; CHECK:   $vgpr39 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 156, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; CHECK:   $vgpr40 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 160, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; CHECK:   $vgpr41 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 164, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; CHECK:   $vgpr42 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 168, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; CHECK:   $vgpr43 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 172, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; CHECK:   $vgpr44 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 176, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; CHECK:   $vgpr45 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 180, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; CHECK:   $vgpr46 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 184, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; CHECK:   $vgpr47 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 188, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; CHECK:   $vgpr48 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 192, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; CHECK:   $vgpr49 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 196, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; CHECK:   $vgpr50 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 200, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; CHECK:   $vgpr51 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 204, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; CHECK:   $vgpr52 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 208, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; CHECK:   $vgpr53 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 212, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; CHECK:   $vgpr54 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 216, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; CHECK:   $vgpr55 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 220, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; CHECK:   $vgpr56 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 224, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; CHECK:   $vgpr57 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 228, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; CHECK:   $vgpr58 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 232, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; CHECK:   $vgpr59 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 236, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; CHECK:   $vgpr60 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 240, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; CHECK:   $vgpr61 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 244, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; CHECK:   $vgpr62 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 248, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; CHECK:   $vgpr63 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 252, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; CHECK:   $vgpr64 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 256, 0, 0, 0, 0, 0, 0, implicit $exec
     ; CHECK: }
     ; CHECK: BUNDLE implicit-def $vgpr65, implicit-def $vgpr65_lo16, implicit-def $vgpr65_hi16, implicit-def $vgpr66, implicit-def $vgpr66_lo16, implicit-def $vgpr66_hi16, implicit-def $vgpr67, implicit-def $vgpr67_lo16, implicit-def $vgpr67_hi16, implicit-def $vgpr68, implicit-def $vgpr68_lo16, implicit-def $vgpr68_hi16, implicit-def $vgpr69, implicit-def $vgpr69_lo16, implicit-def $vgpr69_hi16, implicit-def $vgpr70, implicit-def $vgpr70_lo16, implicit-def $vgpr70_hi16, implicit-def $vgpr71, implicit-def $vgpr71_lo16, implicit-def $vgpr71_hi16, implicit-def $vgpr72, implicit-def $vgpr72_lo16, implicit-def $vgpr72_hi16, implicit-def $vgpr73, implicit-def $vgpr73_lo16, implicit-def $vgpr73_hi16, implicit-def $vgpr74, implicit-def $vgpr74_lo16, implicit-def $vgpr74_hi16, implicit-def $vgpr75, implicit-def $vgpr75_lo16, implicit-def $vgpr75_hi16, implicit-def $vgpr76, implicit-def $vgpr76_lo16, implicit-def $vgpr76_hi16, implicit-def $vgpr77, implicit-def $vgpr77_lo16, implicit-def $vgpr77_hi16, implicit-def $vgpr78, implicit-def $vgpr78_lo16, implicit-def $vgpr78_hi16, implicit-def $vgpr79, implicit-def $vgpr79_lo16, implicit-def $vgpr79_hi16, implicit-def $vgpr80, implicit-def $vgpr80_lo16, implicit-def $vgpr80_hi16, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec {
     ; CHECK:   S_CLAUSE 15
-    ; CHECK:   $vgpr65 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 260, 0, 0, 0, 0, 0, implicit $exec
-    ; CHECK:   $vgpr66 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 264, 0, 0, 0, 0, 0, implicit $exec
-    ; CHECK:   $vgpr67 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 268, 0, 0, 0, 0, 0, implicit $exec
-    ; CHECK:   $vgpr68 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 272, 0, 0, 0, 0, 0, implicit $exec
-    ; CHECK:   $vgpr69 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 276, 0, 0, 0, 0, 0, implicit $exec
-    ; CHECK:   $vgpr70 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 280, 0, 0, 0, 0, 0, implicit $exec
-    ; CHECK:   $vgpr71 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 284, 0, 0, 0, 0, 0, implicit $exec
-    ; CHECK:   $vgpr72 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 288, 0, 0, 0, 0, 0, implicit $exec
-    ; CHECK:   $vgpr73 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 292, 0, 0, 0, 0, 0, implicit $exec
-    ; CHECK:   $vgpr74 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 296, 0, 0, 0, 0, 0, implicit $exec
-    ; CHECK:   $vgpr75 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 300, 0, 0, 0, 0, 0, implicit $exec
-    ; CHECK:   $vgpr76 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 304, 0, 0, 0, 0, 0, implicit $exec
-    ; CHECK:   $vgpr77 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 308, 0, 0, 0, 0, 0, implicit $exec
-    ; CHECK:   $vgpr78 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 312, 0, 0, 0, 0, 0, implicit $exec
-    ; CHECK:   $vgpr79 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 316, 0, 0, 0, 0, 0, implicit $exec
-    ; CHECK:   $vgpr80 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 320, 0, 0, 0, 0, 0, implicit $exec
+    ; CHECK:   $vgpr65 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 260, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; CHECK:   $vgpr66 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 264, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; CHECK:   $vgpr67 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 268, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; CHECK:   $vgpr68 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 272, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; CHECK:   $vgpr69 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 276, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; CHECK:   $vgpr70 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 280, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; CHECK:   $vgpr71 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 284, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; CHECK:   $vgpr72 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 288, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; CHECK:   $vgpr73 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 292, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; CHECK:   $vgpr74 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 296, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; CHECK:   $vgpr75 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 300, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; CHECK:   $vgpr76 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 304, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; CHECK:   $vgpr77 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 308, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; CHECK:   $vgpr78 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 312, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; CHECK:   $vgpr79 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 316, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; CHECK:   $vgpr80 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 320, 0, 0, 0, 0, 0, 0, implicit $exec
     ; CHECK: }
-    $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr2 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 8, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr3 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 12, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr4 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 16, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr5 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 20, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr6 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 24, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr7 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 28, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr8 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 32, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr9 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 36, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr10 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 40, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr11 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 44, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr12 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 48, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr13 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 52, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr14 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 56, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr15 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 60, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr16 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 64, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr17 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 68, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr18 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 72, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr19 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 76, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr20 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 80, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr21 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 84, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr22 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 88, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr23 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 92, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr24 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 96, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr25 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 100, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr26 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 104, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr27 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 108, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr28 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 112, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr29 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 116, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr30 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 120, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr31 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 124, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr32 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 128, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr33 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 132, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr34 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 136, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr35 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 140, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr36 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 144, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr37 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 148, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr38 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 152, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr39 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 156, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr40 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 160, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr41 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 164, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr42 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 168, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr43 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 172, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr44 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 176, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr45 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 180, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr46 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 184, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr47 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 188, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr48 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 192, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr49 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 196, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr50 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 200, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr51 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 204, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr52 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 208, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr53 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 212, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr54 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 216, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr55 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 220, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr56 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 224, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr57 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 228, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr58 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 232, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr59 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 236, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr60 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 240, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr61 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 244, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr62 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 248, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr63 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 252, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr64 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 256, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr65 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 260, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr66 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 264, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr67 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 268, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr68 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 272, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr69 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 276, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr70 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 280, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr71 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 284, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr72 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 288, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr73 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 292, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr74 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 296, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr75 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 300, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr76 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 304, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr77 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 308, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr78 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 312, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr79 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 316, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr80 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 320, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr2 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 8, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr3 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 12, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr4 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 16, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr5 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 20, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr6 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 24, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr7 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 28, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr8 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 32, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr9 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 36, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr10 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 40, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr11 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 44, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr12 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 48, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr13 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 52, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr14 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 56, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr15 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 60, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr16 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 64, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr17 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 68, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr18 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 72, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr19 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 76, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr20 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 80, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr21 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 84, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr22 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 88, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr23 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 92, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr24 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 96, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr25 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 100, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr26 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 104, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr27 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 108, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr28 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 112, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr29 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 116, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr30 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 120, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr31 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 124, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr32 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 128, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr33 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 132, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr34 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 136, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr35 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 140, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr36 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 144, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr37 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 148, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr38 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 152, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr39 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 156, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr40 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 160, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr41 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 164, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr42 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 168, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr43 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 172, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr44 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 176, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr45 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 180, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr46 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 184, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr47 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 188, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr48 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 192, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr49 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 196, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr50 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 200, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr51 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 204, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr52 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 208, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr53 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 212, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr54 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 216, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr55 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 220, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr56 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 224, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr57 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 228, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr58 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 232, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr59 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 236, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr60 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 240, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr61 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 244, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr62 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 248, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr63 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 252, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr64 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 256, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr65 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 260, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr66 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 264, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr67 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 268, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr68 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 272, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr69 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 276, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr70 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 280, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr71 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 284, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr72 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 288, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr73 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 292, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr74 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 296, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr75 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 300, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr76 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 304, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr77 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 308, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr78 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 312, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr79 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 316, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr80 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 320, 0, 0, 0, 0, 0, 0, implicit $exec
 ...

diff  --git a/llvm/test/CodeGen/AMDGPU/hazard-buffer-store-v-interp.mir b/llvm/test/CodeGen/AMDGPU/hazard-buffer-store-v-interp.mir
index 3b9e94764ddf..1ac363471723 100644
--- a/llvm/test/CodeGen/AMDGPU/hazard-buffer-store-v-interp.mir
+++ b/llvm/test/CodeGen/AMDGPU/hazard-buffer-store-v-interp.mir
@@ -12,7 +12,7 @@ body:             |
   bb.0.entry:
     liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr7, $vgpr8, $vgpr9, $vgpr10
 
-    BUFFER_STORE_DWORDX4_OFFSET_exact killed $vgpr7_vgpr8_vgpr9_vgpr10, $sgpr4_sgpr5_sgpr6_sgpr7, 0, 96, 0, 0, 0, 0, 0, implicit $exec
+    BUFFER_STORE_DWORDX4_OFFSET_exact killed $vgpr7_vgpr8_vgpr9_vgpr10, $sgpr4_sgpr5_sgpr6_sgpr7, 0, 96, 0, 0, 0, 0, 0, 0, implicit $exec
     $vgpr7 = V_INTERP_P1_F32 $vgpr0, 0, 0, implicit $mode, implicit $m0, implicit $exec
     S_ENDPGM 0
 

diff  --git a/llvm/test/CodeGen/AMDGPU/hazard-hidden-bundle.mir b/llvm/test/CodeGen/AMDGPU/hazard-hidden-bundle.mir
index ef0122007f22..e36f4bd63387 100644
--- a/llvm/test/CodeGen/AMDGPU/hazard-hidden-bundle.mir
+++ b/llvm/test/CodeGen/AMDGPU/hazard-hidden-bundle.mir
@@ -54,7 +54,7 @@ body: |
     BUNDLE implicit-def $sgpr0_sgpr1, implicit $sgpr10_sgpr11 {
       $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM $sgpr10_sgpr11, 0, 0, 0
     }
-    $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     S_ENDPGM 0
 ...
 
@@ -84,5 +84,5 @@ body: |
     }
 
   bb.2:
-    $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, 0, 0, 0, implicit $exec
 ...

diff  --git a/llvm/test/CodeGen/AMDGPU/hazard-in-bundle.mir b/llvm/test/CodeGen/AMDGPU/hazard-in-bundle.mir
index 2ad7155fd7e6..48f26c1e0b9b 100644
--- a/llvm/test/CodeGen/AMDGPU/hazard-in-bundle.mir
+++ b/llvm/test/CodeGen/AMDGPU/hazard-in-bundle.mir
@@ -78,7 +78,7 @@ body: |
       $vgpr0 = IMPLICIT_DEF
       $vgpr1 = V_ADDC_U32_e32 $vgpr0, $vgpr0, implicit-def $vcc, implicit $vcc, implicit $exec
       $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM $sgpr10_sgpr11, 0, 0, 0
-      $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, 0, 0, implicit $exec
+      $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     }
     S_ENDPGM 0
 ...

diff  --git a/llvm/test/CodeGen/AMDGPU/hazard-inlineasm.mir b/llvm/test/CodeGen/AMDGPU/hazard-inlineasm.mir
index 4905aee8170d..1a0bc39458cf 100644
--- a/llvm/test/CodeGen/AMDGPU/hazard-inlineasm.mir
+++ b/llvm/test/CodeGen/AMDGPU/hazard-inlineasm.mir
@@ -16,7 +16,7 @@ name: hazard-inlineasm
 
 body: |
   bb.0:
-   FLAT_STORE_DWORDX4 $vgpr49_vgpr50, $vgpr26_vgpr27_vgpr28_vgpr29, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+   FLAT_STORE_DWORDX4 $vgpr49_vgpr50, $vgpr26_vgpr27_vgpr28_vgpr29, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
    INLINEASM &"v_mad_u64_u32 $0, $1, $2, $3, $4", 0, 2621450, def $vgpr26_vgpr27, 2818058, def dead $sgpr14_sgpr15, 589833, $sgpr12, 327689, killed $vgpr51, 2621449, $vgpr46_vgpr47
    S_ENDPGM 0
 ...

diff  --git a/llvm/test/CodeGen/AMDGPU/hazard-pass-ordering.mir b/llvm/test/CodeGen/AMDGPU/hazard-pass-ordering.mir
index 87b16f87a79b..c6d8994a090b 100644
--- a/llvm/test/CodeGen/AMDGPU/hazard-pass-ordering.mir
+++ b/llvm/test/CodeGen/AMDGPU/hazard-pass-ordering.mir
@@ -15,7 +15,7 @@ body:             |
     $vgpr0 = V_MOV_B32_e32 1, implicit $exec
     $vgpr2 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec
     $sgpr8_sgpr9 = S_MOV_B64 -1
-    $vgpr3 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    $vgpr3 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
     $vcc = S_ANDN2_B64 $exec, killed renamable $sgpr8_sgpr9, implicit-def dead $scc
     S_CBRANCH_VCCNZ %bb.1, implicit killed $vcc
 

diff  --git a/llvm/test/CodeGen/AMDGPU/hazard-recognizer-meta-insts.mir b/llvm/test/CodeGen/AMDGPU/hazard-recognizer-meta-insts.mir
index e59db4fead3d..97b6eba13754 100644
--- a/llvm/test/CodeGen/AMDGPU/hazard-recognizer-meta-insts.mir
+++ b/llvm/test/CodeGen/AMDGPU/hazard-recognizer-meta-insts.mir
@@ -12,11 +12,11 @@ body:             |
   bb.0:
     liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX9-LABEL: name: global_store_dwordx4_data_hazard_kill
-    ; GFX9: GLOBAL_STORE_DWORDX4 $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, 0, 0, implicit $exec
+    ; GFX9: GLOBAL_STORE_DWORDX4 $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $exec
     ; GFX9: $vgpr2 = KILL
     ; GFX9: S_NOP 0
     ; GFX9: $vgpr2 = V_MOV_B32_e32 0, implicit $exec
-    GLOBAL_STORE_DWORDX4 $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORDX4 $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $exec
     $vgpr2 = KILL
     $vgpr2 = V_MOV_B32_e32 0, implicit $exec
 
@@ -30,11 +30,11 @@ body:             |
   bb.0:
     liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4
     ; GFX9-LABEL: name: global_store_dwordx3_data_hazard_kill
-    ; GFX9: GLOBAL_STORE_DWORDX3 $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4, 0, 0, 0, 0, implicit $exec
+    ; GFX9: GLOBAL_STORE_DWORDX3 $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4, 0, 0, 0, 0, 0, implicit $exec
     ; GFX9: $vgpr2 = KILL
     ; GFX9: S_NOP 0
     ; GFX9: $vgpr2 = V_MOV_B32_e32 0, implicit $exec
-    GLOBAL_STORE_DWORDX3 $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4, 0, 0, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORDX3 $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4, 0, 0, 0, 0, 0, implicit $exec
     $vgpr2 = KILL
     $vgpr2 = V_MOV_B32_e32 0, implicit $exec
 

diff  --git a/llvm/test/CodeGen/AMDGPU/indirect-addressing-term.ll b/llvm/test/CodeGen/AMDGPU/indirect-addressing-term.ll
index e0ad07cddba5..c74471b1b8fe 100644
--- a/llvm/test/CodeGen/AMDGPU/indirect-addressing-term.ll
+++ b/llvm/test/CodeGen/AMDGPU/indirect-addressing-term.ll
@@ -99,7 +99,7 @@ define amdgpu_kernel void @extract_w_offset_vgpr(i32 addrspace(1)* %out) {
   ; GCN: bb.2:
   ; GCN:   $vgpr0 = SI_SPILL_V32_RESTORE %stack.6, $sgpr32, 0, implicit $exec :: (load 4 from %stack.6, addrspace 5)
   ; GCN:   $sgpr0_sgpr1_sgpr2_sgpr3 = SI_SPILL_S128_RESTORE %stack.2, implicit $exec, implicit $sgpr32 :: (load 16 from %stack.2, align 4, addrspace 5)
-  ; GCN:   BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr0, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.out.load, addrspace 1)
+  ; GCN:   BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr0, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.out.load, addrspace 1)
   ; GCN:   S_ENDPGM 0
 entry:
   %id = call i32 @llvm.amdgcn.workitem.id.x() #1

diff  --git a/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll b/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll
index 125c58581787..d523afa59b0a 100644
--- a/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll
@@ -7,9 +7,9 @@
 define amdgpu_kernel void @s_input_output_i128() {
   ; CHECK-LABEL: name: s_input_output_i128
   ; CHECK: bb.0 (%ir-block.0):
-  ; CHECK:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3997706 /* regdef:SGPR_128 */, def %4
+  ; CHECK:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 4128778 /* regdef:SGPR_128 */, def %4
   ; CHECK:   [[COPY:%[0-9]+]]:sgpr_128 = COPY %4
-  ; CHECK:   INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3997705 /* reguse:SGPR_128 */, [[COPY]]
+  ; CHECK:   INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 4128777 /* reguse:SGPR_128 */, [[COPY]]
   ; CHECK:   S_ENDPGM 0
   %val = tail call i128 asm sideeffect "; def $0", "=s"()
   call void asm sideeffect "; use $0", "s"(i128 %val)
@@ -19,9 +19,9 @@ define amdgpu_kernel void @s_input_output_i128() {
 define amdgpu_kernel void @v_input_output_i128() {
   ; CHECK-LABEL: name: v_input_output_i128
   ; CHECK: bb.0 (%ir-block.0):
-  ; CHECK:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3866634 /* regdef:VReg_128 */, def %4
+  ; CHECK:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3997706 /* regdef:VReg_128 */, def %4
   ; CHECK:   [[COPY:%[0-9]+]]:vreg_128 = COPY %4
-  ; CHECK:   INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3866633 /* reguse:VReg_128 */, [[COPY]]
+  ; CHECK:   INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3997705 /* reguse:VReg_128 */, [[COPY]]
   ; CHECK:   S_ENDPGM 0
   %val = tail call i128 asm sideeffect "; def $0", "=v"()
   call void asm sideeffect "; use $0", "v"(i128 %val)
@@ -31,9 +31,9 @@ define amdgpu_kernel void @v_input_output_i128() {
 define amdgpu_kernel void @a_input_output_i128() {
   ; CHECK-LABEL: name: a_input_output_i128
   ; CHECK: bb.0 (%ir-block.0):
-  ; CHECK:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3801098 /* regdef:AReg_128 */, def %4
+  ; CHECK:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3932170 /* regdef:AReg_128 */, def %4
   ; CHECK:   [[COPY:%[0-9]+]]:areg_128 = COPY %4
-  ; CHECK:   INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3801097 /* reguse:AReg_128 */, [[COPY]]
+  ; CHECK:   INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3932169 /* reguse:AReg_128 */, [[COPY]]
   ; CHECK:   S_ENDPGM 0
   %val = call i128 asm sideeffect "; def $0", "=a"()
   call void asm sideeffect "; use $0", "a"(i128 %val)

diff  --git a/llvm/test/CodeGen/AMDGPU/insert-skips-flat-vmem.mir b/llvm/test/CodeGen/AMDGPU/insert-skips-flat-vmem.mir
index 76bb74d3a63f..3c078d847e3c 100644
--- a/llvm/test/CodeGen/AMDGPU/insert-skips-flat-vmem.mir
+++ b/llvm/test/CodeGen/AMDGPU/insert-skips-flat-vmem.mir
@@ -13,7 +13,7 @@ body: |
   ; CHECK: bb.1:
   ; CHECK:   successors: %bb.2(0x80000000)
   ; CHECK:   $vgpr0 = V_MOV_B32_e32 0, implicit $exec
-  ; CHECK:   FLAT_STORE_DWORD undef $vgpr1_vgpr2, $vgpr0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+  ; CHECK:   FLAT_STORE_DWORD undef $vgpr1_vgpr2, $vgpr0, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
   ; CHECK: bb.2:
   ; CHECK:   S_ENDPGM 0
   bb.0:
@@ -23,7 +23,7 @@ body: |
   bb.1:
     successors: %bb.2
     $vgpr0 = V_MOV_B32_e32 0, implicit $exec
-    FLAT_STORE_DWORD undef $vgpr1_vgpr2, $vgpr0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    FLAT_STORE_DWORD undef $vgpr1_vgpr2, $vgpr0, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
 
   bb.2:
     S_ENDPGM 0
@@ -41,7 +41,7 @@ body: |
   ; CHECK: bb.1:
   ; CHECK:   successors: %bb.2(0x80000000)
   ; CHECK:   $vgpr0 = V_MOV_B32_e32 0, implicit $exec
-  ; CHECK:   BUFFER_STORE_DWORD_OFFSET $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec
+  ; CHECK:   BUFFER_STORE_DWORD_OFFSET $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, 0, 0, implicit $exec
   ; CHECK: bb.2:
   ; CHECK:   S_ENDPGM 0
   bb.0:
@@ -51,7 +51,7 @@ body: |
   bb.1:
     successors: %bb.2
     $vgpr0 = V_MOV_B32_e32 0, implicit $exec
-    BUFFER_STORE_DWORD_OFFSET $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec
+    BUFFER_STORE_DWORD_OFFSET $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, 0, 0, implicit $exec
 
   bb.2:
     S_ENDPGM 0

diff  --git a/llvm/test/CodeGen/AMDGPU/insert-waitcnts-exp.mir b/llvm/test/CodeGen/AMDGPU/insert-waitcnts-exp.mir
index 5797bb5cfa29..8d000e6da80a 100644
--- a/llvm/test/CodeGen/AMDGPU/insert-waitcnts-exp.mir
+++ b/llvm/test/CodeGen/AMDGPU/insert-waitcnts-exp.mir
@@ -49,10 +49,10 @@ body:             |
   bb.0 (%ir-block.2):
     $sgpr3 = S_MOV_B32 61440
     $sgpr2 = S_MOV_B32 -1
-    $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `float addrspace(1)* undef`)
-    $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `float addrspace(1)* undef`)
-    $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `float addrspace(1)* undef`)
-    $vgpr3 = BUFFER_LOAD_DWORD_OFFSET killed $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `float addrspace(1)* undef`)
+    $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `float addrspace(1)* undef`)
+    $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `float addrspace(1)* undef`)
+    $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `float addrspace(1)* undef`)
+    $vgpr3 = BUFFER_LOAD_DWORD_OFFSET killed $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `float addrspace(1)* undef`)
     EXP_DONE 0, killed $vgpr0, killed $vgpr1, killed $vgpr2, killed $vgpr3, -1, -1, 15, implicit $exec
     $vgpr0 = V_MOV_B32_e32 1056964608, implicit $exec
     $vgpr1 = V_MOV_B32_e32 1065353216, implicit $exec

diff  --git a/llvm/test/CodeGen/AMDGPU/inserted-wait-states.mir b/llvm/test/CodeGen/AMDGPU/inserted-wait-states.mir
index 0e6fa25dc02c..03e1d12bd451 100644
--- a/llvm/test/CodeGen/AMDGPU/inserted-wait-states.mir
+++ b/llvm/test/CodeGen/AMDGPU/inserted-wait-states.mir
@@ -190,32 +190,32 @@ name: vmem_gt_8dw_store
 
 body: |
   bb.0:
-    BUFFER_STORE_DWORD_OFFSET $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec
+    BUFFER_STORE_DWORD_OFFSET $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     $vgpr3 = V_MOV_B32_e32 0, implicit $exec
-    BUFFER_STORE_DWORDX3_OFFSET $vgpr2_vgpr3_vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    BUFFER_STORE_DWORDX3_OFFSET $vgpr2_vgpr3_vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     $vgpr3 = V_MOV_B32_e32 0, implicit $exec
-    BUFFER_STORE_DWORDX4_OFFSET $vgpr2_vgpr3_vgpr4_vgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec
+    BUFFER_STORE_DWORDX4_OFFSET $vgpr2_vgpr3_vgpr4_vgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     $vgpr3 = V_MOV_B32_e32 0, implicit $exec
-    BUFFER_STORE_DWORDX4_OFFSET $vgpr2_vgpr3_vgpr4_vgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    BUFFER_STORE_DWORDX4_OFFSET $vgpr2_vgpr3_vgpr4_vgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     $vgpr3 = V_MOV_B32_e32 0, implicit $exec
-    BUFFER_STORE_FORMAT_XYZ_OFFSET $vgpr2_vgpr3_vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    BUFFER_STORE_FORMAT_XYZ_OFFSET $vgpr2_vgpr3_vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     $vgpr3 = V_MOV_B32_e32 0, implicit $exec
-    BUFFER_STORE_FORMAT_XYZW_OFFSET $vgpr2_vgpr3_vgpr4_vgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    BUFFER_STORE_FORMAT_XYZW_OFFSET $vgpr2_vgpr3_vgpr4_vgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     $vgpr3 = V_MOV_B32_e32 0, implicit $exec
     BUFFER_ATOMIC_CMPSWAP_X2_OFFSET $vgpr2_vgpr3_vgpr4_vgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, implicit $exec
     $vgpr3 = V_MOV_B32_e32 0, implicit $exec
     S_BRANCH %bb.1
 
   bb.1:
-    FLAT_STORE_DWORDX2 $vgpr0_vgpr1, $vgpr2_vgpr3, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    FLAT_STORE_DWORDX2 $vgpr0_vgpr1, $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
     $vgpr3 = V_MOV_B32_e32 0, implicit $exec
-    FLAT_STORE_DWORDX3 $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    FLAT_STORE_DWORDX3 $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
     $vgpr3 = V_MOV_B32_e32 0, implicit $exec
-    FLAT_STORE_DWORDX4 $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    FLAT_STORE_DWORDX4 $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
     $vgpr3 = V_MOV_B32_e32 0, implicit $exec
-    FLAT_ATOMIC_CMPSWAP_X2 $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, implicit $exec, implicit $flat_scr
+    FLAT_ATOMIC_CMPSWAP_X2 $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, 0, implicit $exec, implicit $flat_scr
     $vgpr3 = V_MOV_B32_e32 0, implicit $exec
-    FLAT_ATOMIC_FCMPSWAP_X2 $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, implicit $exec, implicit $flat_scr
+    FLAT_ATOMIC_FCMPSWAP_X2 $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, 0, implicit $exec, implicit $flat_scr
     $vgpr3 = V_MOV_B32_e32 0, implicit $exec
     S_ENDPGM 0
 

diff  --git a/llvm/test/CodeGen/AMDGPU/invert-br-undef-vcc.mir b/llvm/test/CodeGen/AMDGPU/invert-br-undef-vcc.mir
index 0a60eaf7c03f..693851023f7c 100644
--- a/llvm/test/CodeGen/AMDGPU/invert-br-undef-vcc.mir
+++ b/llvm/test/CodeGen/AMDGPU/invert-br-undef-vcc.mir
@@ -64,7 +64,7 @@ body:             |
     liveins: $sgpr6, $sgpr7, $sgpr0_sgpr1_sgpr2_sgpr3:0x00000003
 
     $vgpr0 = V_MOV_B32_e32 100, implicit $exec
-    BUFFER_STORE_DWORD_OFFSET killed $vgpr0, killed $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`)
+    BUFFER_STORE_DWORD_OFFSET killed $vgpr0, killed $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`)
     $vgpr0 = V_MOV_B32_e32 1, implicit $exec
     S_BRANCH %bb.3
 
@@ -72,7 +72,7 @@ body:             |
     liveins: $sgpr6, $sgpr7, $sgpr0_sgpr1_sgpr2_sgpr3:0x00000003
 
     $vgpr0 = V_MOV_B32_e32 9, implicit $exec
-    BUFFER_STORE_DWORD_OFFSET killed $vgpr0, killed $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`)
+    BUFFER_STORE_DWORD_OFFSET killed $vgpr0, killed $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`)
     $vgpr0 = V_MOV_B32_e32 0, implicit $exec
 
   bb.3.done:
@@ -80,7 +80,7 @@ body:             |
 
     $sgpr3 = S_MOV_B32 61440
     $sgpr2 = S_MOV_B32 -1
-    BUFFER_STORE_DWORD_OFFSET killed $vgpr0, killed $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.out)
+    BUFFER_STORE_DWORD_OFFSET killed $vgpr0, killed $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.out)
     S_ENDPGM 0
 
 ...

diff  --git a/llvm/test/CodeGen/AMDGPU/lds-branch-vmem-hazard.mir b/llvm/test/CodeGen/AMDGPU/lds-branch-vmem-hazard.mir
index 566b1c06fb12..2df74fed26ee 100644
--- a/llvm/test/CodeGen/AMDGPU/lds-branch-vmem-hazard.mir
+++ b/llvm/test/CodeGen/AMDGPU/lds-branch-vmem-hazard.mir
@@ -12,7 +12,7 @@ body:            |
     S_BRANCH %bb.1
 
   bb.1:
-    $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     S_ENDPGM 0
 ...
 
@@ -24,7 +24,7 @@ name:            hazard_buf_branch_lds
 body:            |
   bb.0:
     successors: %bb.1
-    $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     S_BRANCH %bb.1
 
   bb.1:
@@ -56,11 +56,11 @@ name:            no_hazard_buf_branch_buf
 body:            |
   bb.0:
     successors: %bb.1
-    $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     S_BRANCH %bb.1
 
   bb.1:
-    $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     S_ENDPGM 0
 ...
 
@@ -75,7 +75,7 @@ body:            |
     $vgpr1 = DS_READ_B32 undef $vgpr0, 0, 0, implicit $m0, implicit $exec
 
   bb.1:
-    $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     S_ENDPGM 0
 ...
 
@@ -87,7 +87,7 @@ name:            no_hazard_lds_branch_buf_samebb
 body:            |
   bb.0:
     $vgpr1 = DS_READ_B32 undef $vgpr0, 0, 0, implicit $m0, implicit $exec
-    $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     S_ENDPGM 0
 ...
 
@@ -101,7 +101,7 @@ body:            |
   bb.0:
     successors: %bb.0
     $vgpr1 = DS_READ_B32 undef $vgpr0, 0, 0, implicit $m0, implicit $exec
-    $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     S_BRANCH %bb.0
 ...
 
@@ -118,8 +118,8 @@ body:            |
     S_BRANCH %bb.1
 
   bb.1:
-    $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     S_ENDPGM 0
 ...
 
@@ -137,7 +137,7 @@ body:            |
 
   bb.1:
     $vgpr1 = DS_READ_B32 undef $vgpr0, 0, 0, implicit $m0, implicit $exec
-    $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     S_ENDPGM 0
 ...
 
@@ -150,11 +150,11 @@ body:            |
   bb.0:
     successors: %bb.1
     $vgpr1 = DS_READ_B32 undef $vgpr0, 0, 0, implicit $m0, implicit $exec
-    $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     S_BRANCH %bb.1
 
   bb.1:
-    $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     S_ENDPGM 0
 ...
 
@@ -171,7 +171,7 @@ body:            |
 
   bb.1:
     S_WAITCNT_VSCNT undef $sgpr_null, 1
-    $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     S_ENDPGM 0
 ...
 
@@ -189,7 +189,7 @@ body:            |
 
   bb.1:
     S_WAITCNT_VSCNT undef $sgpr_null, 0
-    $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     S_ENDPGM 0
 ...
 
@@ -206,7 +206,7 @@ body:            |
 
   bb.1:
     S_WAITCNT_VSCNT undef $sgpr0, 0
-    $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     S_ENDPGM 0
 ...
 
@@ -223,7 +223,7 @@ body:            |
     S_BRANCH %bb.1
 
   bb.1:
-    $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     S_ENDPGM 0
 ...
 
@@ -239,7 +239,7 @@ body:            |
     S_BRANCH %bb.1
 
   bb.1:
-    $vgpr1 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec
+    $vgpr1 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $exec
     S_ENDPGM 0
 ...
 
@@ -255,7 +255,7 @@ body:            |
     S_BRANCH %bb.1
 
   bb.1:
-    $vgpr1 = SCRATCH_LOAD_DWORD undef $vgpr0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    $vgpr1 = SCRATCH_LOAD_DWORD undef $vgpr0, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
     S_ENDPGM 0
 ...
 
@@ -271,6 +271,6 @@ body:            |
     S_BRANCH %bb.1
 
   bb.1:
-    $vgpr1 = FLAT_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    $vgpr1 = FLAT_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
     S_ENDPGM 0
 ...

diff  --git a/llvm/test/CodeGen/AMDGPU/limit-coalesce.mir b/llvm/test/CodeGen/AMDGPU/limit-coalesce.mir
index fbf59da6c2fa..55f6ca6bbc96 100644
--- a/llvm/test/CodeGen/AMDGPU/limit-coalesce.mir
+++ b/llvm/test/CodeGen/AMDGPU/limit-coalesce.mir
@@ -57,15 +57,15 @@ body:             |
     %4.sub1 = COPY %3.sub0
     undef %5.sub0 = COPY %4.sub1
     %5.sub1 = COPY %4.sub0
-    FLAT_STORE_DWORDX2 $vgpr0_vgpr1, killed %5, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    FLAT_STORE_DWORDX2 $vgpr0_vgpr1, killed %5, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
 
     %6 = IMPLICIT_DEF
     undef %7.sub0_sub1 = COPY %6
     %7.sub2 = COPY %3.sub0
-    FLAT_STORE_DWORDX3 $vgpr0_vgpr1, killed %7, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    FLAT_STORE_DWORDX3 $vgpr0_vgpr1, killed %7, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
 
     %8 = IMPLICIT_DEF
     undef %9.sub0_sub1_sub2 = COPY %8
     %9.sub3 = COPY %3.sub0
-    FLAT_STORE_DWORDX4 $vgpr0_vgpr1, killed %9, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    FLAT_STORE_DWORDX4 $vgpr0_vgpr1, killed %9, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
 ...

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.fadd.gfx90a.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.fadd.gfx90a.ll
new file mode 100644
index 000000000000..134e6d973585
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.fadd.gfx90a.ll
@@ -0,0 +1,93 @@
+; RUN: llc < %s -march=amdgcn -mcpu=gfx90a -verify-machineinstrs | FileCheck %s -check-prefix=GFX90A
+; RUN: not --crash llc < %s -march=amdgcn -mcpu=gfx908 -verify-machineinstrs 2>&1 | FileCheck %s -check-prefix=GFX908
+
+declare float @llvm.amdgcn.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i1)
+declare <2 x half> @llvm.amdgcn.buffer.atomic.fadd.v2f16(<2 x half>, <4 x i32>, i32, i32, i1)
+declare float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)*, float)
+declare <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1v2f16.v2f16(<2 x half> addrspace(1)*, <2 x half>)
+
+; GFX908: error: {{.*}} return versions of fp atomics not supported
+
+; GFX90A-LABEL: {{^}}buffer_atomic_add_f32:
+; GFX90A: buffer_atomic_add_f32 v0, v1, s[0:3], 0 idxen glc
+define amdgpu_ps float @buffer_atomic_add_f32(<4 x i32> inreg %rsrc, float %data, i32 %vindex) {
+main_body:
+  %ret = call float @llvm.amdgcn.buffer.atomic.fadd.f32(float %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
+  ret float %ret
+}
+
+; GFX90A-LABEL: {{^}}buffer_atomic_add_f32_off4_slc:
+; GFX90A: buffer_atomic_add_f32 v0, v1, s[0:3], 0 idxen offset:4 glc slc
+define amdgpu_ps float @buffer_atomic_add_f32_off4_slc(<4 x i32> inreg %rsrc, float %data, i32 %vindex) {
+main_body:
+  %ret = call float @llvm.amdgcn.buffer.atomic.fadd.f32(float %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i1 1)
+  ret float %ret
+}
+
+; GFX90A-LABEL: {{^}}buffer_atomic_pk_add_v2f16:
+; GFX90A: buffer_atomic_pk_add_f16 v0, v1, s[0:3], 0 idxen glc
+define amdgpu_ps <2 x half> @buffer_atomic_pk_add_v2f16(<4 x i32> inreg %rsrc, <2 x half> %data, i32 %vindex) {
+main_body:
+  %ret = call <2 x half> @llvm.amdgcn.buffer.atomic.fadd.v2f16(<2 x half> %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
+  ret <2 x half> %ret
+}
+
+; GFX90A-LABEL: {{^}}buffer_atomic_pk_add_v2f16_off4_slc:
+; GFX90A: buffer_atomic_pk_add_f16 v0, v1, s[0:3], 0 idxen offset:4 glc slc
+define amdgpu_ps <2 x half> @buffer_atomic_pk_add_v2f16_off4_slc(<4 x i32> inreg %rsrc, <2 x half> %data, i32 %vindex) {
+main_body:
+  %ret = call <2 x half> @llvm.amdgcn.buffer.atomic.fadd.v2f16(<2 x half> %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i1 1)
+  ret <2 x half> %ret
+}
+
+; GFX90A-LABEL: {{^}}global_atomic_add_f32:
+; GFX90A: global_atomic_add_f32 v0, v[0:1], v2, off glc
+define amdgpu_ps float @global_atomic_add_f32(float addrspace(1)* %ptr, float %data) {
+main_body:
+  %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* %ptr, float %data)
+  ret float %ret
+}
+
+; GFX90A-LABEL: {{^}}global_atomic_add_f32_off4:
+; GFX90A: global_atomic_add_f32 v0, v[0:1], v2, off offset:4 glc
+define amdgpu_ps float @global_atomic_add_f32_off4(float addrspace(1)* %ptr, float %data) {
+main_body:
+  %p = getelementptr float, float addrspace(1)* %ptr, i64 1
+  %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* %p, float %data)
+  ret float %ret
+}
+
+; GFX90A-LABEL: {{^}}global_atomic_add_f32_offneg4:
+; GFX90A: global_atomic_add_f32 v0, v[0:1], v2, off offset:-4 glc
+define amdgpu_ps float @global_atomic_add_f32_offneg4(float addrspace(1)* %ptr, float %data) {
+main_body:
+  %p = getelementptr float, float addrspace(1)* %ptr, i64 -1
+  %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* %p, float %data)
+  ret float %ret
+}
+
+; GFX90A-LABEL: {{^}}global_atomic_pk_add_v2f16:
+; GFX90A: global_atomic_pk_add_f16 v0, v[0:1], v2, off glc
+define amdgpu_ps <2 x half> @global_atomic_pk_add_v2f16(<2 x half> addrspace(1)* %ptr, <2 x half> %data) {
+main_body:
+  %ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1v2f16.v2f16(<2 x half> addrspace(1)* %ptr, <2 x half> %data)
+  ret <2 x half> %ret
+}
+
+; GFX90A-LABEL: {{^}}global_atomic_pk_add_v2f16_off4:
+; GFX90A: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:4 glc
+define amdgpu_ps <2 x half> @global_atomic_pk_add_v2f16_off4(<2 x half> addrspace(1)* %ptr, <2 x half> %data) {
+main_body:
+  %p = getelementptr <2 x half>, <2 x half> addrspace(1)* %ptr, i64 1
+  %ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1v2f16.v2f16(<2 x half> addrspace(1)* %p, <2 x half> %data)
+  ret <2 x half> %ret
+}
+
+; GFX90A-LABEL: {{^}}global_atomic_pk_add_v2f16_offneg4:
+; GFX90A: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:-4 glc
+define amdgpu_ps <2 x half> @global_atomic_pk_add_v2f16_offneg4(<2 x half> addrspace(1)* %ptr, <2 x half> %data) {
+main_body:
+  %p = getelementptr <2 x half>, <2 x half> addrspace(1)* %ptr, i64 -1
+  %ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1v2f16.v2f16(<2 x half> addrspace(1)* %p, <2 x half> %data)
+  ret <2 x half> %ret
+}

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.fadd.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.fadd.ll
index 19322c1b6481..e1bdb747c04a 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.fadd.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -march=amdgcn -mcpu=gfx908 -verify-machineinstrs | FileCheck %s -check-prefix=GCN
+; RUN: llc < %s -march=amdgcn -mcpu=gfx90a -verify-machineinstrs | FileCheck %s -check-prefix=GCN
 
 declare float @llvm.amdgcn.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i1)
 declare <2 x half> @llvm.amdgcn.buffer.atomic.fadd.v2f16(<2 x half>, <4 x i32>, i32, i32, i1)

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fmul.legacy.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fmul.legacy.ll
index 53809d32b087..ad4d0a779cd0 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fmul.legacy.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fmul.legacy.ll
@@ -1,6 +1,7 @@
 ; RUN: llc -march=amdgcn -mcpu=tahiti  -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,MADMACF32,GFX6 %s
 ; RUN: llc -march=amdgcn -mcpu=tonga   -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,MADMACF32,GFX8 %s
 ; RUN: llc -march=amdgcn -mcpu=gfx900  -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,MADMACF32,GFX9 %s
+; RUN: llc -march=amdgcn -mcpu=gfx90a  -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,MADMACF32,GFX9 %s
 ; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,MADMACF32,GFX101 %s
 ; RUN: llc -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NOMADMACF32,GFX103 %s
 

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.dim.ll
index c219660bf79b..40bd98fae3e6 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.dim.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.dim.ll
@@ -1,5 +1,6 @@
 ; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX6789 %s
 ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX6789 %s
+; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
 ; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefix=GCN -check-prefix=GFX10 %s
 
 ; GCN-LABEL: {{^}}atomic_swap_1d:

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.gfx90a.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.gfx90a.ll
new file mode 100644
index 000000000000..6a21a91b4f6e
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.gfx90a.ll
@@ -0,0 +1,306 @@
+; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN %s
+
+; GCN-LABEL: {{^}}load_1d:
+; GCN: image_load v[0:3], v0, s[0:7] dmask:0xf unorm{{$}}
+define amdgpu_ps <4 x float> @load_1d(<8 x i32> inreg %rsrc, i32 %s) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32 15, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+; GCN-LABEL: {{^}}load_1d_lwe:
+; GCN: image_load v[0:4], v{{[0-9]+}}, s[0:7] dmask:0xf unorm lwe{{$}}
+define amdgpu_ps <4 x float> @load_1d_lwe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s) {
+main_body:
+  %v = call {<4 x float>, i32} @llvm.amdgcn.image.load.1d.v4f32i32.i32(i32 15, i32 %s, <8 x i32> %rsrc, i32 2, i32 0)
+  %v.vec = extractvalue {<4 x float>, i32} %v, 0
+  %v.err = extractvalue {<4 x float>, i32} %v, 1
+  store i32 %v.err, i32 addrspace(1)* %out, align 4
+  ret <4 x float> %v.vec
+}
+
+; GCN-LABEL: {{^}}load_2d:
+; GCN: image_load v[0:3], v[0:1], s[0:7] dmask:0xf unorm{{$}}
+define amdgpu_ps <4 x float> @load_2d(<8 x i32> inreg %rsrc, i32 %s, i32 %t) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i32(i32 15, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+; GCN-LABEL: {{^}}load_3d:
+; GCN: image_load v[0:3], v[0:2], s[0:7] dmask:0xf unorm{{$}}
+define amdgpu_ps <4 x float> @load_3d(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %r) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.load.3d.v4f32.i32(i32 15, i32 %s, i32 %t, i32 %r, <8 x i32> %rsrc, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+; GCN-LABEL: {{^}}load_cube:
+; GCN: image_load v[0:3], v[0:2], s[0:7] dmask:0xf unorm da{{$}}
+define amdgpu_ps <4 x float> @load_cube(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %slice) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.load.cube.v4f32.i32(i32 15, i32 %s, i32 %t, i32 %slice, <8 x i32> %rsrc, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+; GCN-LABEL: {{^}}load_cube_lwe:
+; GCN: image_load v[0:4], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf unorm lwe da{{$}}
+define amdgpu_ps <4 x float> @load_cube_lwe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %t, i32 %slice) {
+main_body:
+  %v = call {<4 x float>,i32} @llvm.amdgcn.image.load.cube.v4f32i32.i32(i32 15, i32 %s, i32 %t, i32 %slice, <8 x i32> %rsrc, i32 2, i32 0)
+  %v.vec = extractvalue {<4 x float>, i32} %v, 0
+  %v.err = extractvalue {<4 x float>, i32} %v, 1
+  store i32 %v.err, i32 addrspace(1)* %out, align 4
+  ret <4 x float> %v.vec
+}
+
+; GCN-LABEL: {{^}}load_1darray:
+; GCN: image_load v[0:3], v[0:1], s[0:7] dmask:0xf unorm da{{$}}
+define amdgpu_ps <4 x float> @load_1darray(<8 x i32> inreg %rsrc, i32 %s, i32 %slice) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.load.1darray.v4f32.i32(i32 15, i32 %s, i32 %slice, <8 x i32> %rsrc, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+; GCN-LABEL: {{^}}load_2darray:
+; GCN: image_load v[0:3], v[0:2], s[0:7] dmask:0xf unorm da{{$}}
+define amdgpu_ps <4 x float> @load_2darray(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %slice) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.load.2darray.v4f32.i32(i32 15, i32 %s, i32 %t, i32 %slice, <8 x i32> %rsrc, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+; GCN-LABEL: {{^}}load_2darray_lwe:
+; GCN: image_load v[0:4], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf unorm lwe da{{$}}
+define amdgpu_ps <4 x float> @load_2darray_lwe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %t, i32 %slice) {
+main_body:
+  %v = call {<4 x float>,i32} @llvm.amdgcn.image.load.2darray.v4f32i32.i32(i32 15, i32 %s, i32 %t, i32 %slice, <8 x i32> %rsrc, i32 2, i32 0)
+  %v.vec = extractvalue {<4 x float>, i32} %v, 0
+  %v.err = extractvalue {<4 x float>, i32} %v, 1
+  store i32 %v.err, i32 addrspace(1)* %out, align 4
+  ret <4 x float> %v.vec
+}
+
+; GCN-LABEL: {{^}}load_2dmsaa:
+; GCN: image_load v[0:3], v[0:2], s[0:7] dmask:0xf unorm{{$}}
+define amdgpu_ps <4 x float> @load_2dmsaa(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %fragid) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.load.2dmsaa.v4f32.i32(i32 15, i32 %s, i32 %t, i32 %fragid, <8 x i32> %rsrc, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+; GCN-LABEL: {{^}}load_2darraymsaa:
+; GCN: image_load v[0:3], v[0:3], s[0:7] dmask:0xf unorm da{{$}}
+define amdgpu_ps <4 x float> @load_2darraymsaa(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %slice, i32 %fragid) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.load.2darraymsaa.v4f32.i32(i32 15, i32 %s, i32 %t, i32 %slice, i32 %fragid, <8 x i32> %rsrc, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+; GCN-LABEL: {{^}}store_1d:
+; GCN: image_store v[0:3], v4, s[0:7] dmask:0xf unorm{{$}}
+define amdgpu_ps void @store_1d(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s) {
+main_body:
+  call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
+  ret void
+}
+
+; GCN-LABEL: {{^}}store_2d:
+; GCN: image_store v[0:3], v[4:5], s[0:7] dmask:0xf unorm{{$}}
+define amdgpu_ps void @store_2d(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s, i32 %t) {
+main_body:
+  call void @llvm.amdgcn.image.store.2d.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
+  ret void
+}
+
+; GCN-LABEL: {{^}}store_3d:
+; GCN: image_store v[0:3], v[4:6], s[0:7] dmask:0xf unorm{{$}}
+define amdgpu_ps void @store_3d(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s, i32 %t, i32 %r) {
+main_body:
+  call void @llvm.amdgcn.image.store.3d.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, i32 %t, i32 %r, <8 x i32> %rsrc, i32 0, i32 0)
+  ret void
+}
+
+; GCN-LABEL: {{^}}store_cube:
+; GCN: image_store v[0:3], v[4:6], s[0:7] dmask:0xf unorm da{{$}}
+define amdgpu_ps void @store_cube(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s, i32 %t, i32 %slice) {
+main_body:
+  call void @llvm.amdgcn.image.store.cube.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, i32 %t, i32 %slice, <8 x i32> %rsrc, i32 0, i32 0)
+  ret void
+}
+
+; GCN-LABEL: {{^}}store_1darray:
+; GCN: image_store v[0:3], v[4:5], s[0:7] dmask:0xf unorm da{{$}}
+define amdgpu_ps void @store_1darray(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s, i32 %slice) {
+main_body:
+  call void @llvm.amdgcn.image.store.1darray.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, i32 %slice, <8 x i32> %rsrc, i32 0, i32 0)
+  ret void
+}
+
+; GCN-LABEL: {{^}}store_2darray:
+; GCN: image_store v[0:3], v[4:6], s[0:7] dmask:0xf unorm da{{$}}
+define amdgpu_ps void @store_2darray(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s, i32 %t, i32 %slice) {
+main_body:
+  call void @llvm.amdgcn.image.store.2darray.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, i32 %t, i32 %slice, <8 x i32> %rsrc, i32 0, i32 0)
+  ret void
+}
+
+; GCN-LABEL: {{^}}store_2dmsaa:
+; GCN: image_store v[0:3], v[4:6], s[0:7] dmask:0xf unorm{{$}}
+define amdgpu_ps void @store_2dmsaa(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s, i32 %t, i32 %fragid) {
+main_body:
+  call void @llvm.amdgcn.image.store.2dmsaa.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, i32 %t, i32 %fragid, <8 x i32> %rsrc, i32 0, i32 0)
+  ret void
+}
+
+; GCN-LABEL: {{^}}store_2darraymsaa:
+; GCN: image_store v[0:3], v[4:7], s[0:7] dmask:0xf unorm da{{$}}
+define amdgpu_ps void @store_2darraymsaa(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s, i32 %t, i32 %slice, i32 %fragid) {
+main_body:
+  call void @llvm.amdgcn.image.store.2darraymsaa.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, i32 %t, i32 %slice, i32 %fragid, <8 x i32> %rsrc, i32 0, i32 0)
+  ret void
+}
+
+; GCN-LABEL: {{^}}load_1d_V1:
+; GCN: image_load v0, v0, s[0:7] dmask:0x8 unorm{{$}}
+define amdgpu_ps float @load_1d_V1(<8 x i32> inreg %rsrc, i32 %s) {
+main_body:
+  %v = call float @llvm.amdgcn.image.load.1d.f32.i32(i32 8, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
+  ret float %v
+}
+
+; GCN-LABEL: {{^}}load_1d_V2:
+; GCN: image_load v[0:1], v0, s[0:7] dmask:0x9 unorm{{$}}
+define amdgpu_ps <2 x float> @load_1d_V2(<8 x i32> inreg %rsrc, i32 %s) {
+main_body:
+  %v = call <2 x float> @llvm.amdgcn.image.load.1d.v2f32.i32(i32 9, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
+  ret <2 x float> %v
+}
+
+; GCN-LABEL: {{^}}store_1d_V1:
+; GCN: image_store v0, v1, s[0:7] dmask:0x2 unorm{{$}}
+define amdgpu_ps void @store_1d_V1(<8 x i32> inreg %rsrc, float %vdata, i32 %s) {
+main_body:
+  call void @llvm.amdgcn.image.store.1d.f32.i32(float %vdata, i32 2, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
+  ret void
+}
+
+; GCN-LABEL: {{^}}store_1d_V2:
+; GCN: image_store v[0:1], v2, s[0:7] dmask:0xc unorm{{$}}
+define amdgpu_ps void @store_1d_V2(<8 x i32> inreg %rsrc, <2 x float> %vdata, i32 %s) {
+main_body:
+  call void @llvm.amdgcn.image.store.1d.v2f32.i32(<2 x float> %vdata, i32 12, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
+  ret void
+}
+
+; GCN-LABEL: {{^}}load_1d_glc:
+; GCN: image_load v[0:3], v0, s[0:7] dmask:0xf unorm glc{{$}}
+define amdgpu_ps <4 x float> @load_1d_glc(<8 x i32> inreg %rsrc, i32 %s) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32 15, i32 %s, <8 x i32> %rsrc, i32 0, i32 1)
+  ret <4 x float> %v
+}
+
+; GCN-LABEL: {{^}}load_1d_slc:
+; GCN: image_load v[0:3], v0, s[0:7] dmask:0xf unorm slc{{$}}
+define amdgpu_ps <4 x float> @load_1d_slc(<8 x i32> inreg %rsrc, i32 %s) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32 15, i32 %s, <8 x i32> %rsrc, i32 0, i32 2)
+  ret <4 x float> %v
+}
+
+; GCN-LABEL: {{^}}load_1d_glc_slc:
+; GCN: image_load v[0:3], v0, s[0:7] dmask:0xf unorm glc slc{{$}}
+define amdgpu_ps <4 x float> @load_1d_glc_slc(<8 x i32> inreg %rsrc, i32 %s) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32 15, i32 %s, <8 x i32> %rsrc, i32 0, i32 3)
+  ret <4 x float> %v
+}
+
+; GCN-LABEL: {{^}}store_1d_glc:
+; GCN: image_store v[0:3], v4, s[0:7] dmask:0xf unorm glc{{$}}
+define amdgpu_ps void @store_1d_glc(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s) {
+main_body:
+  call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, <8 x i32> %rsrc, i32 0, i32 1)
+  ret void
+}
+
+; GCN-LABEL: {{^}}store_1d_slc:
+; GCN: image_store v[0:3], v4, s[0:7] dmask:0xf unorm slc{{$}}
+define amdgpu_ps void @store_1d_slc(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s) {
+main_body:
+  call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, <8 x i32> %rsrc, i32 0, i32 2)
+  ret void
+}
+
+; GCN-LABEL: {{^}}store_1d_glc_slc:
+; GCN: image_store v[0:3], v4, s[0:7] dmask:0xf unorm glc slc{{$}}
+define amdgpu_ps void @store_1d_glc_slc(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s) {
+main_body:
+  call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, <8 x i32> %rsrc, i32 0, i32 3)
+  ret void
+}
+
+; GCN-LABEL: {{^}}image_store_wait:
+; GCN: image_store v[0:3], v4, s[0:7] dmask:0xf
+; SI: s_waitcnt expcnt(0)
+; GCN: image_load v[0:3], v4, s[8:15] dmask:0xf
+; GCN: s_waitcnt vmcnt(0)
+; GCN: image_store v[0:3], v4, s[16:23] dmask:0xf
+define amdgpu_ps void @image_store_wait(<8 x i32> inreg %arg, <8 x i32> inreg %arg1, <8 x i32> inreg %arg2, <4 x float> %arg3, i32 %arg4) #0 {
+main_body:
+  call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> %arg3, i32 15, i32 %arg4, <8 x i32> %arg, i32 0, i32 0)
+  %data = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32 15, i32 %arg4, <8 x i32> %arg1, i32 0, i32 0)
+  call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> %data, i32 15, i32 %arg4, <8 x i32> %arg2, i32 0, i32 0)
+  ret void
+}
+
+; GCN-LABEL: image_load_mmo
+; GCN: image_load v1, v[4:5], s[0:7] dmask:0x1 unorm
+define amdgpu_ps float @image_load_mmo(<8 x i32> inreg %rsrc, float addrspace(3)* %lds, <2 x i32> %c) #0 {
+  store float 0.000000e+00, float addrspace(3)* %lds
+  %c0 = extractelement <2 x i32> %c, i32 0
+  %c1 = extractelement <2 x i32> %c, i32 1
+  %tex = call float @llvm.amdgcn.image.load.2d.f32.i32(i32 1, i32 %c0, i32 %c1, <8 x i32> %rsrc, i32 0, i32 0)
+  %tmp2 = getelementptr float, float addrspace(3)* %lds, i32 4
+  store float 0.000000e+00, float addrspace(3)* %tmp2
+  ret float %tex
+}
+
+declare <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32, i32, <8 x i32>, i32, i32) #1
+declare {float,i32} @llvm.amdgcn.image.load.1d.f32i32.i32(i32, i32, <8 x i32>, i32, i32) #1
+declare {<2 x float>,i32} @llvm.amdgcn.image.load.1d.v2f32i32.i32(i32, i32, <8 x i32>, i32, i32) #1
+declare {<4 x float>,i32} @llvm.amdgcn.image.load.1d.v4f32i32.i32(i32, i32, <8 x i32>, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i32(i32, i32, i32, <8 x i32>, i32, i32) #1
+declare {<4 x float>,i32} @llvm.amdgcn.image.load.2d.v4f32i32.i32(i32, i32, i32, <8 x i32>, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.load.3d.v4f32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1
+declare {<4 x float>,i32} @llvm.amdgcn.image.load.3d.v4f32i32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.load.cube.v4f32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1
+declare {<4 x float>,i32} @llvm.amdgcn.image.load.cube.v4f32i32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.load.1darray.v4f32.i32(i32, i32, i32, <8 x i32>, i32, i32) #1
+declare {<4 x float>,i32} @llvm.amdgcn.image.load.1darray.v4f32i32.i32(i32, i32, i32, <8 x i32>, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.load.2darray.v4f32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1
+declare {<4 x float>,i32} @llvm.amdgcn.image.load.2darray.v4f32i32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.load.2dmsaa.v4f32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1
+declare {<4 x float>,i32} @llvm.amdgcn.image.load.2dmsaa.v4f32i32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.load.2darraymsaa.v4f32.i32(i32, i32, i32, i32, i32, <8 x i32>, i32, i32) #1
+declare {<4 x float>,i32} @llvm.amdgcn.image.load.2darraymsaa.v4f32i32.i32(i32, i32, i32, i32, i32, <8 x i32>, i32, i32) #1
+
+declare void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float>, i32, i32, <8 x i32>, i32, i32) #0
+declare void @llvm.amdgcn.image.store.2d.v4f32.i32(<4 x float>, i32, i32, i32, <8 x i32>, i32, i32) #0
+declare void @llvm.amdgcn.image.store.3d.v4f32.i32(<4 x float>, i32, i32, i32, i32, <8 x i32>, i32, i32) #0
+declare void @llvm.amdgcn.image.store.cube.v4f32.i32(<4 x float>, i32, i32, i32, i32, <8 x i32>, i32, i32) #0
+declare void @llvm.amdgcn.image.store.1darray.v4f32.i32(<4 x float>, i32, i32, i32, <8 x i32>, i32, i32) #0
+declare void @llvm.amdgcn.image.store.2darray.v4f32.i32(<4 x float>, i32, i32, i32, i32, <8 x i32>, i32, i32) #0
+declare void @llvm.amdgcn.image.store.2dmsaa.v4f32.i32(<4 x float>, i32, i32, i32, i32, <8 x i32>, i32, i32) #0
+declare void @llvm.amdgcn.image.store.2darraymsaa.v4f32.i32(<4 x float>, i32, i32, i32, i32, i32, <8 x i32>, i32, i32) #0
+
+declare float @llvm.amdgcn.image.load.1d.f32.i32(i32, i32, <8 x i32>, i32, i32) #1
+declare float @llvm.amdgcn.image.load.2d.f32.i32(i32, i32, i32, <8 x i32>, i32, i32) #1
+declare <2 x float> @llvm.amdgcn.image.load.1d.v2f32.i32(i32, i32, <8 x i32>, i32, i32) #1
+declare void @llvm.amdgcn.image.store.1d.f32.i32(float, i32, i32, <8 x i32>, i32, i32) #0
+declare void @llvm.amdgcn.image.store.1d.v2f32.i32(<2 x float>, i32, i32, <8 x i32>, i32, i32) #0
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readonly }
+attributes #2 = { nounwind readnone }

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.gfx90a.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.gfx90a.ll
new file mode 100644
index 000000000000..dead4ecbaae7
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.gfx90a.ll
@@ -0,0 +1,74 @@
+; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A %s
+
+; GFX90A-LABEL: {{^}}sample_1d:
+; GFX90A-NOT: s_wqm_b64
+; GFX90A:     image_sample v[{{[0-9:]+}}], v{{[0-9]+}}, s[{{[0-9:]+}}], s[{{[0-9:]+}}] dmask:0xf
+define amdgpu_ps <4 x float> @sample_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+; GFX90A-LABEL: {{^}}sample_1d_lwe:
+; GFX90A-NOT: s_wqm_b64
+; GFX90A:     image_sample v[{{[0-9:]+}}], v{{[0-9]+}}, s[{{[0-9:]+}}], s[{{[0-9:]+}}] dmask:0xf lwe
+define amdgpu_ps <4 x float> @sample_1d_lwe(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 addrspace(1)* inreg %out, float %s) {
+main_body:
+  %v = call {<4 x float>,i32} @llvm.amdgcn.image.sample.1d.v4f32i32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 2, i32 0)
+  %v.vec = extractvalue {<4 x float>, i32} %v, 0
+  %v.err = extractvalue {<4 x float>, i32} %v, 1
+  store i32 %v.err, i32 addrspace(1)* %out, align 4
+  ret <4 x float> %v.vec
+}
+
+; GFX90A-LABEL: {{^}}sample_2d:
+; GFX90A-NOT: s_wqm_b64
+; GFX90A:     image_sample v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}] dmask:0xf
+define amdgpu_ps <4 x float> @sample_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+; GFX90A-LABEL: {{^}}sample_3d:
+; GFX90A-NOT: s_wqm_b64
+; GFX90A:     image_sample v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}] dmask:0xf
+define amdgpu_ps <4 x float> @sample_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t, float %r) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.sample.3d.v4f32.f32(i32 15, float %s, float %t, float %r, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+; GFX90A-LABEL: {{^}}sample_cube:
+; GFX90A-NOT: s_wqm_b64
+; GFX90A:     image_sample v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}] dmask:0xf da
+define amdgpu_ps <4 x float> @sample_cube(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t, float %face) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.sample.cube.v4f32.f32(i32 15, float %s, float %t, float %face, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+; GFX90A-LABEL: {{^}}sample_1darray:
+; GFX90A-NOT: s_wqm_b64
+; GFX90A:     image_sample v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}] dmask:0xf da
+define amdgpu_ps <4 x float> @sample_1darray(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %slice) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.sample.1darray.v4f32.f32(i32 15, float %s, float %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+; GFX90A-LABEL: {{^}}sample_1d_unorm:
+; GFX90A-NOT: s_wqm_b64
+; GFX90A:     image_sample v[{{[0-9:]+}}], v{{[0-9]+}}, s[{{[0-9:]+}}], s[{{[0-9:]+}}] dmask:0xf unorm
+define amdgpu_ps <4 x float> @sample_1d_unorm(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 1, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+declare <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32)
+declare {<4 x float>,i32} @llvm.amdgcn.image.sample.1d.v4f32i32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32)
+declare <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32)
+declare <4 x float> @llvm.amdgcn.image.sample.3d.v4f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32)
+declare <4 x float> @llvm.amdgcn.image.sample.cube.v4f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32)
+declare <4 x float> @llvm.amdgcn.image.sample.1darray.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32)

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.bf16.ll
new file mode 100644
index 000000000000..b04e73a93cbf
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.bf16.ll
@@ -0,0 +1,142 @@
+; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN %s
+; RUN: llc -march=amdgcn -mcpu=gfx908 -mattr=-mfma-inline-literal-bug -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN %s
+; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX90A %s
+
+declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x2bf16(<2 x i16>, <2 x i16>, <32 x float>, i32, i32, i32)
+declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x2bf16(<2 x i16>, <2 x i16>, <16 x float>, i32, i32, i32)
+declare <4 x float> @llvm.amdgcn.mfma.f32.4x4x2bf16(<2 x i16>, <2 x i16>, <4 x float>, i32, i32, i32)
+declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16(<2 x i16>, <2 x i16>, <16 x float>, i32, i32, i32)
+declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x8bf16(<2 x i16>, <2 x i16>, <4 x float>, i32, i32, i32)
+declare i32 @llvm.amdgcn.workitem.id.x()
+
+; GCN-LABEL: {{^}}test_mfma_f32_32x32x2bf16:
+; GCN-DAG:         v_mov_b32_e32 [[TWO:v[0-9]+]], 2
+; GCN-DAG:         v_mov_b32_e32 [[ONE:v[0-9]+]], 1
+; GCN-DAG:         s_load_dwordx16
+; GCN-DAG:         s_load_dwordx16
+; GCN-DAG:         v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GCN-DAG:         v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GCN-DAG:         v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GCN-DAG:         v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GCN-DAG:         v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GCN-DAG:         v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GCN-DAG:         v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GCN-DAG:         v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GCN-DAG:         v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GCN-DAG:         v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GCN-DAG:         v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GCN-DAG:         v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GCN-DAG:         v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GCN-DAG:         v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GCN-DAG:         v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GCN-DAG:         v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GCN-DAG:         v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GCN-DAG:         v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GCN-DAG:         v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GCN-DAG:         v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GCN-DAG:         v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GCN-DAG:         v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GCN-DAG:         v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GCN-DAG:         v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GCN-DAG:         v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GCN-DAG:         v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GCN-DAG:         v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GCN-DAG:         v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GCN-DAG:         v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GCN-DAG:         v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GCN-DAG:         v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GCN-DAG:         v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GCN:             v_mfma_f32_32x32x2bf16 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
+; GFX908-COUNT-32: v_accvgpr_read_b32
+; GFX908:          global_store_dwordx4
+; GFX90A-NOT:      v_accvgpr_read_b32
+; GFX90A-COUNT-8:  global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}],
+define amdgpu_kernel void @test_mfma_f32_32x32x2bf16(<32 x float> addrspace(1)* %arg) {
+bb:
+  %in.1 = load <32 x float>, <32 x float> addrspace(1)* %arg
+  %a = bitcast i32 1 to <2 x i16>
+  %b = bitcast i32 2 to <2 x i16>
+  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x2bf16(<2 x i16> %a, <2 x i16> %b, <32 x float> %in.1, i32 1, i32 2, i32 3)
+  store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_mfma_f32_16x16x2bf16:
+; GCN-DAG:         v_mov_b32_e32 [[TWO:v[0-9]+]], 2
+; GCN-DAG:         v_mov_b32_e32 [[ONE:v[0-9]+]], 1
+; GCN:             s_load_dwordx16
+; GCN-COUNT-16:    v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GCN:             v_mfma_f32_16x16x2bf16 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
+; GFX908-COUNT-16: v_accvgpr_read_b32
+; GFX908:          global_store_dwordx4
+; GFX90A-NOT:      v_accvgpr_read_b32
+; GFX90A-COUNT-4:  global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}],
+define amdgpu_kernel void @test_mfma_f32_16x16x2bf16(<16 x float> addrspace(1)* %arg) {
+bb:
+  %in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg
+  %a = bitcast i32 1 to <2 x i16>
+  %b = bitcast i32 2 to <2 x i16>
+  %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x2bf16(<2 x i16> %a, <2 x i16> %b, <16 x float> %in.1, i32 1, i32 2, i32 3)
+  store <16 x float> %mai.1, <16 x float> addrspace(1)* %arg
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_mfma_f32_4x4x2bf16:
+; GCN-DAG:        v_mov_b32_e32 [[TWO:v[0-9]+]], 2
+; GCN-DAG:        v_mov_b32_e32 [[ONE:v[0-9]+]], 1
+; GCN:            s_load_dwordx4
+; GCN-COUNT-4:    v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GCN:            v_mfma_f32_4x4x2bf16 [[RES:a\[[0-9]+:[0-9]+\]]], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
+; GFX908-COUNT-4: v_accvgpr_read_b32
+; GFX908:         global_store_dwordx4
+; GFX90A-NOT:     v_accvgpr_read_b32
+; GFX90A:         global_store_dwordx4 v{{[0-9]+}}, [[RES]],
+define amdgpu_kernel void @test_mfma_f32_4x4x2bf16(<4 x float> addrspace(1)* %arg) {
+bb:
+  %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg
+  %a = bitcast i32 1 to <2 x i16>
+  %b = bitcast i32 2 to <2 x i16>
+  %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x2bf16(<2 x i16> %a, <2 x i16> %b, <4 x float> %in.1, i32 1, i32 2, i32 3)
+  store <4 x float> %mai.1, <4 x float> addrspace(1)* %arg
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_mfma_f32_32x32x4bf16:
+; GCN-DAG:         v_mov_b32_e32 [[TWO:v[0-9]+]], 2
+; GCN-DAG:         v_mov_b32_e32 [[ONE:v[0-9]+]], 1
+; GCN:             s_load_dwordx16
+; GCN-COUNT-16:    v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GCN:             v_mfma_f32_32x32x4bf16 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
+; GFX908-COUNT-16: v_accvgpr_read_b32
+; GFX908:          global_store_dwordx4
+; GFX90A-NOT:      v_accvgpr_read_b32
+; GFX90A-COUNT-4:  global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}],
+define amdgpu_kernel void @test_mfma_f32_32x32x4bf16(<16 x float> addrspace(1)* %arg) {
+bb:
+  %in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg
+  %a = bitcast i32 1 to <2 x i16>
+  %b = bitcast i32 2 to <2 x i16>
+  %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16(<2 x i16> %a, <2 x i16> %b, <16 x float> %in.1, i32 1, i32 2, i32 3)
+  store <16 x float> %mai.1, <16 x float> addrspace(1)* %arg
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_mfma_f32_16x16x8bf16:
+; GCN-DAG:        v_mov_b32_e32 [[TWO:v[0-9]+]], 2
+; GCN-DAG:        v_mov_b32_e32 [[ONE:v[0-9]+]], 1
+; GCN:            s_load_dwordx4
+; GCN-COUNT-4:    v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GCN:            v_mfma_f32_16x16x8bf16 [[RES:a\[[0-9]+:[0-9]+\]]], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
+; GFX908-COUNT-4: v_accvgpr_read_b32
+; GFX908:         global_store_dwordx4
+; GFX90A-NOT:     v_accvgpr_read_b32
+; GFX90A:         global_store_dwordx4 v{{[0-9]+}}, [[RES]],
+define amdgpu_kernel void @test_mfma_f32_16x16x8bf16(<4 x float> addrspace(1)* %arg) {
+bb:
+  %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg
+  %a = bitcast i32 1 to <2 x i16>
+  %b = bitcast i32 2 to <2 x i16>
+  %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x8bf16(<2 x i16> %a, <2 x i16> %b, <4 x float> %in.1, i32 1, i32 2, i32 3)
+  store <4 x float> %mai.1, <4 x float> addrspace(1)* %arg
+  ret void
+}

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll
new file mode 100644
index 000000000000..a7b2e9879d9d
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll
@@ -0,0 +1,194 @@
+; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX90A %s
+
+declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16.1k(<4 x i16>, <4 x i16>, <32 x float>, i32, i32, i32)
+declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x4bf16.1k(<4 x i16>, <4 x i16>, <16 x float>, i32, i32, i32)
+declare <4 x float> @llvm.amdgcn.mfma.f32.4x4x4bf16.1k(<4 x i16>, <4 x i16>, <4 x float>, i32, i32, i32)
+declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x8bf16.1k(<4 x i16>, <4 x i16>, <16 x float>, i32, i32, i32)
+declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x16bf16.1k(<4 x i16>, <4 x i16>, <4 x float>, i32, i32, i32)
+declare <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double, double, <4 x double>, i32, i32, i32)
+declare double @llvm.amdgcn.mfma.f64.4x4x4f64(double, double, double, i32, i32, i32)
+declare i32 @llvm.amdgcn.workitem.id.x()
+
+; GCN-LABEL: {{^}}test_mfma_f32_32x32x4bf16_1k:
+; GCN-DAG:     s_load_dwordx16
+; GCN-DAG:     s_load_dwordx16
+; GFX90A-DAG:  v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GFX90A-DAG:  v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GFX90A-DAG:  v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GFX90A-DAG:  v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GFX90A-DAG:  v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GFX90A-DAG:  v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GFX90A-DAG:  v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GFX90A-DAG:  v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GFX90A-DAG:  v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GFX90A-DAG:  v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GFX90A-DAG:  v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GFX90A-DAG:  v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GFX90A-DAG:  v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GFX90A-DAG:  v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GFX90A-DAG:  v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GFX90A-DAG:  v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GFX90A-DAG:  v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GFX90A-DAG:  v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GFX90A-DAG:  v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GFX90A-DAG:  v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GFX90A-DAG:  v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GFX90A-DAG:  v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GFX90A-DAG:  v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GFX90A-DAG:  v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GFX90A-DAG:  v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GFX90A-DAG:  v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GFX90A-DAG:  v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GFX90A-DAG:  v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GFX90A-DAG:  v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GFX90A-DAG:  v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GFX90A-DAG:  v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GFX90A-DAG:  v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GFX90A-DAG:  v_mov_b32_e32 v[[TWO:[0-9]+]], 2
+; GFX90A-DAG:  v_mov_b32_e32 v[[ONE:[0-9]+]], 1
+; GFX90A:      v_mfma_f32_32x32x4bf16_1k a[{{[0-9]+:[0-9]+}}], v{{\[}}[[ONE]]:{{[0-9+]}}], v{{\[}}[[TWO]]:{{[0-9+]}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
+; GCN-NOT:     v_accvgpr_read_b32
+; GCN-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
+define amdgpu_kernel void @test_mfma_f32_32x32x4bf16_1k(<32 x float> addrspace(1)* %arg) {
+bb:
+  %in.1 = load <32 x float>, <32 x float> addrspace(1)* %arg
+  %a = bitcast i64 1 to <4 x i16>
+  %b = bitcast i64 2 to <4 x i16>
+  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16.1k(<4 x i16> %a, <4 x i16> %b, <32 x float> %in.1, i32 1, i32 2, i32 3)
+  store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_mfma_f32_16x16x4bf16_1k:
+; GCN-DAG:         s_load_dwordx16
+; GCN-DAG:         v_mov_b32_e32 v[[TWO:[0-9]+]], 2
+; GCN-DAG:         v_mov_b32_e32 v[[ONE:[0-9]+]], 1
+; GFX90A-COUNT-16: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GFX90A:          v_mfma_f32_16x16x4bf16_1k a[{{[0-9]+:[0-9]+}}], v{{\[}}[[ONE]]:{{[0-9+]}}], v{{\[}}[[TWO]]:{{[0-9+]}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
+; GCN-NOT:         v_accvgpr_read_b32
+; GCN-COUNT-4:     global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
+define amdgpu_kernel void @test_mfma_f32_16x16x4bf16_1k(<16 x float> addrspace(1)* %arg) {
+bb:
+  %in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg
+  %a = bitcast i64 1 to <4 x i16>
+  %b = bitcast i64 2 to <4 x i16>
+  %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x4bf16.1k(<4 x i16> %a, <4 x i16> %b, <16 x float> %in.1, i32 1, i32 2, i32 3)
+  store <16 x float> %mai.1, <16 x float> addrspace(1)* %arg
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_mfma_f32_4x4x4bf16_1k:
+; GCN-DAG:        s_load_dwordx4
+; GCN-DAG:        v_mov_b32_e32 v[[TWO:[0-9]+]], 2
+; GCN-DAG:        v_mov_b32_e32 v[[ONE:[0-9]+]], 1
+; GFX90A-COUNT-4: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GFX90A:         v_mfma_f32_4x4x4bf16_1k [[RES:a\[[0-9]+:[0-9]+\]]], v{{\[}}[[ONE]]:{{[0-9+]}}], v{{\[}}[[TWO]]:{{[0-9+]}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
+; GCN-NOT:        v_accvgpr_read_b32
+; GCN:            global_store_dwordx4 v{{[0-9]+}}, [[RES]],
+define amdgpu_kernel void @test_mfma_f32_4x4x4bf16_1k(<4 x float> addrspace(1)* %arg) {
+bb:
+  %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg
+  %a = bitcast i64 1 to <4 x i16>
+  %b = bitcast i64 2 to <4 x i16>
+  %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x4bf16.1k(<4 x i16> %a, <4 x i16> %b, <4 x float> %in.1, i32 1, i32 2, i32 3)
+  store <4 x float> %mai.1, <4 x float> addrspace(1)* %arg
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_mfma_f32_32x32x8bf16_1k:
+; GCN-DAG:         s_load_dwordx16
+; GCN-DAG:         v_mov_b32_e32 v[[TWO:[0-9]+]], 2
+; GCN-DAG:         v_mov_b32_e32 v[[ONE:[0-9]+]], 1
+; GFX90A-COUNT-16: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GFX90A:          v_mfma_f32_32x32x8bf16_1k a[{{[0-9]+:[0-9]+}}], v{{\[}}[[ONE]]:{{[0-9+]}}], v{{\[}}[[TWO]]:{{[0-9+]}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
+; GCN-NOT:         v_accvgpr_read_b32
+; GCN-COUNT-4:     global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
+define amdgpu_kernel void @test_mfma_f32_32x32x8bf16_1k(<16 x float> addrspace(1)* %arg) {
+bb:
+  %in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg
+  %a = bitcast i64 1 to <4 x i16>
+  %b = bitcast i64 2 to <4 x i16>
+  %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x8bf16.1k(<4 x i16> %a, <4 x i16> %b, <16 x float> %in.1, i32 1, i32 2, i32 3)
+  store <16 x float> %mai.1, <16 x float> addrspace(1)* %arg
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_mfma_f32_16x16x16bf16_1k:
+; GCN-DAG:        s_load_dwordx4
+; GCN-DAG:        v_mov_b32_e32 v[[TWO:[0-9]+]], 2
+; GCN-DAG:        v_mov_b32_e32 v[[ONE:[0-9]+]], 1
+; GFX90A-COUNT-4: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GFX90A:         v_mfma_f32_16x16x16bf16_1k [[RES:a\[[0-9]+:[0-9]+\]]], v{{\[}}[[ONE]]:{{[0-9+]}}], v{{\[}}[[TWO]]:{{[0-9+]}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
+; GCN-NOT:        v_accvgpr_read_b32
+; GCN:            global_store_dwordx4 v{{[0-9]+}}, [[RES]],
+define amdgpu_kernel void @test_mfma_f32_16x16x16bf16_1k(<4 x float> addrspace(1)* %arg) {
+bb:
+  %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg
+  %a = bitcast i64 1 to <4 x i16>
+  %b = bitcast i64 2 to <4 x i16>
+  %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16bf16.1k(<4 x i16> %a, <4 x i16> %b, <4 x float> %in.1, i32 1, i32 2, i32 3)
+  store <4 x float> %mai.1, <4 x float> addrspace(1)* %arg
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_mfma_f64_4x4x4f64:
+; GFX90A: v_mfma_f64_4x4x4f64 [[M1:a\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], 0{{$}}
+; GFX90A: v_mfma_f64_4x4x4f64 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], [[M1]] cbsz:1 abid:2 blgp:3
+; GCN:    global_store_dwordx2
+define amdgpu_kernel void @test_mfma_f64_4x4x4f64(double addrspace(1)* %arg, double %a, double %b) {
+bb:
+  %mai.1 = tail call double @llvm.amdgcn.mfma.f64.4x4x4f64(double %a, double %b, double 0.0, i32 0, i32 0, i32 0)
+  %mai.2 = tail call double @llvm.amdgcn.mfma.f64.4x4x4f64(double %a, double %b, double %mai.1, i32 1, i32 2, i32 3)
+  store double %mai.2, double addrspace(1)* %arg
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_mfma_f64_16x16x4f64:
+; GCN:    s_load_dwordx8
+; GFX90A: v_mfma_f64_16x16x4f64 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
+; GCN:    global_store_dwordx4
+; GCN:    global_store_dwordx4
+define amdgpu_kernel void @test_mfma_f64_16x16x4f64(<4 x double> addrspace(1)* %arg, double %a, double %b) {
+bb:
+  %in.1 = load <4 x double>, <4 x double> addrspace(1)* %arg
+  %mai.1 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> %in.1, i32 1, i32 2, i32 3)
+  store <4 x double> %mai.1, <4 x double> addrspace(1)* %arg
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_mfma_f64_16x16x4f64_splat_imm:
+; GFX90A: v_mfma_f64_16x16x4f64 [[M1:a\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], 0{{$}}
+; GFX90A: v_mfma_f64_16x16x4f64 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], [[M1]] cbsz:1 abid:2 blgp:3
+; GCN:    global_store_dwordx4
+; GCN:    global_store_dwordx4
+define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm(<4 x double> addrspace(1)* %arg, double %a, double %b) {
+bb:
+  %mai.1 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> <double 0.0, double 0.0, double 0.0, double 0.0>, i32 0, i32 0, i32 0)
+  %mai.2 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> %mai.1, i32 1, i32 2, i32 3)
+  store <4 x double> %mai.2, <4 x double> addrspace(1)* %arg
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_mfma_f64_16x16x4f64_imm:
+; GFX90A: v_mfma_f64_16x16x4f64 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], a[{{[0-9]+:[0-9]+}}]{{$}}
+; GCN:    global_store_dwordx4
+; GCN:    global_store_dwordx4
+define amdgpu_kernel void @test_mfma_f64_16x16x4f64_imm(<4 x double> addrspace(1)* %arg, double %a, double %b) {
+bb:
+  %mai.1 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> <double 0.0, double 0.0, double 0.0, double 1.0>, i32 0, i32 0, i32 0)
+  store <4 x double> %mai.1, <4 x double> addrspace(1)* %arg
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_mfma_f64_16x16x4f64_splat_lit:
+; GCN-DAG:    v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
+; GFX90A-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x405ec000
+; GFX90A:     v_mfma_f64_16x16x4f64 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], a[{{[0-9]+:[0-9]+}}]{{$}}
+; GCN:        global_store_dwordx4
+; GCN:        global_store_dwordx4
+define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_lit(<4 x double> addrspace(1)* %arg, double %a, double %b) {
+bb:
+  %mai.1 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> <double 123.0, double 123.0, double 123.0, double 123.0>, i32 0, i32 0, i32 0)
+  store <4 x double> %mai.1, <4 x double> addrspace(1)* %arg
+  ret void
+}

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.i8.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.i8.ll
new file mode 100644
index 000000000000..b3213dce2b59
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.i8.ll
@@ -0,0 +1,42 @@
+; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN %s
+; RUN: llc -march=amdgcn -mcpu=gfx908 -mattr=-mfma-inline-literal-bug -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN %s
+; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX90A %s
+
+declare <16 x i32> @llvm.amdgcn.mfma.i32.32x32x8i8(i32, i32, <16 x i32>, i32, i32, i32)
+declare <4 x i32> @llvm.amdgcn.mfma.i32.16x16x16i8(i32, i32, <4 x i32>, i32, i32, i32)
+
+; GCN-LABEL: {{^}}test_mfma_i32_32x32x8i8:
+; GCN-DAG:         v_mov_b32_e32 [[TWO:v[0-9]+]], 2
+; GCN-DAG:         v_mov_b32_e32 [[ONE:v[0-9]+]], 1
+; GCN:             s_load_dwordx16
+; GCN-COUNT-16:    v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GCN:             v_mfma_i32_32x32x8i8 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
+; GFX908-COUNT-16: v_accvgpr_read_b32
+; GFX908:          global_store_dwordx4
+; GFX90A-NOT:      v_accvgpr_read_b32
+; GFX90A-COUNT-4:  global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
+define amdgpu_kernel void @test_mfma_i32_32x32x8i8(<16 x i32> addrspace(1)* %arg) {
+bb:
+  %in.1 = load <16 x i32>, <16 x i32> addrspace(1)* %arg
+  %mai.1 = tail call <16 x i32> @llvm.amdgcn.mfma.i32.32x32x8i8(i32 1, i32 2, <16 x i32> %in.1, i32 1, i32 2, i32 3)
+  store <16 x i32> %mai.1, <16 x i32> addrspace(1)* %arg
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_mfma_i32_16x16x16i8:
+; GCN-DAG:        v_mov_b32_e32 [[TWO:v[0-9]+]], 2
+; GCN-DAG:        v_mov_b32_e32 [[ONE:v[0-9]+]], 1
+; GCN:            s_load_dwordx4
+; GCN-COUNT-4:    v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GCN:            v_mfma_i32_16x16x16i8 [[RES:a\[[0-9]+:[0-9]+\]]], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
+; GFX908-COUNT-4: v_accvgpr_read_b32
+; GFX908:         global_store_dwordx4
+; GFX90A-NOT:     v_accvgpr_read_b32
+; GFX90A:         global_store_dwordx4 v{{[0-9]+}}, [[RES]]
+define amdgpu_kernel void @test_mfma_i32_16x16x16i8(<4 x i32> addrspace(1)* %arg) {
+bb:
+  %in.1 = load <4 x i32>, <4 x i32> addrspace(1)* %arg
+  %mai.1 = tail call <4 x i32> @llvm.amdgcn.mfma.i32.16x16x16i8(i32 1, i32 2, <4 x i32> %in.1, i32 1, i32 2, i32 3)
+  store <4 x i32> %mai.1, <4 x i32> addrspace(1)* %arg
+  ret void
+}

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll
index ddb3f3f3c42c..35f7c1859d2a 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll
@@ -1,5 +1,6 @@
-; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,NOLIT-SRCC %s
-; RUN: llc -march=amdgcn -mcpu=gfx908 -mattr=-mfma-inline-literal-bug -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,LIT-SRCC %s
+; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,NOLIT-SRCC,GFX908,GFX908_A %s
+; RUN: llc -march=amdgcn -mcpu=gfx908 -mattr=-mfma-inline-literal-bug -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,LIT-SRCC,GFX908,GFX908_A %s
+; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX90A,GFX908_A %s
 
 declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32, i32, i32)
 declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float, float, <16 x float>, i32, i32, i32)
@@ -14,93 +15,56 @@ declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half>, <4 x half>, <4
 declare <32 x i32> @llvm.amdgcn.mfma.i32.32x32x4i8(i32, i32, <32 x i32>, i32, i32, i32)
 declare <16 x i32> @llvm.amdgcn.mfma.i32.16x16x4i8(i32, i32, <16 x i32>, i32, i32, i32)
 declare <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32, i32, <4 x i32>, i32, i32, i32)
-declare <16 x i32> @llvm.amdgcn.mfma.i32.32x32x8i8(i32, i32, <16 x i32>, i32, i32, i32)
-declare <4 x i32> @llvm.amdgcn.mfma.i32.16x16x16i8(i32, i32, <4 x i32>, i32, i32, i32)
-declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x2bf16(<2 x i16>, <2 x i16>, <32 x float>, i32, i32, i32)
-declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x2bf16(<2 x i16>, <2 x i16>, <16 x float>, i32, i32, i32)
-declare <4 x float> @llvm.amdgcn.mfma.f32.4x4x2bf16(<2 x i16>, <2 x i16>, <4 x float>, i32, i32, i32)
-declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16(<2 x i16>, <2 x i16>, <16 x float>, i32, i32, i32)
-declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x8bf16(<2 x i16>, <2 x i16>, <4 x float>, i32, i32, i32)
 declare i32 @llvm.amdgcn.workitem.id.x()
 
 ; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32:
-; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2.0
-; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1.0
-; GCN-DAG: s_load_dwordx16
-; GCN-DAG: s_load_dwordx16
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_mfma_f32_32x32x1f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: global_store_dwordx4
-; GCN-DAG: global_store_dwordx4
-; GCN-DAG: global_store_dwordx4
-; GCN-DAG: global_store_dwordx4
-; GCN-DAG: global_store_dwordx4
-; GCN-DAG: global_store_dwordx4
-; GCN-DAG: global_store_dwordx4
-; GCN-DAG: global_store_dwordx4
+; GCN-DAG:        v_mov_b32_e32 [[TWO:v[0-9]+]], 2.0
+; GCN-DAG:        v_mov_b32_e32 [[ONE:v[0-9]+]], 1.0
+; GCN-DAG:        s_load_dwordx16
+; GCN-DAG:        s_load_dwordx16
+; GFX908_A-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GFX908_A-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GFX908_A-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GFX908_A-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GFX908_A-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GFX908_A-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GFX908_A-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GFX908_A-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GFX908_A-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GFX908_A-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GFX908_A-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GFX908_A-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GFX908_A-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GFX908_A-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GFX908_A-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GFX908_A-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GFX908_A-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GFX908_A-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GFX908_A-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GFX908_A-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GFX908_A-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GFX908_A-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GFX908_A-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GFX908_A-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GFX908_A-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GFX908_A-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GFX908_A-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GFX908_A-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GFX908_A-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GFX908_A-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GFX908_A-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GFX908_A-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GFX908_A:       v_mfma_f32_32x32x1f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
+; GFX908-COUNT-4: v_accvgpr_read_b32
+; GFX908-COUNT-2: global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}]
+; GFX908-COUNT-4: v_accvgpr_read_b32
+; GFX908-COUNT-2: global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}]
+; GFX908-COUNT-4: v_accvgpr_read_b32
+; GFX908-COUNT-2: global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}]
+; GFX908-COUNT-4: v_accvgpr_read_b32
+; GFX908-COUNT-2: global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}]
+; GFX90A-NOT:     v_accvgpr_read_b32
+; GFX90A-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
 define amdgpu_kernel void @test_mfma_f32_32x32x1f32(<32 x float> addrspace(1)* %arg) {
 bb:
   %in.1 = load <32 x float>, <32 x float> addrspace(1)* %arg
@@ -110,46 +74,15 @@ bb:
 }
 
 ; GCN-LABEL: {{^}}test_mfma_f32_16x16x1f32:
-; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2.0
-; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1.0
-; GCN: s_load_dwordx16
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_mfma_f32_16x16x1f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: global_store_dwordx4
-; GCN-DAG: global_store_dwordx4
-; GCN-DAG: global_store_dwordx4
-; GCN-DAG: global_store_dwordx4
+; GCN-DAG:           v_mov_b32_e32 [[TWO:v[0-9]+]], 2.0
+; GCN-DAG:           v_mov_b32_e32 [[ONE:v[0-9]+]], 1.0
+; GCN:               s_load_dwordx16
+; GFX908_A-COUNT-16: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GFX908_A:          v_mfma_f32_16x16x1f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
+; GFX908-COUNT:      v_accvgpr_read_b32
+; GFX908-COUNT-4:    global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}]
+; GFX90A-NOT:        v_accvgpr_read_b32
+; GFX90A-COUNT-4:    global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
 define amdgpu_kernel void @test_mfma_f32_16x16x1f32(<16 x float> addrspace(1)* %arg) {
 bb:
   %in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg
@@ -159,19 +92,15 @@ bb:
 }
 
 ; GCN-LABEL: {{^}}test_mfma_f32_4x4x1f32:
-; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2.0
-; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1.0
-; GCN: s_load_dwordx4
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_mfma_f32_4x4x1f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
-; GCN: v_accvgpr_read_b32
-; GCN: v_accvgpr_read_b32
-; GCN: v_accvgpr_read_b32
-; GCN: v_accvgpr_read_b32
-; GCN: global_store_dwordx4
+; GCN-DAG:          v_mov_b32_e32 [[TWO:v[0-9]+]], 2.0
+; GCN-DAG:          v_mov_b32_e32 [[ONE:v[0-9]+]], 1.0
+; GCN:              s_load_dwordx4
+; GFX908_A-COUNT-4: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GFX908_A:         v_mfma_f32_4x4x1f32 [[RES:a\[[0-9]+:[0-9]+\]]], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
+; GFX908-COUNT-4:   v_accvgpr_read_b32
+; GFX908:           global_store_dwordx4
+; GFX90A-NOT:       v_accvgpr_read_b32
+; GFX90A:           global_store_dwordx4 {{v[0-9]+}}, [[RES]]
 define amdgpu_kernel void @test_mfma_f32_4x4x1f32(<4 x float> addrspace(1)* %arg) {
 bb:
   %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg
@@ -181,46 +110,15 @@ bb:
 }
 
 ; GCN-LABEL: {{^}}test_mfma_f32_32x32x2f32:
-; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2.0
-; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1.0
-; GCN: s_load_dwordx16
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_mfma_f32_32x32x2f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: global_store_dwordx4
-; GCN-DAG: global_store_dwordx4
-; GCN-DAG: global_store_dwordx4
-; GCN-DAG: global_store_dwordx4
+; GCN-DAG:           v_mov_b32_e32 [[TWO:v[0-9]+]], 2.0
+; GCN-DAG:           v_mov_b32_e32 [[ONE:v[0-9]+]], 1.0
+; GCN:               s_load_dwordx16
+; GFX908_A-COUNT-16: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GFX908_A:          v_mfma_f32_32x32x2f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
+; GFX908-COUNT-16:   v_accvgpr_read_b32
+; GFX908-COUNT-4:    global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}]
+; GFX90A-NOT:        v_accvgpr_read_b32
+; GFX90A-COUNT-4:    global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
 define amdgpu_kernel void @test_mfma_f32_32x32x2f32(<16 x float> addrspace(1)* %arg) {
 bb:
   %in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg
@@ -230,19 +128,15 @@ bb:
 }
 
 ; GCN-LABEL: {{^}}test_mfma_f32_16x16x4f32:
-; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2.0
-; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1.0
-; GCN: s_load_dwordx4
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_mfma_f32_16x16x4f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: global_store_dwordx4
+; GCN-DAG:          v_mov_b32_e32 [[TWO:v[0-9]+]], 2.0
+; GCN-DAG:          v_mov_b32_e32 [[ONE:v[0-9]+]], 1.0
+; GCN:              s_load_dwordx4
+; GFX908_A-COUNT-4: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GFX908_A:         v_mfma_f32_16x16x4f32 [[RES:a\[[0-9]+:[0-9]+\]]], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
+; GFX908-COUNT-4:   v_accvgpr_read_b32
+; GFX908:           global_store_dwordx4
+; GFX90A-NOT:       v_accvgpr_read_b32
+; GFX90A:           global_store_dwordx4 {{v[0-9]+}}, [[RES]],
 define amdgpu_kernel void @test_mfma_f32_16x16x4f32(<4 x float> addrspace(1)* %arg) {
 bb:
   %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg
@@ -252,81 +146,14 @@ bb:
 }
 
 ; GCN-LABEL: {{^}}test_mfma_f32_32x32x4f16:
-; GCN-DAG: s_load_dwordx16
-; GCN-DAG: s_load_dwordx16
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_mfma_f32_32x32x4f16 a[{{[0-9]+:[0-9]+}}], {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: global_store_dwordx4
-; GCN-DAG: global_store_dwordx4
-; GCN-DAG: global_store_dwordx4
-; GCN-DAG: global_store_dwordx4
-; GCN-DAG: global_store_dwordx4
-; GCN-DAG: global_store_dwordx4
-; GCN-DAG: global_store_dwordx4
-; GCN-DAG: global_store_dwordx4
+; GCN-DAG:           s_load_dwordx16
+; GCN-DAG:           s_load_dwordx16
+; GFX908_A-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GFX908_A:          v_mfma_f32_32x32x4f16 a[{{[0-9]+:[0-9]+}}], {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
+; GFX908-COUNT-32:   v_accvgpr_read_b32
+; GFX908:            global_store_dwordx4
+; GFX90A-NOT:        v_accvgpr_read_b32
+; GFX90A-COUNT-8:    global_store_dwordx4 {{v[0-9]+}}, a[{{[0-9:]+}}],
 define amdgpu_kernel void @test_mfma_f32_32x32x4f16(<32 x float> addrspace(1)* %arg, <4 x half> addrspace(1)* %c) {
 bb:
   %in.1 = load <32 x float>, <32 x float> addrspace(1)* %arg
@@ -339,44 +166,13 @@ bb:
 }
 
 ; GCN-LABEL: {{^}}test_mfma_f32_16x16x4f16:
-; GCN: s_load_dwordx16
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_mfma_f32_16x16x4f16 a[{{[0-9]+:[0-9]+}}], {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: global_store_dwordx4
-; GCN-DAG: global_store_dwordx4
-; GCN-DAG: global_store_dwordx4
-; GCN-DAG: global_store_dwordx4
+; GCN:               s_load_dwordx16
+; GFX908_A-COUNT-16: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GFX908_A:          v_mfma_f32_16x16x4f16 a[{{[0-9]+:[0-9]+}}], {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
+; GFX908-COUNT-16:   v_accvgpr_read_b32
+; GFX908:            global_store_dwordx4
+; GFX90A-NOT:        v_accvgpr_read_b32
+; GFX90A-COUNT-4:    global_store_dwordx4 {{v[0-9]+}}, a[{{[0-9:]+}}],
 define amdgpu_kernel void @test_mfma_f32_16x16x4f16(<16 x float> addrspace(1)* %arg, <4 x half> addrspace(1)* %c) {
 bb:
   %in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg
@@ -389,19 +185,15 @@ bb:
 }
 
 ; GCN-LABEL: {{^}}test_mfma_f32_4x4x4f16:
-; GCN: s_load_dwordx4
-; GCN: s_load_dwordx2
-; GCN: s_load_dwordx2
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_mfma_f32_4x4x4f16 a[{{[0-9]+:[0-9]+}}], {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: global_store_dwordx4
+; GCN:              s_load_dwordx4
+; GCN:              s_load_dwordx2
+; GCN:              s_load_dwordx2
+; GFX908_A-COUNT-4: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GFX908_A:         v_mfma_f32_4x4x4f16 [[RES:a\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
+; GFX908-COUNT-4:   v_accvgpr_read_b32
+; GFX908:           global_store_dwordx4
+; GFX90A-NOT:       v_accvgpr_read_b32
+; GFX90A:           global_store_dwordx4 {{v[0-9]+}}, [[RES]],
 define amdgpu_kernel void @test_mfma_f32_4x4x4f16(<4 x float> addrspace(1)* %arg, <4 x half> addrspace(1)* %c) {
 bb:
   %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg
@@ -414,46 +206,15 @@ bb:
 }
 
 ; GCN-LABEL: {{^}}test_mfma_f32_32x32x8f16:
-; GCN: s_load_dwordx16
-; GCN: s_waitcnt lgkmcnt(0)
-; GCN: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_mfma_f32_32x32x8f16 a[{{[0-9]+:[0-9]+}}], {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: global_store_dwordx4
-; GCN-DAG: global_store_dwordx4
-; GCN-DAG: global_store_dwordx4
-; GCN-DAG: global_store_dwordx4
+; GCN:               s_load_dwordx16
+; GCN:               s_waitcnt lgkmcnt(0)
+; GFX908_A:          v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
+; GFX908_A-COUNT-16: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GFX908_A:          v_mfma_f32_32x32x8f16 a[{{[0-9]+:[0-9]+}}], {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
+; GFX908-COUNT-16:   v_accvgpr_read_b32
+; GFX908:            global_store_dwordx4
+; GFX90A-NOT:        v_accvgpr_read_b32
+; GFX90A-COUNT-4:    global_store_dwordx4 {{v[0-9]+}}, a[{{[0-9:]+}}],
 define amdgpu_kernel void @test_mfma_f32_32x32x8f16(<16 x float> addrspace(1)* %arg, <4 x half> addrspace(1)* %c) {
 bb:
   %in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg
@@ -466,18 +227,14 @@ bb:
 }
 
 ; GCN-LABEL: {{^}}test_mfma_f32_16x16x16f16:
-; GCN: s_load_dwordx4
-; GCN: s_load_dwordx4
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_mfma_f32_16x16x16f16 a[{{[0-9]+:[0-9]+}}], {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: global_store_dwordx4
+; GCN:              s_load_dwordx4
+; GCN:              s_load_dwordx4
+; GFX908_A-COUNT-4: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GFX908_A:         v_mfma_f32_16x16x16f16 [[RES:a\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
+; GFX908-COUNT-4:   v_accvgpr_read_b32
+; GFX908:           global_store_dwordx4
+; GFX90A-NOT:       v_accvgpr_read_b32
+; GFX90A:           global_store_dwordx4 {{v[0-9]+}}, [[RES]],
 define amdgpu_kernel void @test_mfma_f32_16x16x16f16(<4 x float> addrspace(1)* %arg, <4 x half> addrspace(1)* %c) {
 bb:
   %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg
@@ -490,83 +247,47 @@ bb:
 }
 
 ; GCN-LABEL: {{^}}test_mfma_i32_32x32x4i8:
-; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2
-; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
-; GCN-DAG: s_load_dwordx16
-; GCN-DAG: s_load_dwordx16
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_mfma_i32_32x32x4i8 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: global_store_dwordx4
-; GCN-DAG: global_store_dwordx4
-; GCN-DAG: global_store_dwordx4
-; GCN-DAG: global_store_dwordx4
-; GCN-DAG: global_store_dwordx4
-; GCN-DAG: global_store_dwordx4
-; GCN-DAG: global_store_dwordx4
-; GCN-DAG: global_store_dwordx4
+; GCN-DAG:         v_mov_b32_e32 [[TWO:v[0-9]+]], 2
+; GCN-DAG:         v_mov_b32_e32 [[ONE:v[0-9]+]], 1
+; GCN-DAG:         s_load_dwordx16
+; GCN-DAG:         s_load_dwordx16
+; GFX908_A-DAG:    v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GFX908_A-DAG:    v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GFX908_A-DAG:    v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GFX908_A-DAG:    v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GFX908_A-DAG:    v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GFX908_A-DAG:    v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GFX908_A-DAG:    v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GFX908_A-DAG:    v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GFX908_A-DAG:    v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GFX908_A-DAG:    v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GFX908_A-DAG:    v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GFX908_A-DAG:    v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GFX908_A-DAG:    v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GFX908_A-DAG:    v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GFX908_A-DAG:    v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GFX908_A-DAG:    v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GFX908_A-DAG:    v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GFX908_A-DAG:    v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GFX908_A-DAG:    v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GFX908_A-DAG:    v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GFX908_A-DAG:    v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GFX908_A-DAG:    v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GFX908_A-DAG:    v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GFX908_A-DAG:    v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GFX908_A-DAG:    v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GFX908_A-DAG:    v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GFX908_A-DAG:    v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GFX908_A-DAG:    v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GFX908_A-DAG:    v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GFX908_A-DAG:    v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GFX908_A-DAG:    v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GFX908_A-DAG:    v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GFX908_A:        v_mfma_i32_32x32x4i8 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
+; GFX908-COUNT-32: v_accvgpr_read_b32
+; GFX908:          global_store_dwordx4
+; GFX90A-NOT:      v_accvgpr_read_b32
+; GFX90A-COUNT-8:  global_store_dwordx4 {{v[0-9]+}}, a[{{[0-9:]+}}],
 define amdgpu_kernel void @test_mfma_i32_32x32x4i8(<32 x i32> addrspace(1)* %arg) {
 bb:
   %in.1 = load <32 x i32>, <32 x i32> addrspace(1)* %arg
@@ -576,46 +297,15 @@ bb:
 }
 
 ; GCN-LABEL: {{^}}test_mfma_i32_16x16x4i8:
-; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2
-; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
-; GCN: s_load_dwordx16
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_mfma_i32_16x16x4i8 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: global_store_dwordx4
-; GCN-DAG: global_store_dwordx4
-; GCN-DAG: global_store_dwordx4
-; GCN-DAG: global_store_dwordx4
+; GCN-DAG:           v_mov_b32_e32 [[TWO:v[0-9]+]], 2
+; GCN-DAG:           v_mov_b32_e32 [[ONE:v[0-9]+]], 1
+; GCN:               s_load_dwordx16
+; GFX908_A-COUNT-16: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GFX908_A:          v_mfma_i32_16x16x4i8 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
+; GFX908-COUNT-16:   v_accvgpr_read_b32
+; GFX908:            global_store_dwordx4
+; GFX90A-NOT:        v_accvgpr_read_b32
+; GFX90A-COUNT-4:    global_store_dwordx4 {{v[0-9]+}}, a[{{[0-9:]+}}],
 define amdgpu_kernel void @test_mfma_i32_16x16x4i8(<16 x i32> addrspace(1)* %arg) {
 bb:
   %in.1 = load <16 x i32>, <16 x i32> addrspace(1)* %arg
@@ -625,19 +315,15 @@ bb:
 }
 
 ; GCN-LABEL: {{^}}test_mfma_i32_4x4x4i8:
-; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2
-; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
-; GCN: s_load_dwordx4
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_mfma_i32_4x4x4i8 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
-; GCN: v_accvgpr_read_b32
-; GCN: v_accvgpr_read_b32
-; GCN: v_accvgpr_read_b32
-; GCN: v_accvgpr_read_b32
-; GCN: global_store_dwordx4
+; GCN-DAG:          v_mov_b32_e32 [[TWO:v[0-9]+]], 2
+; GCN-DAG:          v_mov_b32_e32 [[ONE:v[0-9]+]], 1
+; GCN:              s_load_dwordx4
+; GFX908_A-COUNT-4: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GFX908_A:         v_mfma_i32_4x4x4i8 [[RES:a\[[0-9]+:[0-9]+\]]], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
+; GFX908-COUNT-4:   v_accvgpr_read_b32
+; GFX908:           global_store_dwordx4
+; GFX90A-NOT:       v_accvgpr_read_b32
+; GFX90A:           global_store_dwordx4 {{v[0-9]+}}, [[RES]],
 define amdgpu_kernel void @test_mfma_i32_4x4x4i8(<4 x i32> addrspace(1)* %arg) {
 bb:
   %in.1 = load <4 x i32>, <4 x i32> addrspace(1)* %arg
@@ -646,318 +332,9 @@ bb:
   ret void
 }
 
-; GCN-LABEL: {{^}}test_mfma_i32_32x32x8i8:
-; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2
-; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
-; GCN: s_load_dwordx16
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_mfma_i32_32x32x8i8 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: global_store_dwordx4
-; GCN-DAG: global_store_dwordx4
-; GCN-DAG: global_store_dwordx4
-; GCN-DAG: global_store_dwordx4
-define amdgpu_kernel void @test_mfma_i32_32x32x8i8(<16 x i32> addrspace(1)* %arg) {
-bb:
-  %in.1 = load <16 x i32>, <16 x i32> addrspace(1)* %arg
-  %mai.1 = tail call <16 x i32> @llvm.amdgcn.mfma.i32.32x32x8i8(i32 1, i32 2, <16 x i32> %in.1, i32 1, i32 2, i32 3)
-  store <16 x i32> %mai.1, <16 x i32> addrspace(1)* %arg
-  ret void
-}
-
-; GCN-LABEL: {{^}}test_mfma_i32_16x16x16i8:
-; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2
-; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
-; GCN: s_load_dwordx4
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_mfma_i32_16x16x16i8 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: global_store_dwordx4
-define amdgpu_kernel void @test_mfma_i32_16x16x16i8(<4 x i32> addrspace(1)* %arg) {
-bb:
-  %in.1 = load <4 x i32>, <4 x i32> addrspace(1)* %arg
-  %mai.1 = tail call <4 x i32> @llvm.amdgcn.mfma.i32.16x16x16i8(i32 1, i32 2, <4 x i32> %in.1, i32 1, i32 2, i32 3)
-  store <4 x i32> %mai.1, <4 x i32> addrspace(1)* %arg
-  ret void
-}
-
-; GCN-LABEL: {{^}}test_mfma_f32_32x32x2bf16:
-; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2
-; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
-; GCN-DAG: s_load_dwordx16
-; GCN-DAG: s_load_dwordx16
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_mfma_f32_32x32x2bf16 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: global_store_dwordx4
-; GCN-DAG: global_store_dwordx4
-; GCN-DAG: global_store_dwordx4
-; GCN-DAG: global_store_dwordx4
-; GCN-DAG: global_store_dwordx4
-; GCN-DAG: global_store_dwordx4
-; GCN-DAG: global_store_dwordx4
-; GCN-DAG: global_store_dwordx4
-define amdgpu_kernel void @test_mfma_f32_32x32x2bf16(<32 x float> addrspace(1)* %arg) {
-bb:
-  %in.1 = load <32 x float>, <32 x float> addrspace(1)* %arg
-  %a = bitcast i32 1 to <2 x i16>
-  %b = bitcast i32 2 to <2 x i16>
-  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x2bf16(<2 x i16> %a, <2 x i16> %b, <32 x float> %in.1, i32 1, i32 2, i32 3)
-  store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
-  ret void
-}
-
-; GCN-LABEL: {{^}}test_mfma_f32_16x16x2bf16:
-; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2
-; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
-; GCN: s_load_dwordx16
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_mfma_f32_16x16x2bf16 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: global_store_dwordx4
-; GCN-DAG: global_store_dwordx4
-; GCN-DAG: global_store_dwordx4
-; GCN-DAG: global_store_dwordx4
-define amdgpu_kernel void @test_mfma_f32_16x16x2bf16(<16 x float> addrspace(1)* %arg) {
-bb:
-  %in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg
-  %a = bitcast i32 1 to <2 x i16>
-  %b = bitcast i32 2 to <2 x i16>
-  %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x2bf16(<2 x i16> %a, <2 x i16> %b, <16 x float> %in.1, i32 1, i32 2, i32 3)
-  store <16 x float> %mai.1, <16 x float> addrspace(1)* %arg
-  ret void
-}
-
-; GCN-LABEL: {{^}}test_mfma_f32_4x4x2bf16:
-; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2
-; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
-; GCN: s_load_dwordx4
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_mfma_f32_4x4x2bf16 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: global_store_dwordx4
-define amdgpu_kernel void @test_mfma_f32_4x4x2bf16(<4 x float> addrspace(1)* %arg) {
-bb:
-  %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg
-  %a = bitcast i32 1 to <2 x i16>
-  %b = bitcast i32 2 to <2 x i16>
-  %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x2bf16(<2 x i16> %a, <2 x i16> %b, <4 x float> %in.1, i32 1, i32 2, i32 3)
-  store <4 x float> %mai.1, <4 x float> addrspace(1)* %arg
-  ret void
-}
-
-; GCN-LABEL: {{^}}test_mfma_f32_32x32x4bf16:
-; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2
-; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
-; GCN: s_load_dwordx16
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_mfma_f32_32x32x4bf16 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: global_store_dwordx4
-; GCN-DAG: global_store_dwordx4
-; GCN-DAG: global_store_dwordx4
-; GCN-DAG: global_store_dwordx4
-define amdgpu_kernel void @test_mfma_f32_32x32x4bf16(<16 x float> addrspace(1)* %arg) {
-bb:
-  %in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg
-  %a = bitcast i32 1 to <2 x i16>
-  %b = bitcast i32 2 to <2 x i16>
-  %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16(<2 x i16> %a, <2 x i16> %b, <16 x float> %in.1, i32 1, i32 2, i32 3)
-  store <16 x float> %mai.1, <16 x float> addrspace(1)* %arg
-  ret void
-}
-
-; GCN-LABEL: {{^}}test_mfma_f32_16x16x8bf16:
-; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2
-; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
-; GCN: s_load_dwordx4
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_mfma_f32_16x16x8bf16 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: global_store_dwordx4
-define amdgpu_kernel void @test_mfma_f32_16x16x8bf16(<4 x float> addrspace(1)* %arg) {
-bb:
-  %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg
-  %a = bitcast i32 1 to <2 x i16>
-  %b = bitcast i32 2 to <2 x i16>
-  %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x8bf16(<2 x i16> %a, <2 x i16> %b, <4 x float> %in.1, i32 1, i32 2, i32 3)
-  store <4 x float> %mai.1, <4 x float> addrspace(1)* %arg
-  ret void
-}
-
 ; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32_forward_acc:
-; GCN:      v_mfma_f32_32x32x1f32 [[MAI1:a\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v{{[0-9]+}}, a[{{[0-9]+:[0-9]+}}]
-; GCN-NEXT: v_mfma_f32_32x32x1f32 a[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, [[MAI1]]
+; GFX908_A:      v_mfma_f32_32x32x1f32 [[MAI1:a\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v{{[0-9]+}}, a[{{[0-9]+:[0-9]+}}]
+; GFX908_A-NEXT: v_mfma_f32_32x32x1f32 a[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, [[MAI1]]
 define amdgpu_kernel void @test_mfma_f32_32x32x1f32_forward_acc(<32 x float> addrspace(1)* %arg) {
 bb:
   %in.1 = load <32 x float>, <32 x float> addrspace(1)* %arg
@@ -968,8 +345,8 @@ bb:
 }
 
 ; GCN-LABEL: {{^}}test_mfma_f32_16x16x1f32_forward_acc:
-; GCN:      v_mfma_f32_16x16x1f32 [[MAI1:a\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v{{[0-9]+}}, a[{{[0-9]+:[0-9]+}}]
-; GCN-NEXT: v_mfma_f32_16x16x1f32 a[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, [[MAI1]]
+; GFX908_A:      v_mfma_f32_16x16x1f32 [[MAI1:a\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v{{[0-9]+}}, a[{{[0-9]+:[0-9]+}}]
+; GFX908_A-NEXT: v_mfma_f32_16x16x1f32 a[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, [[MAI1]]
 define amdgpu_kernel void @test_mfma_f32_16x16x1f32_forward_acc(<16 x float> addrspace(1)* %arg) {
 bb:
   %in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg
@@ -980,8 +357,8 @@ bb:
 }
 
 ; GCN-LABEL: {{^}}test_mfma_f32_4x4x1f32_forward_acc:
-; GCN:      v_mfma_f32_4x4x1f32 [[MAI1:a\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v{{[0-9]+}}, a[{{[0-9]+:[0-9]+}}]
-; GCN-NEXT: v_mfma_f32_4x4x1f32 a[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, [[MAI1]]
+; GFX908_A:      v_mfma_f32_4x4x1f32 [[MAI1:a\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v{{[0-9]+}}, a[{{[0-9]+:[0-9]+}}]
+; GFX908_A-NEXT: v_mfma_f32_4x4x1f32 a[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, [[MAI1]]
 define amdgpu_kernel void @test_mfma_f32_4x4x1f32_forward_acc(<4 x float> addrspace(1)* %arg) {
 bb:
   %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg
@@ -992,19 +369,19 @@ bb:
 }
 
 ; GCN-LABEL: {{^}}test_mfma_f32_4x4x1f32_imm_splat:
-; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2.0
-; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1.0
+; GCN-DAG:        v_mov_b32_e32 [[TWO:v[0-9]+]], 2.0
+; GCN-DAG:        v_mov_b32_e32 [[ONE:v[0-9]+]], 1.0
 ; NOLIT-SRCC-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0
 ; NOLIT-SRCC-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0
 ; NOLIT-SRCC-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0
 ; NOLIT-SRCC-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0
-; NOLIT-SRCC: v_mfma_f32_4x4x1f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9:]+}}]
-; LIT-SRCC: v_mfma_f32_4x4x1f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], 1.0
-; GCN: v_accvgpr_read_b32
-; GCN: v_accvgpr_read_b32
-; GCN: v_accvgpr_read_b32
-; GCN: v_accvgpr_read_b32
-; GCN: global_store_dwordx4
+; NOLIT-SRCC:     v_mfma_f32_4x4x1f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9:]+}}]
+; LIT-SRCC:       v_mfma_f32_4x4x1f32 [[RES:a\[[0-9]+:[0-9]+\]]], [[ONE]], [[TWO]], 1.0
+; GFX90A:         v_mfma_f32_4x4x1f32 [[RES:a\[[0-9]+:[0-9]+\]]], [[ONE]], [[TWO]], 1.0
+; GFX908-COUNT-4: v_accvgpr_read_b32
+; GFX908:         global_store_dwordx4
+; GFX90A-NOT:     v_accvgpr_read_b32
+; GFX90A:         global_store_dwordx4 {{v[0-9]+}}, [[RES]],
 define amdgpu_kernel void @test_mfma_f32_4x4x1f32_imm_splat(<4 x float> addrspace(1)* %arg) {
 bb:
   %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 2.0, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, i32 0, i32 0, i32 0)
@@ -1013,31 +390,16 @@ bb:
 }
 
 ; GCN-LABEL: {{^}}test_mfma_f32_16x16x1f32_imm_splat:
-; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2.0
-; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1.0
-; NOLIT-SRCC-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0
-; NOLIT-SRCC: v_mfma_f32_16x16x1f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9:]+}}]
-; LIT-SRCC: v_mfma_f32_16x16x1f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], 1.0
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: global_store_dwordx4
-; GCN-DAG: global_store_dwordx4
-; GCN-DAG: global_store_dwordx4
-; GCN-DAG: global_store_dwordx4
+; GCN-DAG:         v_mov_b32_e32 [[TWO:v[0-9]+]], 2.0
+; GCN-DAG:         v_mov_b32_e32 [[ONE:v[0-9]+]], 1.0
+; NOLIT-SRCC-DAG:  v_accvgpr_write_b32 a{{[0-9]+}}, 1.0
+; NOLIT-SRCC:      v_mfma_f32_16x16x1f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9:]+}}]
+; LIT-SRCC:        v_mfma_f32_16x16x1f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], 1.0
+; GFX90A:          v_mfma_f32_16x16x1f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], 1.0
+; GFX908-COUNT-16: v_accvgpr_read_b32
+; GFX908:          global_store_dwordx4
+; GFX90A-NOT:      v_accvgpr_read_b32
+; GFX90A-COUNT-4:  global_store_dwordx4 {{v[0-9]+}}, a[{{[0-9:]+}}],
 define amdgpu_kernel void @test_mfma_f32_16x16x1f32_imm_splat(<16 x float> addrspace(1)* %arg) {
 bb:
   %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float 1.0, float 2.0, <16 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, i32 0, i32 0, i32 0)
@@ -1046,31 +408,16 @@ bb:
 }
 
 ; GCN-LABEL: {{^}}test_mfma_f32_32x32x8f16_imm_splat:
-; GCN-DAG: v_mov_b32_e32 v[[TWO:[0-9]+]], 0x40004000
-; GCN-DAG: v_mov_b32_e32 v[[ONE:[0-9]+]], 0x3c003c00
-; NOLIT-SRCC-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0
-; NOLIT-SRCC: v_mfma_f32_32x32x8f16 a[{{[0-9]+:[0-9]+}}], v{{\[}}[[ONE]]:{{[0-9]+}}], v{{\[}}[[TWO]]:{{[0-9]+}}], a[{{[0-9:]+}}]
-; LIT-SRCC: v_mfma_f32_32x32x8f16 a[{{[0-9]+:[0-9]+}}], v{{\[}}[[ONE]]:{{[0-9]+}}], v{{\[}}[[TWO]]:{{[0-9]+}}], 1.0
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: global_store_dwordx4
-; GCN-DAG: global_store_dwordx4
-; GCN-DAG: global_store_dwordx4
-; GCN-DAG: global_store_dwordx4
+; GCN-DAG:         v_mov_b32_e32 v[[TWO:[0-9]+]], 0x40004000
+; GCN-DAG:         v_mov_b32_e32 v[[ONE:[0-9]+]], 0x3c003c00
+; NOLIT-SRCC-DAG:  v_accvgpr_write_b32 a{{[0-9]+}}, 1.0
+; NOLIT-SRCC:      v_mfma_f32_32x32x8f16 a[{{[0-9]+:[0-9]+}}], v{{\[}}[[ONE]]:{{[0-9]+}}], v{{\[}}[[TWO]]:{{[0-9]+}}], a[{{[0-9:]+}}]
+; LIT-SRCC:        v_mfma_f32_32x32x8f16 a[{{[0-9]+:[0-9]+}}], v{{\[}}[[ONE]]:{{[0-9]+}}], v{{\[}}[[TWO]]:{{[0-9]+}}], 1.0
+; GFX90A:          v_mfma_f32_32x32x8f16 a[{{[0-9]+:[0-9]+}}], v{{\[}}[[ONE]]:{{[0-9]+}}], v{{\[}}[[TWO]]:{{[0-9]+}}], 1.0
+; GFX908-COUNT-16: v_accvgpr_read_b32
+; GFX908:          global_store_dwordx4
+; GFX90A-NOT:      v_accvgpr_read_b32
+; GFX90A-COUNT-4:  global_store_dwordx4 {{v[0-9]+}}, a[{{[0-9:]+}}],
 define amdgpu_kernel void @test_mfma_f32_32x32x8f16_imm_splat(<16 x float> addrspace(1)* %arg) {
 bb:
   %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x8f16(<4 x half> <half 1.0, half 1.0, half 1.0, half 1.0>, <4 x half> <half 2.0, half 2.0, half 2.0, half 2.0>, <16 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, i32 0, i32 0, i32 0)
@@ -1079,51 +426,16 @@ bb:
 }
 
 ; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32_imm_splat:
-; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2.0
-; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1.0
-; NOLIT-SRCC-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0
-; NOLIT-SRCC: v_mfma_f32_32x32x1f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9:]+}}]
-; LIT-SRCC: v_mfma_f32_32x32x1f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], 0
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: global_store_dwordx4
-; GCN-DAG: global_store_dwordx4
-; GCN-DAG: global_store_dwordx4
-; GCN-DAG: global_store_dwordx4
-; GCN-DAG: global_store_dwordx4
-; GCN-DAG: global_store_dwordx4
-; GCN-DAG: global_store_dwordx4
-; GCN-DAG: global_store_dwordx4
+; GCN-DAG:         v_mov_b32_e32 [[TWO:v[0-9]+]], 2.0
+; GCN-DAG:         v_mov_b32_e32 [[ONE:v[0-9]+]], 1.0
+; NOLIT-SRCC-DAG:  v_accvgpr_write_b32 a{{[0-9]+}}, 0
+; NOLIT-SRCC:      v_mfma_f32_32x32x1f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9:]+}}]
+; LIT-SRCC:        v_mfma_f32_32x32x1f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], 0
+; GFX90A:          v_mfma_f32_32x32x1f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], 0
+; GFX908-COUNT-32: v_accvgpr_read_b32
+; GFX908:          global_store_dwordx4
+; GFX90A-NOT:      v_accvgpr_read_b32
+; GFX90A-COUNT-8:  global_store_dwordx4 {{v[0-9]+}}, a[{{[0-9:]+}}],
 define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm_splat(<32 x float> addrspace(1)* %arg) {
 bb:
   %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> <float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0>, i32 0, i32 0, i32 0)
@@ -1132,16 +444,17 @@ bb:
 }
 
 ; GCN-LABEL: {{^}}test_mfma_f32_4x4x1f32_imm:
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 2.0
-; GCN: v_mfma_f32_4x4x1f32 a[{{[0-9]+:[0-9]+}}], {{v[0-9]+}}, {{v[0-9]+}}, a[{{[0-9]+:[0-9]+}}]
-; GCN: v_accvgpr_read_b32
-; GCN: v_accvgpr_read_b32
-; GCN: v_accvgpr_read_b32
-; GCN: v_accvgpr_read_b32
-; GCN: global_store_dwordx4
+; GCN-DAG:        v_accvgpr_write_b32 a{{[0-9]+}}, 1.0
+; GCN-DAG:        v_accvgpr_write_b32 a{{[0-9]+}}, 2.0
+; GFX908-DAG:     v_accvgpr_write_b32 a{{[0-9]+}}, 1.0
+; GFX908-DAG:     v_accvgpr_write_b32 a{{[0-9]+}}, 1.0
+; GFX90A-DAG:     v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}}
+; GFX90A-DAG:     v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}}
+; GFX908_A:       v_mfma_f32_4x4x1f32 [[RES:a\[[0-9]+:[0-9]+\]]], {{v[0-9]+}}, {{v[0-9]+}}, a[{{[0-9]+:[0-9]+}}]
+; GFX908-COUNT-4: v_accvgpr_read_b32
+; GFX908:         global_store_dwordx4
+; GFX90A-NOT:     v_accvgpr_read_b32
+; GFX90A:         global_store_dwordx4 {{v[0-9]+}}, [[RES]],
 define amdgpu_kernel void @test_mfma_f32_4x4x1f32_imm(<4 x float> addrspace(1)* %arg) {
 bb:
   %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 2.0, <4 x float> <float 1.0, float 2.0, float 1.0, float 1.0>, i32 0, i32 0, i32 0)
@@ -1150,43 +463,15 @@ bb:
 }
 
 ; GCN-LABEL: {{^}}test_mfma_f32_16x16x1f32_imm:
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 2.0
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0
-; GCN: v_mfma_f32_16x16x1f32 a[{{[0-9]+:[0-9]+}}], {{v[0-9]+}}, {{v[0-9]+}}, a[{{[0-9]+:[0-9]+}}]
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: global_store_dwordx4
-; GCN-DAG: global_store_dwordx4
-; GCN-DAG: global_store_dwordx4
-; GCN-DAG: global_store_dwordx4
+; GCN-DAG:         v_accvgpr_write_b32 a{{[0-9]+}}, 1.0
+; GCN-DAG:         v_accvgpr_write_b32 a{{[0-9]+}}, 2.0
+; GFX908-COUNT-14: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0
+; GFX90A-COUNT-14: v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}}
+; GFX908_A:        v_mfma_f32_16x16x1f32 a[{{[0-9]+:[0-9]+}}], {{v[0-9]+}}, {{v[0-9]+}}, a[{{[0-9]+:[0-9]+}}]
+; GFX908-COUNT-16: v_accvgpr_read_b32
+; GFX908:          global_store_dwordx4
+; GFX90A-NOT:      v_accvgpr_read_b32
+; GFX90A-COUNT-4:  global_store_dwordx4 {{v[0-9]+}}, a[{{[0-9:]+}}],
 define amdgpu_kernel void @test_mfma_f32_16x16x1f32_imm(<16 x float> addrspace(1)* %arg) {
 bb:
   %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float 1.0, float 2.0, <16 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 2.0>, i32 0, i32 0, i32 0)
@@ -1195,79 +480,73 @@ bb:
 }
 
 ; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32_imm:
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0
-; GCN: v_mfma_f32_32x32x1f32 a[{{[0-9]+:[0-9]+}}], {{v[0-9]+}}, {{v[0-9]+}}, a[{{[0-9]+:[0-9]+}}]
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: v_accvgpr_read_b32
-; GCN-DAG: global_store_dwordx4
-; GCN-DAG: global_store_dwordx4
-; GCN-DAG: global_store_dwordx4
-; GCN-DAG: global_store_dwordx4
-; GCN-DAG: global_store_dwordx4
-; GCN-DAG: global_store_dwordx4
-; GCN-DAG: global_store_dwordx4
-; GCN-DAG: global_store_dwordx4
+; GCN-DAG:         v_accvgpr_write_b32 a{{[0-9]+}}, 0
+; GCN-DAG:         v_accvgpr_write_b32 a{{[0-9]+}}, 1.0
+; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, 0
+; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, 0
+; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, 0
+; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, 0
+; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, 0
+; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, 0
+; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, 0
+; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, 0
+; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, 0
+; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, 0
+; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, 0
+; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, 0
+; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, 0
+; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, 0
+; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, 0
+; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, 0
+; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, 0
+; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, 0
+; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, 0
+; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, 0
+; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, 0
+; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, 0
+; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, 0
+; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, 0
+; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, 0
+; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, 0
+; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, 0
+; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, 0
+; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, 0
+; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, 0
+; GFX90A-DAG:      v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}}
+; GFX90A-DAG:      v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}}
+; GFX90A-DAG:      v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}}
+; GFX90A-DAG:      v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}}
+; GFX90A-DAG:      v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}}
+; GFX90A-DAG:      v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}}
+; GFX90A-DAG:      v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}}
+; GFX90A-DAG:      v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}}
+; GFX90A-DAG:      v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}}
+; GFX90A-DAG:      v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}}
+; GFX90A-DAG:      v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}}
+; GFX90A-DAG:      v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}}
+; GFX90A-DAG:      v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}}
+; GFX90A-DAG:      v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}}
+; GFX90A-DAG:      v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}}
+; GFX90A-DAG:      v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}}
+; GFX90A-DAG:      v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}}
+; GFX90A-DAG:      v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}}
+; GFX90A-DAG:      v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}}
+; GFX90A-DAG:      v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}}
+; GFX90A-DAG:      v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}}
+; GFX90A-DAG:      v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}}
+; GFX90A-DAG:      v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}}
+; GFX90A-DAG:      v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}}
+; GFX90A-DAG:      v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}}
+; GFX90A-DAG:      v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}}
+; GFX90A-DAG:      v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}}
+; GFX90A-DAG:      v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}}
+; GFX90A-DAG:      v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}}
+; GFX90A-DAG:      v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}}
+; GFX908_A:        v_mfma_f32_32x32x1f32 a[{{[0-9]+:[0-9]+}}], {{v[0-9]+}}, {{v[0-9]+}}, a[{{[0-9]+:[0-9]+}}]
+; GFX908-COUNT-32: v_accvgpr_read_b32
+; GFX908:          global_store_dwordx4
+; GFX90A-NOT:      v_accvgpr_read_b32
+; GFX90A-COUNT-8:  global_store_dwordx4 {{v[0-9]+}}, a[{{[0-9:]+}}],
 define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm(<32 x float> addrspace(1)* %arg) {
 bb:
   %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> <float 1.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0>, i32 0, i32 0, i32 0)
@@ -1276,17 +555,19 @@ bb:
 }
 
 ; GCN-LABEL: {{^}}test_mfma_f32_4x4x1f32_lit_splat:
-; GCN: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x42f60000
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
-; GCN: v_mfma_f32_4x4x1f32 a[{{[0-9]+:[0-9]+}}], {{v[0-9]+}}, {{v[0-9]+}}, a[{{[0-9]+:[0-9]+}}]
-; GCN: v_accvgpr_read_b32
-; GCN: v_accvgpr_read_b32
-; GCN: v_accvgpr_read_b32
-; GCN: v_accvgpr_read_b32
-; GCN: global_store_dwordx4
+; GFX908_A:       v_mov_b32_e32 [[TMP:v[0-9]+]], 0x42f60000
+; GCN:            v_accvgpr_write_b32 [[TTMPA:a[0-9]+]], [[TMP]]
+; GFX908:         v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
+; GFX908:         v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
+; GFX908:         v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
+; GFX90A:         v_accvgpr_mov_b32 a{{[0-9]+}}, [[TTMPA]]
+; GFX90A:         v_accvgpr_mov_b32 a{{[0-9]+}}, [[TTMPA]]
+; GFX90A:         v_accvgpr_mov_b32 a{{[0-9]+}}, [[TTMPA]]
+; GFX908_A:       v_mfma_f32_4x4x1f32 [[RES:a\[[0-9]+:[0-9]+\]]], {{v[0-9]+}}, {{v[0-9]+}}, a[{{[0-9]+:[0-9]+}}]
+; GFX908-COUNT-4: v_accvgpr_read_b32
+; GFX908:         global_store_dwordx4
+; GFX90A-NOT:     v_accvgpr_read_b32
+; GFX90A:         global_store_dwordx4 {{v[0-9]+}}, [[RES]]
 define amdgpu_kernel void @test_mfma_f32_4x4x1f32_lit_splat(<4 x float> addrspace(1)* %arg, i64 %idx) {
 bb:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1301,22 +582,21 @@ bb:
 ; in the middle of the expanded agpr reg_sequence. The broadcast of
 ; the individual AGPR->AGPR components should avoid the intermediate AGPR case.
 ; GCN-LABEL: {{^}}test_mfma_f32_4x4x1f32_lit_splat_bad_code:
-; GCN: v_mov_b32_e32 [[VTMP0:v[0-9]+]], 0x42f60000
-; GCN: v_accvgpr_write_b32 [[AGPR:a[0-9]+]], [[VTMP0]]
-; GCN: s_nop 0
-; GCN: v_accvgpr_read_b32 [[VTMP1:v[0-9]+]], [[AGPR]]
-; GCN: v_accvgpr_read_b32 [[VTMP2:v[0-9]+]], [[AGPR]]
-; GCN: v_accvgpr_read_b32 [[VTMP3:v[0-9]+]], [[AGPR]]
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[VTMP1]]
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[VTMP2]]
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[VTMP3]]
+; GFX908_A: v_mov_b32_e32 [[TMP0:v[0-9]+]], 0x42f60000
+; GCN:      v_accvgpr_write_b32 [[AGPR:a[0-9]+]], [[TMP0]]
+; GFX908:   s_nop 0
+; GFX908:   v_accvgpr_read_b32 [[TMP1:v[0-9]+]], [[AGPR]]
+; GFX908:   v_accvgpr_read_b32 [[TMP2:v[0-9]+]], [[AGPR]]
+; GFX908:   v_accvgpr_read_b32 [[TMP3:v[0-9]+]], [[AGPR]]
+; GFX908:   v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
+; GFX908:   v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
+; GFX908:   v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
+; GFX90A-COUNT-3: v_accvgpr_mov_b32 a{{[0-9]+}}, [[AGPR]]
 ; GCN: s_nop 0
-; GCN: v_mfma_f32_4x4x1f32 a[{{[0-9]+:[0-9]+}}], {{v[0-9]+}}, {{v[0-9]+}}, a[{{[0-9]+:[0-9]+}}]
-; GCN: v_accvgpr_read_b32
-; GCN: v_accvgpr_read_b32
-; GCN: v_accvgpr_read_b32
-; GCN: v_accvgpr_read_b32
-; GCN: global_store_dwordx4
+; GFX908_A:  v_mfma_f32_4x4x1f32 a[{{[0-9]+:[0-9]+}}], {{v[0-9]+}}, {{v[0-9]+}}, a[{{[0-9]+:[0-9]+}}]
+; GFX908-COUNT-4: v_accvgpr_read_b32
+; GFX908:    global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}], s[{{[0-9:]+}}]
+; GFX90A:    global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}], s[{{[0-9:]+}}]
 define amdgpu_kernel void @test_mfma_f32_4x4x1f32_lit_splat_bad_code(<4 x float> addrspace(1)* %arg) {
 bb:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1328,35 +608,18 @@ bb:
 }
 
 ; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32_vecarg:
-; GCN-DAG:         v_mov_b32_e32 [[TWO:v[0-9]+]], 2.0
+; GFX90A-DAG:      v_mov_b32_e32 [[TWO:v[0-9]+]], 2.0
 ; GCN-DAG:         v_mov_b32_e32 [[ONE:v[0-9]+]], 1.0
-; GCN-DAG:         global_load_dwordx4
-; GCN-DAG:         global_load_dwordx4
-; GCN-DAG:         global_load_dwordx4
-; GCN-DAG:         global_load_dwordx4
-; GCN-DAG:         global_load_dwordx4
-; GCN-DAG:         global_load_dwordx4
-; GCN-DAG:         global_load_dwordx4
-; GCN-DAG:         global_load_dwordx4
-; GCN-DAG:         v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DAG:         v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DAG:         v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DAG:         v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DAG:         v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DAG:         v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DAG:         v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DAG:         v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DAG:         v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DAG:         v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DAG:         v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DAG:         v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DAG:         v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DAG:         v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DAG:         v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DAG:         v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN:             v_mfma_f32_32x32x1f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
-; GCN-COUNT-32:    v_accvgpr_read_b32
-; GCN-COUNT-8:     global_store_dwordx4
+; GCN-COUNT-8:     global_load_dwordx4
+; GFX908-COUNT-16: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GFX908-DAG:      v_mov_b32_e32 [[TWO:v[0-9]+]], 2.0
+; GFX90A-NOT:      v_accvgpr_write
+; GFX908:          v_mfma_f32_32x32x1f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
+; GFX90A:          v_mfma_f32_32x32x1f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
+; GFX908-COUNT-32: v_accvgpr_read_b32
+; GFX908-COUNT-8:  global_store_dwordx4
+; GFX90A-NOT:      v_accvgpr_read_b32
+; GFX90A-COUNT-5:  global_store_dwordx4 v{{[0-9:]+}}, a[{{[0-9:]+}}], s[{{[0-9:]+}}]
 define amdgpu_kernel void @test_mfma_f32_32x32x1f32_vecarg(<32 x float> addrspace(1)* %arg) {
 bb:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.workitem.id.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.workitem.id.ll
index 836f68053301..9bb9275335c7 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.workitem.id.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.workitem.id.ll
@@ -4,6 +4,7 @@
 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=ALL,MESA %s
 ; RUN: llc -mtriple=amdgcn-unknown-mesa3d -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=ALL,CO-V2 %s
 ; RUN: llc -mtriple=amdgcn-unknown-mesa3d -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=ALL,CO-V2 %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=ALL,PACKED-TID %s
 
 declare i32 @llvm.amdgcn.workitem.id.x() #0
 declare i32 @llvm.amdgcn.workitem.id.y() #0
@@ -17,7 +18,10 @@ declare i32 @llvm.amdgcn.workitem.id.z() #0
 ; CO-V2: enable_vgpr_workitem_id = 0
 
 ; ALL-NOT: v0
-; ALL: {{buffer|flat}}_store_dword {{.*}}v0
+; ALL: {{buffer|flat|global}}_store_dword {{.*}}v0
+
+; CO-V3: .amdhsa_system_vgpr_workitem_id 0
+; PACKED-TID: .amdhsa_system_vgpr_workitem_id 0
 define amdgpu_kernel void @test_workitem_id_x(i32 addrspace(1)* %out) #1 {
   %id = call i32 @llvm.amdgcn.workitem.id.x()
   store i32 %id, i32 addrspace(1)* %out
@@ -31,8 +35,14 @@ define amdgpu_kernel void @test_workitem_id_x(i32 addrspace(1)* %out) #1 {
 ; ALL-LABEL: {{^}}test_workitem_id_y:
 ; CO-V2: enable_vgpr_workitem_id = 1
 
-; ALL-NOT: v1
-; ALL: {{buffer|flat}}_store_dword {{.*}}v1
+; PACKED-TID: v_bfe_u32 [[ID:v[0-9]+]], v0, 10, 10
+; PACKED-TID: {{buffer|flat|global}}_store_dword {{.*}}[[ID]]
+
+; UNPACKED-TID-NOT: v1
+; UNPACKED-TID: {{buffer|flat}}_store_dword {{.*}}v1
+
+; CO-V3: .amdhsa_system_vgpr_workitem_id 1
+; PACKED-TID: .amdhsa_system_vgpr_workitem_id 1
 define amdgpu_kernel void @test_workitem_id_y(i32 addrspace(1)* %out) #1 {
   %id = call i32 @llvm.amdgcn.workitem.id.y()
   store i32 %id, i32 addrspace(1)* %out
@@ -46,8 +56,14 @@ define amdgpu_kernel void @test_workitem_id_y(i32 addrspace(1)* %out) #1 {
 ; ALL-LABEL: {{^}}test_workitem_id_z:
 ; CO-V2: enable_vgpr_workitem_id = 2
 
-; ALL-NOT: v2
-; ALL: {{buffer|flat}}_store_dword {{.*}}v2
+; PACKED-TID: v_bfe_u32 [[ID:v[0-9]+]], v0, 20, 10
+; PACKED-TID: {{buffer|flat|global}}_store_dword {{.*}}[[ID]]
+
+; UNPACKED-TID-NOT: v2
+; UNPACKED-TID: {{buffer|flat}}_store_dword {{.*}}v2
+
+; CO-V3: .amdhsa_system_vgpr_workitem_id 2
+; PACKED-TID: .amdhsa_system_vgpr_workitem_id 2
 define amdgpu_kernel void @test_workitem_id_z(i32 addrspace(1)* %out) #1 {
   %id = call i32 @llvm.amdgcn.workitem.id.z()
   store i32 %id, i32 addrspace(1)* %out

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.pow-gfx9.ll b/llvm/test/CodeGen/AMDGPU/llvm.pow-gfx9.ll
new file mode 100644
index 000000000000..d1df4139a5a5
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.pow-gfx9.ll
@@ -0,0 +1,19 @@
+; RUN: llc < %s -march=amdgcn -mcpu=gfx908 | FileCheck %s --check-prefixes=GCN,GFX908
+; RUN: llc < %s -march=amdgcn -mcpu=gfx90a | FileCheck %s --check-prefixes=GCN,GFX90A
+
+; GCN-LABEL: {{^}}mul_legacy
+; GFX908: v_mul_legacy_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}}
+; GFX90A: v_mul_legacy_f32_e64 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}}
+define amdgpu_kernel void @mul_legacy(
+    float addrspace(1)* %r,
+    float addrspace(1)* %a,
+    float addrspace(1)* %b) {
+entry:
+  %a.val = load volatile float, float addrspace(1)* %a
+  %b.val = load volatile float, float addrspace(1)* %b
+  %r.val = call float @llvm.pow.f32(float %a.val, float %b.val)
+  store float %r.val, float addrspace(1)* %r
+  ret void
+}
+
+declare float @llvm.pow.f32(float ,float ) readonly

diff  --git a/llvm/test/CodeGen/AMDGPU/lower-control-flow-other-terminators.mir b/llvm/test/CodeGen/AMDGPU/lower-control-flow-other-terminators.mir
index a5279399804a..d6da7cb45fc7 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-control-flow-other-terminators.mir
+++ b/llvm/test/CodeGen/AMDGPU/lower-control-flow-other-terminators.mir
@@ -203,7 +203,7 @@ body:             |
   ; CHECK: bb.1:
   ; CHECK:   successors: %bb.2(0x80000000)
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_64_xexec = COPY [[S_MOV_B64_term1]]
-  ; CHECK:   dead %7:vgpr_32 = GLOBAL_LOAD_DWORD undef %8:vreg_64, 0, 0, 0, 0, implicit $exec :: (volatile load 4, addrspace 1)
+  ; CHECK:   dead %7:vgpr_32 = GLOBAL_LOAD_DWORD undef %8:vreg_64, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4, addrspace 1)
   ; CHECK:   [[COPY4:%[0-9]+]]:sreg_64_xexec = COPY [[COPY3]]
   ; CHECK: bb.2:
   ; CHECK:   successors: %bb.3(0x80000000)
@@ -233,7 +233,7 @@ body:             |
 
   bb.1:
     %11:sreg_64_xexec = COPY %13
-    dead %6:vgpr_32 = GLOBAL_LOAD_DWORD undef %8:vreg_64, 0, 0, 0, 0, implicit $exec :: (volatile load 4, addrspace 1)
+    dead %6:vgpr_32 = GLOBAL_LOAD_DWORD undef %8:vreg_64, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4, addrspace 1)
     %14:sreg_64_xexec = COPY %11
 
   bb.2:

diff  --git a/llvm/test/CodeGen/AMDGPU/mai-hazards-gfx90a.mir b/llvm/test/CodeGen/AMDGPU/mai-hazards-gfx90a.mir
new file mode 100644
index 000000000000..305d30fd1afe
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/mai-hazards-gfx90a.mir
@@ -0,0 +1,1310 @@
+# RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefix=GCN %s
+
+# GCN-LABEL: name: valu_write_vgpr_sgemm_mfma_read
+# GCN:      V_MOV_B32
+# GCN:      V_MOV_B32
+# GCN-NEXT: S_NOP 1
+# GCN-NEXT: V_MFMA
+name:            valu_write_vgpr_sgemm_mfma_read
+body:             |
+  bb.0:
+    $vgpr0 = V_MOV_B32_e32 1, implicit $exec
+    $vgpr1 = V_MOV_B32_e32 1, implicit $exec
+    $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: valu_write_agpr_sgemm_mfma_read
+# GCN:      V_ACCVGPR_WRITE_B32_e64
+# GCN-NEXT: S_NOP 1
+# GCN-NEXT: V_MFMA
+name:            valu_write_agpr_sgemm_mfma_read
+body:             |
+  bb.0:
+    $vgpr0 = IMPLICIT_DEF
+    $agpr4 = V_ACCVGPR_WRITE_B32_e64 1, implicit $exec
+    $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $agpr4, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: valu_write_vgpr_dgemm_mfma_read
+# GCN:      V_MOV_B32
+# GCN:      V_MOV_B32
+# GCN-NEXT: S_NOP 1
+# GCN-NEXT: V_MFMA
+name:            valu_write_vgpr_dgemm_mfma_read
+body:             |
+  bb.0:
+    $vgpr0 = V_MOV_B32_e32 1, implicit $exec
+    $vgpr1 = V_MOV_B32_e32 1, implicit $exec
+    $vgpr2_vgpr3 = V_MFMA_F64_4X4X4F64_vgprcd_e64 $vgpr0_vgpr1, $vgpr0_vgpr1, $vgpr0_vgpr1, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: accmov_write_agpr_sgemm_mfma_read
+# GCN:      V_ACCVGPR_MOV_B32
+# GCN-NEXT: S_NOP 1
+# GCN-NEXT: V_MFMA
+name:            accmov_write_agpr_sgemm_mfma_read
+body:             |
+  bb.0:
+    $vgpr0 = IMPLICIT_DEF
+    $agpr4 = V_ACCVGPR_MOV_B32 $agpr5, implicit $exec
+    $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $agpr4, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: sgemm_mfma_write_agpr_mfma_read_same_agpr_as_srcc
+# GCN:      V_MFMA
+# GCN-NEXT: V_MFMA
+name:            sgemm_mfma_write_agpr_mfma_read_same_agpr_as_srcc
+body:             |
+  bb.0:
+    $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+    $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: sgemm_mfma_write_vgpr_mfma_read_same_vgpr_as_srcc
+# GCN:      V_MFMA
+# GCN-NEXT: V_MFMA
+name:            sgemm_mfma_write_vgpr_mfma_read_same_vgpr_as_srcc
+body:             |
+  bb.0:
+    $vgpr0_vgpr1_vgpr2_vgpr3 = V_MFMA_F32_4X4X1F32_vgprcd_e64 $vgpr4, $vgpr5, $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr0_vgpr1_vgpr2_vgpr3 = V_MFMA_F32_4X4X1F32_vgprcd_e64 $vgpr4, $vgpr5, $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: dgemm16x16_mfma_write_vgpr_mfma_read_same_vgpr_as_srcc
+# GCN:      V_MFMA
+# GCN-NEXT: V_MFMA
+name:            dgemm16x16_mfma_write_vgpr_mfma_read_same_vgpr_as_srcc
+body:             |
+  bb.0:
+    $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_MFMA_F64_16X16X4F64_vgprcd_e64 $vgpr10_vgpr11, $vgpr10_vgpr11, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_MFMA_F64_16X16X4F64_vgprcd_e64 $vgpr10_vgpr11, $vgpr10_vgpr11, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: dgemm4x4_mfma_write_vgpr_mfma_read_same_vgpr_as_srcc
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 3
+# GCN-NEXT: V_MFMA
+name:            dgemm4x4_mfma_write_vgpr_mfma_read_same_vgpr_as_srcc
+body:             |
+  bb.0:
+    $vgpr2_vgpr3 = V_MFMA_F64_4X4X4F64_vgprcd_e64 $vgpr0_vgpr1, $vgpr0_vgpr1, $vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr2_vgpr3 = V_MFMA_F64_4X4X4F64_vgprcd_e64 $vgpr0_vgpr1, $vgpr0_vgpr1, $vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: sgemm4x4_mfma_write_agpr_mfma_read_overlap
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 1
+# GCN-NEXT: V_MFMA
+name:            sgemm4x4_mfma_write_agpr_mfma_read_overlap
+body:             |
+  bb.0:
+    $agpr2_agpr3_agpr4_agpr5 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+    $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: sgemm4x4_mfma_write_vgpr_mfma_read_overlap
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 1
+# GCN-NEXT: V_MFMA
+name:            sgemm4x4_mfma_write_vgpr_mfma_read_overlap
+body:             |
+  bb.0:
+    $vgpr2_vgpr3_vgpr4_vgpr5 = V_MFMA_F32_4X4X1F32_vgprcd_e64 $vgpr6, $vgpr7, $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr0_vgpr1_vgpr2_vgpr3 = V_MFMA_F32_4X4X1F32_vgprcd_e64 $vgpr6, $vgpr7, $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: sgemm16x16_mfma_write_agpr_mfma_read_overlap
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 7
+# GCN-NEXT: V_MFMA
+name:            sgemm16x16_mfma_write_agpr_mfma_read_overlap
+body:             |
+  bb.0:
+    $agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17 = V_MFMA_F32_16X16X1F32_e64 $vgpr26, $vgpr27, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+    $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_16X16X1F32_e64 $vgpr26, $vgpr27, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: sgemm16x16_mfma_write_vgpr_mfma_read_overlap
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 7
+# GCN-NEXT: V_MFMA
+name:            sgemm16x16_mfma_write_vgpr_mfma_read_overlap
+body:             |
+  bb.0:
+    $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17 = V_MFMA_F32_16X16X1F32_vgprcd_e64 $vgpr26, $vgpr27, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = V_MFMA_F32_16X16X1F32_vgprcd_e64 $vgpr26, $vgpr27, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: sgemm32x32_mfma_write_agpr_mfma_read_overlap
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 7
+# GCN-NEXT: V_MFMA
+name:            sgemm32x32_mfma_write_agpr_mfma_read_overlap
+body:             |
+  bb.0:
+    $agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17 = V_MFMA_F32_32X32X2F32_e64 $vgpr26, $vgpr27, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+    $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr26, $vgpr27, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: sgemm32x32_mfma_write_vgpr_mfma_read_overlap
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 7
+# GCN-NEXT: V_MFMA
+name:            sgemm32x32_mfma_write_vgpr_mfma_read_overlap
+body:             |
+  bb.0:
+    $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17 = V_MFMA_F32_32X32X2F32_vgprcd_e64 $vgpr26, $vgpr27, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = V_MFMA_F32_32X32X2F32_vgprcd_e64 $vgpr26, $vgpr27, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: dgemm16x16_mfma_write_vgpr_mfma_read_overlap
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: V_MFMA
+name:            dgemm16x16_mfma_write_vgpr_mfma_read_overlap
+body:             |
+  bb.0:
+    $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 = V_MFMA_F64_16X16X4F64_vgprcd_e64 $vgpr10_vgpr11, $vgpr10_vgpr11, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_MFMA_F64_16X16X4F64_vgprcd_e64 $vgpr10_vgpr11, $vgpr10_vgpr11, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: dgemm4x4_mfma_write_vgpr_mfma_read_overlap
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 3
+# GCN-NEXT: V_MFMA
+name:            dgemm4x4_mfma_write_vgpr_mfma_read_overlap
+body:             |
+  bb.0:
+    $vgpr2_vgpr3 = V_MFMA_F64_4X4X4F64_vgprcd_e64 $vgpr0_vgpr1, $vgpr0_vgpr1, $vgpr0_vgpr1, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_MFMA_F64_16X16X4F64_vgprcd_e64 $vgpr10_vgpr11, $vgpr10_vgpr11, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: dgemm16x16_mfma_write_vgpr_sgemm_mfma_read_overlap
+# GCN:      V_MFMA
+# GCN-NEXT: V_MFMA
+name:            dgemm16x16_mfma_write_vgpr_sgemm_mfma_read_overlap
+body:             |
+  bb.0:
+    $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 = V_MFMA_F64_16X16X4F64_vgprcd_e64 $vgpr10_vgpr11, $vgpr10_vgpr11, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr0_vgpr1_vgpr2_vgpr3 = V_MFMA_F32_4X4X1F32_vgprcd_e64 $vgpr10, $vgpr11, $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: dgemm4x4_mfma_write_vgpr_sgemm_mfma_read_overlap
+# GCN:      V_MFMA
+# GCN-NEXT: V_MFMA
+name:            dgemm4x4_mfma_write_vgpr_sgemm_mfma_read_overlap
+body:             |
+  bb.0:
+    $vgpr2_vgpr3 = V_MFMA_F64_4X4X4F64_vgprcd_e64 $vgpr0_vgpr1, $vgpr0_vgpr1, $vgpr0_vgpr1, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr0_vgpr1_vgpr2_vgpr3 = V_MFMA_F32_4X4X1F32_vgprcd_e64 $vgpr10, $vgpr11, $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: sgemm4x4_mfma_write_vgpr_dgemm_mfma_read_overlap
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 2
+# GCN-NEXT: V_MFMA
+name:            sgemm4x4_mfma_write_vgpr_dgemm_mfma_read_overlap
+body:             |
+  bb.0:
+    $vgpr0_vgpr1_vgpr2_vgpr3 = V_MFMA_F32_4X4X1F32_vgprcd_e64 $vgpr10, $vgpr11, $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 = V_MFMA_F64_16X16X4F64_vgprcd_e64 $vgpr10_vgpr11, $vgpr10_vgpr11, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: sgemm16x16_mfma_write_vgpr_dgemm_mfma_read_overlap
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: V_MFMA
+name:            sgemm16x16_mfma_write_vgpr_dgemm_mfma_read_overlap
+body:             |
+  bb.0:
+    $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = V_MFMA_F32_16X16X1F32_vgprcd_e64 $vgpr26, $vgpr27, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_MFMA_F64_16X16X4F64_vgprcd_e64 $vgpr20_vgpr21, $vgpr20_vgpr21, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: sgemm32x32_mfma_write_vgpr_dgemm_mfma_read_overlap
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: V_MFMA
+name:            sgemm32x32_mfma_write_vgpr_dgemm_mfma_read_overlap
+body:             |
+  bb.0:
+    $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = V_MFMA_F32_32X32X2F32_vgprcd_e64 $vgpr26, $vgpr27, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_MFMA_F64_16X16X4F64_vgprcd_e64 $vgpr20_vgpr21, $vgpr20_vgpr21, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: sgemm16x16_mfma_write_agpr_mfma_read_partial
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 7
+# GCN-NEXT: V_MFMA
+name:            sgemm16x16_mfma_write_agpr_mfma_read_partial
+body:             |
+  bb.0:
+    $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_16X16X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+    $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: sgemm16x16_mfma_write_vgpr_mfma_read_partial
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 7
+# GCN-NEXT: V_MFMA
+name:            sgemm16x16_mfma_write_vgpr_mfma_read_partial
+body:             |
+  bb.0:
+    $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = V_MFMA_F32_16X16X1F32_vgprcd_e64 $vgpr16, $vgpr17, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr0_vgpr1_vgpr2_vgpr3 = V_MFMA_F32_4X4X1F32_vgprcd_e64 $vgpr16, $vgpr17, $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: sgemm4x4_mfma_write_agpr_mfma_srca_read_overlap
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 4
+# GCN-NEXT: V_MFMA
+name:            sgemm4x4_mfma_write_agpr_mfma_srca_read_overlap
+body:             |
+  bb.0:
+    $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+    $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $agpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: sgemm16x16_mfma_write_agpr_mfma_srca_read_overlap
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 2
+# GCN-NEXT: V_MFMA
+name:            sgemm16x16_mfma_write_agpr_mfma_srca_read_overlap
+body:             |
+  bb.0:
+    $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_16X16X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+    $agpr20_agpr21_agpr22_agpr23 = V_MFMA_F32_4X4X1F32_e64 $agpr1, $vgpr0, $agpr20_agpr21_agpr22_agpr23, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: sgemm32x32_mfma_write_agpr_mfma_srca_read_overlap
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 2
+# GCN-NEXT: V_MFMA
+name:            sgemm32x32_mfma_write_agpr_mfma_srca_read_overlap
+body:             |
+  bb.0:
+    $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+    $agpr20_agpr21_agpr22_agpr23 = V_MFMA_F32_4X4X1F32_e64 $agpr1, $vgpr0, $agpr20_agpr21_agpr22_agpr23, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: sgemm4x4_mfma_write_vgpr_mfma_srca_read_overlap
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 4
+# GCN-NEXT: V_MFMA
+name:            sgemm4x4_mfma_write_vgpr_mfma_srca_read_overlap
+body:             |
+  bb.0:
+    $vgpr0_vgpr1_vgpr2_vgpr3 = V_MFMA_F32_4X4X1F32_vgprcd_e64 $vgpr1, $vgpr0, $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec
+    $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr0, $agpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: sgemm4x4_mfma_write_vgpr_dmfma4x4_srca_read_overlap
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 4
+# GCN-NEXT: V_MFMA
+name:            sgemm4x4_mfma_write_vgpr_dmfma4x4_srca_read_overlap
+body:             |
+  bb.0:
+    $vgpr0_vgpr1_vgpr2_vgpr3 = V_MFMA_F32_4X4X1F32_vgprcd_e64 $vgpr1, $vgpr0, $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr2_vgpr3 = V_MFMA_F64_4X4X4F64_vgprcd_e64 $vgpr0_vgpr1, $vgpr4_vgpr5, $vgpr4_vgpr5, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: sgemm4x4_mfma_write_vgpr_dmfma16x16_srca_read_overlap
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 4
+# GCN-NEXT: V_MFMA
+name:            sgemm4x4_mfma_write_vgpr_dmfma16x16_srca_read_overlap
+body:             |
+  bb.0:
+    $vgpr0_vgpr1_vgpr2_vgpr3 = V_MFMA_F32_4X4X1F32_vgprcd_e64 $vgpr1, $vgpr0, $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr2_vgpr3 = V_MFMA_F64_4X4X4F64_vgprcd_e64 $vgpr0_vgpr1, $vgpr4_vgpr5, $vgpr4_vgpr5, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: dgemm4x4_mfma_write_vgpr_mfma_srca_read_overlap
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 5
+# GCN-NEXT: V_MFMA
+name:            dgemm4x4_mfma_write_vgpr_mfma_srca_read_overlap
+body:             |
+  bb.0:
+    $vgpr2_vgpr3 = V_MFMA_F64_4X4X4F64_vgprcd_e64 $vgpr0_vgpr1, $vgpr0_vgpr1, $vgpr0_vgpr1, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17 = V_MFMA_F64_16X16X4F64_vgprcd_e64 $vgpr2_vgpr3, $vgpr10_vgpr11, $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: dgemm16x16_mfma_write_vgpr_mfma_srca_read_overlap
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 2
+# GCN-NEXT: V_MFMA
+name:            dgemm16x16_mfma_write_vgpr_mfma_srca_read_overlap
+body:             |
+  bb.0:
+    $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_MFMA_F64_16X16X4F64_vgprcd_e64 $vgpr10_vgpr11, $vgpr10_vgpr11, $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_MFMA_F64_16X16X4F64_vgprcd_e64 $vgpr0_vgpr1, $vgpr10_vgpr11, $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: dgemm4x4_mfma_write_vgpr_sgemm_mfma_srca_read_overlap
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 5
+# GCN-NEXT: V_MFMA
+name:            dgemm4x4_mfma_write_vgpr_sgemm_mfma_srca_read_overlap
+body:             |
+  bb.0:
+    $vgpr4_vgpr5 = V_MFMA_F64_4X4X4F64_vgprcd_e64 $vgpr0_vgpr1, $vgpr0_vgpr1, $vgpr0_vgpr1, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr0_vgpr1_vgpr2_vgpr3 = V_MFMA_F32_4X4X1F32_vgprcd_e64 $vgpr4, $vgpr0, $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: dgemm16x16_mfma_write_vgpr_sgemm_mfma_srca_read_overlap
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 2
+# GCN-NEXT: V_MFMA
+name:            dgemm16x16_mfma_write_vgpr_sgemm_mfma_srca_read_overlap
+body:             |
+  bb.0:
+    $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_MFMA_F64_16X16X4F64_vgprcd_e64 $vgpr10_vgpr11, $vgpr10_vgpr11, $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr0_vgpr1_vgpr2_vgpr3 = V_MFMA_F32_4X4X1F32_vgprcd_e64 $vgpr4, $vgpr0, $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: sgemm4x4_mfma_write_vgpr_dgemm_mfma_srca_read_overlap
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 4
+# GCN-NEXT: V_MFMA
+name:            sgemm4x4_mfma_write_vgpr_dgemm_mfma_srca_read_overlap
+body:             |
+  bb.0:
+    $vgpr0_vgpr1_vgpr2_vgpr3 = V_MFMA_F32_4X4X1F32_vgprcd_e64 $vgpr4, $vgpr0, $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr4_vgpr5 = V_MFMA_F64_4X4X4F64_vgprcd_e64 $vgpr0_vgpr1, $vgpr4_vgpr5, $vgpr4_vgpr5, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: sgemm4x4_mfma_write_agpr_mfma_srcb_read_overlap
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 4
+# GCN-NEXT: V_MFMA
+name:            sgemm4x4_mfma_write_agpr_mfma_srcb_read_overlap
+body:             |
+  bb.0:
+    $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+    $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $agpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: sgemm4x4_mfma_write_vgpr_mfma_srcb_read_overlap
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 4
+# GCN-NEXT: V_MFMA
+name:            sgemm4x4_mfma_write_vgpr_mfma_srcb_read_overlap
+body:             |
+  bb.0:
+    $vgpr0_vgpr1_vgpr2_vgpr3 = V_MFMA_F32_4X4X1F32_vgprcd_e64 $vgpr1, $vgpr0, $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec
+    $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $agpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: dgemm4x4_mfma_write_vgpr_mfma_srcb_read_overlap
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 5
+# GCN-NEXT: V_MFMA
+name:            dgemm4x4_mfma_write_vgpr_mfma_srcb_read_overlap
+body:             |
+  bb.0:
+    $vgpr2_vgpr3 = V_MFMA_F64_4X4X4F64_vgprcd_e64 $vgpr0_vgpr1, $vgpr0_vgpr1, $vgpr0_vgpr1, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr2_vgpr3 = V_MFMA_F64_4X4X4F64_vgprcd_e64 $vgpr0_vgpr1, $vgpr2_vgpr3, $vgpr0_vgpr1, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: dgemm16x16_mfma_write_vgpr_mfma_srcb_read_overlap
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 2
+# GCN-NEXT: V_MFMA
+name:            dgemm16x16_mfma_write_vgpr_mfma_srcb_read_overlap
+body:             |
+  bb.0:
+    $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_MFMA_F64_16X16X4F64_vgprcd_e64 $vgpr10_vgpr11, $vgpr10_vgpr11, $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_MFMA_F64_16X16X4F64_vgprcd_e64 $vgpr10_vgpr11, $vgpr0_vgpr1, $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: smfma4x4_write_vgpr_vm_read
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 4
+# GCN-NEXT: BUFFER_STORE_DWORD
+name:            smfma4x4_write_vgpr_vm_read
+body:             |
+  bb.0:
+    $vgpr0_vgpr1_vgpr2_vgpr3 = V_MFMA_F32_4X4X1F32_vgprcd_e64 $vgpr1, $vgpr0, $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec
+    BUFFER_STORE_DWORD_OFFSET $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: smfma4x4_write_vgpr_flat_read
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 4
+# GCN-NEXT: FLAT_STORE_DWORD
+name:            smfma4x4_write_vgpr_flat_read
+body:             |
+  bb.0:
+    $vgpr0_vgpr1_vgpr2_vgpr3 = V_MFMA_F32_4X4X1F32_vgprcd_e64 $vgpr1, $vgpr0, $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec
+    FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr4, 0, 0, 0, 0, 0, implicit $mode, implicit $exec, implicit $flat_scr
+...
+# GCN-LABEL: name: smfma4x4_write_vgpr_lds_read
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 4
+# GCN-NEXT: DS_WRITE_B32
+name:            smfma4x4_write_vgpr_lds_read
+body:             |
+  bb.0:
+    $vgpr0_vgpr1_vgpr2_vgpr3 = V_MFMA_F32_4X4X1F32_vgprcd_e64 $vgpr1, $vgpr0, $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec
+    DS_WRITE_B32 $vgpr0, $vgpr4, 0, 0, implicit $m0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: smfma4x4_write_vgpr_exp_read
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 4
+# GCN-NEXT: EXP_DONE
+name:            smfma4x4_write_vgpr_exp_read
+body:             |
+  bb.0:
+    $vgpr0_vgpr1_vgpr2_vgpr3 = V_MFMA_F32_4X4X1F32_vgprcd_e64 $vgpr1, $vgpr0, $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec
+    EXP_DONE 12, $vgpr4, $vgpr0, $vgpr0, $vgpr0, 0, 0, 15, implicit $exec
+...
+# GCN-LABEL: name: smfma16x16_write_vgpr_flat_read
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 2
+# GCN-NEXT: FLAT_STORE_DWORD
+name:            smfma16x16_write_vgpr_flat_read
+body:             |
+  bb.0:
+    $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = V_MFMA_F32_16X16X1F32_vgprcd_e64 $vgpr16, $vgpr17, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec
+    FLAT_STORE_DWORD $vgpr16_vgpr17, $vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec, implicit $flat_scr
+...
+# GCN-LABEL: name: smfma32x32_write_vgpr_flat_read
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 2
+# GCN-NEXT: FLAT_STORE_DWORD
+name:            smfma32x32_write_vgpr_flat_read
+body:             |
+  bb.0:
+    $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = V_MFMA_F32_32X32X2F32_vgprcd_e64 $vgpr1, $vgpr0, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec
+    FLAT_STORE_DWORD $vgpr16_vgpr17, $vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec, implicit $flat_scr
+...
+# GCN-LABEL: name: dmfma4x4_write_vgpr_flat_read_overlap
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: FLAT_STORE_DWORD
+name:            dmfma4x4_write_vgpr_flat_read_overlap
+body:             |
+  bb.0:
+    $vgpr3_vgpr4 = V_MFMA_F64_4X4X4F64_vgprcd_e64 $vgpr0_vgpr1, $vgpr0_vgpr1, $vgpr0_vgpr1, 0, 0, 0, implicit $mode, implicit $exec
+    FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr4, 0, 0, 0, 0, 0, implicit $mode, implicit $exec, implicit $flat_scr
+...
+# GCN-LABEL: name: dmfma4x4_write_vgpr_flat_read_full
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: FLAT_STORE_DWORD
+name:            dmfma4x4_write_vgpr_flat_read_full
+body:             |
+  bb.0:
+    $vgpr3_vgpr4 = V_MFMA_F64_4X4X4F64_vgprcd_e64 $vgpr0_vgpr1, $vgpr0_vgpr1, $vgpr0_vgpr1, 0, 0, 0, implicit $mode, implicit $exec
+    FLAT_STORE_DWORDX2 $vgpr0_vgpr1, $vgpr3_vgpr4, 0, 0, 0, 0, 0, implicit $mode, implicit $exec, implicit $flat_scr
+...
+# GCN-LABEL: name: dmfma16x16_write_vgpr_flat_read
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 1
+# GCN-NEXT: FLAT_STORE_DWORD
+name:            dmfma16x16_write_vgpr_flat_read
+body:             |
+  bb.0:
+    $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 = V_MFMA_F64_16X16X4F64_vgprcd_e64 $vgpr10_vgpr11, $vgpr10_vgpr11, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, implicit $mode, implicit $exec
+    FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr4, 0, 0, 0, 0, 0, implicit $mode, implicit $exec, implicit $flat_scr
+...
+# GCN-LABEL: name: smfma4x4_write_vgpr_valu_read
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 4
+# GCN-NEXT: V_MOV_B32
+name:            smfma4x4_write_vgpr_valu_read
+body:             |
+  bb.0:
+    $vgpr0_vgpr1_vgpr2_vgpr3 = V_MFMA_F32_4X4X1F32_vgprcd_e64 $vgpr1, $vgpr0, $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr4 = V_MOV_B32_e32 $vgpr0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: smfma16x16_write_vgpr_valu_read
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 2
+# GCN-NEXT: V_MOV_B32
+name:            smfma16x16_write_vgpr_valu_read
+body:             |
+  bb.0:
+    $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = V_MFMA_F32_16X16X1F32_vgprcd_e64 $vgpr16, $vgpr17, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr16 = V_MOV_B32_e32 $vgpr0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: smfma32x32_write_vgpr_valu_read
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 2
+# GCN-NEXT: V_MOV_B32
+name:            smfma32x32_write_vgpr_valu_read
+body:             |
+  bb.0:
+    $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = V_MFMA_F32_32X32X2F32_vgprcd_e64 $vgpr1, $vgpr0, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr16 = V_MOV_B32_e32 $vgpr0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: dmfma4x4_write_vgpr_valu_read
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 5
+# GCN-NEXT: V_MOV_B32
+name:            dmfma4x4_write_vgpr_valu_read
+body:             |
+  bb.0:
+    $vgpr3_vgpr4 = V_MFMA_F64_4X4X4F64_vgprcd_e64 $vgpr0_vgpr1, $vgpr0_vgpr1, $vgpr0_vgpr1, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr5 = V_MOV_B32_e32 $vgpr4, implicit $exec
+...
+# GCN-LABEL: name: dmfma16x16_write_vgpr_valu_read
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 2
+# GCN-NEXT: V_MOV_B32
+name:            dmfma16x16_write_vgpr_valu_read
+body:             |
+  bb.0:
+    $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 = V_MFMA_F64_16X16X4F64_vgprcd_e64 $vgpr10_vgpr11, $vgpr10_vgpr11, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr12 = V_MOV_B32_e32 $vgpr4, implicit $exec
+...
+# GCN-LABEL: name: smfma4x4_write_vgpr_accv_read
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 4
+# GCN-NEXT: V_ACCVGPR_WRITE_B32_e64
+name:            smfma4x4_write_vgpr_accv_read
+body:             |
+  bb.0:
+    $vgpr0_vgpr1_vgpr2_vgpr3 = V_MFMA_F32_4X4X1F32_vgprcd_e64 $vgpr1, $vgpr0, $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec
+    $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: smfma16x16_write_vgpr_accv_read
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 2
+# GCN-NEXT: V_ACCVGPR_WRITE_B32_e64
+name:            smfma16x16_write_vgpr_accv_read
+body:             |
+  bb.0:
+    $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = V_MFMA_F32_16X16X1F32_vgprcd_e64 $vgpr16, $vgpr17, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec
+    $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: smfma32x32_write_vgpr_accv_read
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 2
+# GCN-NEXT: V_ACCVGPR_WRITE_B32_e64
+name:            smfma32x32_write_vgpr_accv_read
+body:             |
+  bb.0:
+    $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = V_MFMA_F32_32X32X2F32_vgprcd_e64 $vgpr1, $vgpr0, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec
+    $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: smfma4x4_write_vgpr_dot_read
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 4
+# GCN-NEXT: V_DOT
+name:            smfma4x4_write_vgpr_dot_read
+body:             |
+  bb.0:
+    $vgpr0_vgpr1_vgpr2_vgpr3 = V_MFMA_F32_4X4X1F32_vgprcd_e64 $vgpr1, $vgpr0, $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr4 = V_DOT4C_I32_I8_e32 $vgpr0, $vgpr1, $vgpr4, implicit $exec
+...
+# GCN-LABEL: name: dmfma4x4_write_vgpr_dot_read
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 5
+# GCN-NEXT: V_DOT
+name:            dmfma4x4_write_vgpr_dot_read
+body:             |
+  bb.0:
+    $vgpr3_vgpr4 = V_MFMA_F64_4X4X4F64_vgprcd_e64 $vgpr0_vgpr1, $vgpr0_vgpr1, $vgpr0_vgpr1, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr1 = V_DOT4C_I32_I8_e32 $vgpr0, $vgpr4, $vgpr1, implicit $exec
+...
+# GCN-LABEL: name: dmfma16x16_write_vgpr_dot_read
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 2
+# GCN-NEXT: V_DOT
+name:            dmfma16x16_write_vgpr_dot_read
+body:             |
+  bb.0:
+    $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 = V_MFMA_F64_16X16X4F64_vgprcd_e64 $vgpr10_vgpr11, $vgpr10_vgpr11, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr1 = V_DOT4C_I32_I8_e32 $vgpr0, $vgpr4, $vgpr1, implicit $exec
+...
+# GCN-LABEL: name: smfma4x4_write_vgpr_valu_write
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 4
+# GCN-NEXT: V_MOV_B32
+name:            smfma4x4_write_vgpr_valu_write
+body:             |
+  bb.0:
+    $vgpr0_vgpr1_vgpr2_vgpr3 = V_MFMA_F32_4X4X1F32_vgprcd_e64 $vgpr4, $vgpr0, $vgpr6_vgpr7_vgpr8_vgpr9, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr1 = V_MOV_B32_e32 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: smfma16x16_write_vgpr_valu_write
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 2
+# GCN-NEXT: V_MOV_B32
+name:            smfma16x16_write_vgpr_valu_write
+body:             |
+  bb.0:
+    $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = V_MFMA_F32_16X16X1F32_vgprcd_e64 $vgpr16, $vgpr17, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr1 = V_MOV_B32_e32 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: smfma32x32_write_vgpr_valu_write
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 2
+# GCN-NEXT: V_MOV_B32
+name:            smfma32x32_write_vgpr_valu_write
+body:             |
+  bb.0:
+    $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = V_MFMA_F32_32X32X2F32_vgprcd_e64 $vgpr1, $vgpr0, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr1 = V_MOV_B32_e32 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: smfma4x4_write_vgpr_valu_f16_write
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 4
+# GCN-NEXT: V_FMA_F16_e64
+name:            smfma4x4_write_vgpr_valu_f16_write
+body:             |
+  bb.0:
+    $vgpr0_vgpr1_vgpr2_vgpr3 = V_MFMA_F32_4X4X1F32_vgprcd_e64 $vgpr1, $vgpr0, $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr1 = V_FMA_F16_e64 0, 0, 0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: smfma16x16_write_vgpr_valu_f16_write
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 2
+# GCN-NEXT: V_FMA_F16_e64
+name:            smfma16x16_write_vgpr_valu_f16_write
+body:             |
+  bb.0:
+    $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = V_MFMA_F32_16X16X1F32_vgprcd_e64 $vgpr16, $vgpr17, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr1 = V_FMA_F16_e64 0, 0, 0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: smfma32x32_write_vgpr_valu_f16_write
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 2
+# GCN-NEXT: V_FMA_F16_e64
+name:            smfma32x32_write_vgpr_valu_f16_write
+body:             |
+  bb.0:
+    $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = V_MFMA_F32_32X32X2F32_vgprcd_e64 $vgpr1, $vgpr0, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr1 = V_FMA_F16_e64 0, 0, 0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: smfma4x4_write_vgpr_valu_sdwa_write
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 4
+# GCN-NEXT: V_MOV_B32_sdwa
+name:            smfma4x4_write_vgpr_valu_sdwa_write
+body:             |
+  bb.0:
+    $vgpr0_vgpr1_vgpr2_vgpr3 = V_MFMA_F32_4X4X1F32_vgprcd_e64 $vgpr1, $vgpr0, $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr1 = V_MOV_B32_sdwa 0, $vgpr16, 0, 5, 2, 4, implicit $exec, implicit $vgpr1(tied-def 0)
+...
+# GCN-LABEL: name: smfma16x16_write_vgpr_valu_sdwa_write
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 2
+# GCN-NEXT: V_MOV_B32_sdwa
+name:            smfma16x16_write_vgpr_valu_sdwa_write
+body:             |
+  bb.0:
+    $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = V_MFMA_F32_16X16X1F32_vgprcd_e64 $vgpr16, $vgpr17, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr1 = V_MOV_B32_sdwa 0, $vgpr16, 0, 5, 2, 4, implicit $exec, implicit $vgpr1(tied-def 0)
+...
+# GCN-LABEL: name: smfma32x32_write_vgpr_valu_sdwa_write
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 2
+# GCN-NEXT: V_MOV_B32_sdwa
+name:            smfma32x32_write_vgpr_valu_sdwa_write
+body:             |
+  bb.0:
+    $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = V_MFMA_F32_32X32X2F32_vgprcd_e64 $vgpr1, $vgpr0, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr1 = V_MOV_B32_sdwa 0, $vgpr16, 0, 5, 2, 4, implicit $exec, implicit $vgpr1(tied-def 0)
+...
+# GCN-LABEL: name: dmfma4x4_write_vgpr_valu_write
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 5
+# GCN-NEXT: V_MOV_B32
+name:            dmfma4x4_write_vgpr_valu_write
+body:             |
+  bb.0:
+    $vgpr3_vgpr4 = V_MFMA_F64_4X4X4F64_vgprcd_e64 $vgpr0_vgpr1, $vgpr0_vgpr1, $vgpr0_vgpr1, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr3 = V_MOV_B32_e32 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: dmfma16x16_write_vgpr_valu_write
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 2
+# GCN-NEXT: V_MOV_B32
+name:            dmfma16x16_write_vgpr_valu_write
+body:             |
+  bb.0:
+    $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 = V_MFMA_F64_16X16X4F64_vgprcd_e64 $vgpr10_vgpr11, $vgpr10_vgpr11, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr3 = V_MOV_B32_e32 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: smfma4x4_write_vgpr_accv_write
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 4
+# GCN-NEXT: V_ACCVGPR_READ_B32_e64
+name:            smfma4x4_write_vgpr_accv_write
+body:             |
+  bb.0:
+    $vgpr0_vgpr1_vgpr2_vgpr3 = V_MFMA_F32_4X4X1F32_vgprcd_e64 $vgpr1, $vgpr0, $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: smfma4x4_write_vgpr_dot_write
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 4
+# GCN-NEXT: V_DOT
+name:            smfma4x4_write_vgpr_dot_write
+body:             |
+  bb.0:
+    $vgpr0_vgpr1_vgpr2_vgpr3 = V_MFMA_F32_4X4X1F32_vgprcd_e64 $vgpr1, $vgpr0, $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr1 = V_DOT8_I32_I4 0, $vgpr4, 0, $vgpr4, 0, $vgpr4, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: smfma4x4_read_srcc_vgpr_valu_write
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: V_MOV_B32
+name:            smfma4x4_read_srcc_vgpr_valu_write
+body:             |
+  bb.0:
+    $vgpr4_vgpr5_vgpr6_vgpr7 = V_MFMA_F32_4X4X1F32_vgprcd_e64 $vgpr8, $vgpr9, $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr1 = V_MOV_B32_e32 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: smfma16x16_read_srcc_vgpr_valu_write
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 6
+# GCN-NEXT: V_MOV_B32
+name:            smfma16x16_read_srcc_vgpr_valu_write
+body:             |
+  bb.0:
+    $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17 = V_MFMA_F32_16X16X1F32_vgprcd_e64 $vgpr18, $vgpr19, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr1 = V_MOV_B32_e32 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: smfma32x32_read_srcc_vgpr_valu_write
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 6
+# GCN-NEXT: V_MOV_B32
+name:            smfma32x32_read_srcc_vgpr_valu_write
+body:             |
+  bb.0:
+    $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17 = V_MFMA_F32_32X32X2F32_vgprcd_e64 $vgpr0, $vgpr0, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr1 = V_MOV_B32_e32 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: smfma4x4_read_srca_vgpr_valu_write
+# GCN:      V_MFMA
+# GCN-NEXT: V_MOV_B32
+name:            smfma4x4_read_srca_vgpr_valu_write
+body:             |
+  bb.0:
+    $vgpr4_vgpr5_vgpr6_vgpr7 = V_MFMA_F32_4X4X1F32_vgprcd_e64 $vgpr8, $vgpr9, $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr8 = V_MOV_B32_e32 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: smfma16x16_read_srca_vgpr_valu_write
+# GCN:      V_MFMA
+# GCN-NEXT: V_MOV_B32
+name:            smfma16x16_read_srca_vgpr_valu_write
+body:             |
+  bb.0:
+    $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17 = V_MFMA_F32_16X16X1F32_vgprcd_e64 $vgpr18, $vgpr19, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr18 = V_MOV_B32_e32 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: smfma32x32_read_srca_vgpr_valu_write
+# GCN:      V_MFMA
+# GCN-NEXT: V_MOV_B32
+name:            smfma32x32_read_srca_vgpr_valu_write
+body:             |
+  bb.0:
+    $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17 = V_MFMA_F32_32X32X2F32_vgprcd_e64 $vgpr18, $vgpr19, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr18 = V_MOV_B32_e32 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: smfma4x4_read_srcb_vgpr_valu_write
+# GCN:      V_MFMA
+# GCN-NEXT: V_MOV_B32
+name:            smfma4x4_read_srcb_vgpr_valu_write
+body:             |
+  bb.0:
+    $vgpr4_vgpr5_vgpr6_vgpr7 = V_MFMA_F32_4X4X1F32_vgprcd_e64 $vgpr8, $vgpr9, $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr9 = V_MOV_B32_e32 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: smfma16x16_read_srcb_vgpr_valu_write
+# GCN:      V_MFMA
+# GCN-NEXT: V_MOV_B32
+name:            smfma16x16_read_srcb_vgpr_valu_write
+body:             |
+  bb.0:
+    $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17 = V_MFMA_F32_16X16X1F32_vgprcd_e64 $vgpr18, $vgpr19, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr19 = V_MOV_B32_e32 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: smfma32x32_read_srcb_vgpr_valu_write
+# GCN:      V_MFMA
+# GCN-NEXT: V_MOV_B32
+name:            smfma32x32_read_srcb_vgpr_valu_write
+body:             |
+  bb.0:
+    $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17 = V_MFMA_F32_32X32X2F32_vgprcd_e64 $vgpr18, $vgpr19, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr19 = V_MOV_B32_e32 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: dmfma4x4_read_srcc_vgpr_valu_write
+# GCN:      V_MFMA
+# GCN-NEXT: V_MOV_B32
+name:            dmfma4x4_read_srcc_vgpr_valu_write
+body:             |
+  bb.0:
+    $vgpr3_vgpr4 = V_MFMA_F64_4X4X4F64_vgprcd_e64 $vgpr0_vgpr1, $vgpr0_vgpr1, $vgpr0_vgpr1, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr1 = V_MOV_B32_e32 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: dmfma16x16_read_srcc_vgpr_valu_write
+# GCN:      V_MFMA
+# GCN-NEXT: V_MOV_B32
+name:            dmfma16x16_read_srcc_vgpr_valu_write
+body:             |
+  bb.0:
+    $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 = V_MFMA_F64_16X16X4F64_vgprcd_e64 $vgpr10_vgpr11, $vgpr10_vgpr11, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr1 = V_MOV_B32_e32 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: smfma16x16_read_srcc_vgpr_accv_write
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 6
+# GCN-NEXT: V_ACCVGPR_WRITE_B32_e64
+name:            smfma16x16_read_srcc_vgpr_accv_write
+body:             |
+  bb.0:
+    $agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17 = V_MFMA_F32_16X16X1F32_e64 $agpr18, $agpr19, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+    $agpr1 = V_ACCVGPR_WRITE_B32_e64 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: sgemm_to_fma64
+# GCN:      V_MFMA
+# GCN-NEXT: V_FMA_F64_e64
+name:            sgemm_to_fma64
+body:             |
+  bb.0:
+    $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr4_vgpr5 = V_FMA_F64_e64 0, $vgpr4_vgpr5, 0, $vgpr4_vgpr5, 0, $vgpr4_vgpr5, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: dgemm_to_fma64
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 1
+# GCN-NEXT: V_FMA_F64_e64
+name:            dgemm_to_fma64
+body:             |
+  bb.0:
+    $vgpr0_vgpr1 = V_MFMA_F64_4X4X4F64_vgprcd_e64 $vgpr0_vgpr1, $vgpr0_vgpr1, $vgpr0_vgpr1, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr4_vgpr5 = V_FMA_F64_e64 0, $vgpr4_vgpr5, 0, $vgpr4_vgpr5, 0, $vgpr4_vgpr5, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: dgemm_to_fmac64
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 1
+# GCN-NEXT: V_FMAC_F64
+name:            dgemm_to_fmac64
+body:             |
+  bb.0:
+    $vgpr0_vgpr1 = V_MFMA_F64_4X4X4F64_vgprcd_e64 $vgpr0_vgpr1, $vgpr0_vgpr1, $vgpr0_vgpr1, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr4_vgpr5 = V_FMAC_F64_e32 $vgpr4_vgpr5, $vgpr4_vgpr5, $vgpr4_vgpr5, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: flat_store_data_agpr_overwritten
+# GCN:      FLAT_STORE_DWORDX4
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: V_ACCVGPR_WRITE_B32_e64
+name: flat_store_data_agpr_overwritten
+body: |
+  bb.0:
+    FLAT_STORE_DWORDX4 $vgpr4_vgpr5, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, 0, 0, implicit $mode, implicit $exec, implicit $flat_scr
+    $agpr0 = V_ACCVGPR_WRITE_B32_e64 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: dot_write_vgpr_accv_read
+# GCN:      V_DOT
+# GCN-NEXT: S_NOP 2
+# GCN-NEXT: V_ACCVGPR_WRITE_B32_e64
+name:            dot_write_vgpr_accv_read
+body:             |
+  bb.0:
+    $vgpr4 = V_DOT4C_I32_I8_e32 $vgpr0, $vgpr1, $vgpr4, implicit $exec
+    $agpr5 = V_ACCVGPR_WRITE_B32_e64 $vgpr4, implicit $exec
+...
+# GCN-LABEL: name: valu_write_vgpr_dot_read
+# GCN:      V_MOV_B32
+# GCN-NEXT: V_DOT
+name:            valu_write_vgpr_dot_read
+body:             |
+  bb.0:
+    $vgpr0 = V_MOV_B32_e32 $vgpr1, implicit $exec
+    $vgpr4 = V_DOT4C_I32_I8_e32 $vgpr0, $vgpr1, $vgpr4, implicit $exec
+...
+# GCN-LABEL: name: accv_write_vgpr_dot_read
+# GCN:      V_ACCVGPR_READ
+# GCN-NEXT: V_DOT
+name:            accv_write_vgpr_dot_read
+body:             |
+  bb.0:
+    $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec
+    $vgpr4 = V_DOT4C_I32_I8_e32 $vgpr0, $vgpr1, $vgpr4, implicit $exec
+...
+# GCN-LABEL: name: dot_write_vgpr_same_dot_read_srcc
+# GCN:      V_DOT
+# GCN-NEXT: V_DOT
+name:            dot_write_vgpr_same_dot_read_srcc
+body:             |
+  bb.0:
+    $vgpr4 = V_DOT4C_I32_I8_e32 $vgpr0, $vgpr1, $vgpr4, implicit $exec
+    $vgpr4 = V_DOT4C_I32_I8_e32 $vgpr0, $vgpr1, $vgpr4, implicit $exec
+...
+# GCN-LABEL: name: dot_write_vgpr_
diff erent_dot_read_srcc
+# GCN:      V_DOT
+# GCN-NEXT: S_NOP 2
+# GCN-NEXT: V_DOT
+name:            dot_write_vgpr_
diff erent_dot_read_srcc
+body:             |
+  bb.0:
+    $vgpr4 = V_DOT4C_I32_I8_e32 $vgpr0, $vgpr1, $vgpr4, implicit $exec
+    $vgpr1 = V_DOT8_I32_I4 0, $vgpr0, 0, $vgpr0, 0, $vgpr4, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: dot_write_vgpr_
diff erent_dot_write
+# GCN:      V_DOT
+# GCN-NEXT: S_NOP 2
+# GCN-NEXT: V_DOT
+name:            dot_write_vgpr_
diff erent_dot_write
+body:             |
+  bb.0:
+    $vgpr4 = V_DOT4C_I32_I8_e32 $vgpr0, $vgpr1, $vgpr4, implicit $exec
+    $vgpr4 = V_DOT8_I32_I4 0, $vgpr0, 0, $vgpr0, 0, $vgpr0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: dot_write_vgpr_
diff erent_valu_read
+# GCN:      V_DOT
+# GCN-NEXT: S_NOP 2
+# GCN-NEXT: V_MOV_B32_e32
+name:            dot_write_vgpr_
diff erent_valu_read
+body:             |
+  bb.0:
+    $vgpr4 = V_DOT4C_I32_I8_e32 $vgpr0, $vgpr1, $vgpr4, implicit $exec
+    $vgpr0 = V_MOV_B32_e32 $vgpr4, implicit $exec
+...
+# GCN-LABEL: name: dot_write_vgpr_
diff erent_valu_write
+# GCN:      V_DOT
+# GCN-NEXT: S_NOP 2
+# GCN-NEXT: V_MOV_B32_e32
+name:            dot_write_vgpr_
diff erent_valu_write
+body:             |
+  bb.0:
+    $vgpr4 = V_DOT4C_I32_I8_e32 $vgpr0, $vgpr1, $vgpr4, implicit $exec
+    $vgpr4 = V_MOV_B32_e32 1, implicit $exec
+...
+# GCN-LABEL: name: dot_write_vgpr_same_dot_read_srca
+# GCN:      V_DOT
+# GCN-NEXT: S_NOP 2
+# GCN-NEXT: V_DOT
+name:            dot_write_vgpr_same_dot_read_srca
+body:             |
+  bb.0:
+    $vgpr4 = V_DOT4C_I32_I8_e32 $vgpr0, $vgpr1, $vgpr4, implicit $exec
+    $vgpr0 = V_DOT4C_I32_I8_e32 $vgpr4, $vgpr1, $vgpr0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: dot_write_vgpr_same_dot_read_srcb
+# GCN:      V_DOT
+# GCN-NEXT: S_NOP 2
+# GCN-NEXT: V_DOT
+name:            dot_write_vgpr_same_dot_read_srcb
+body:             |
+  bb.0:
+    $vgpr4 = V_DOT4C_I32_I8_e32 $vgpr0, $vgpr1, $vgpr4, implicit $exec
+    $vgpr0 = V_DOT4C_I32_I8_e32 $vgpr0, $vgpr4, $vgpr0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: vcmpx_write_exec_mfma
+# GCN:      V_CMPX_EQ_I32_e32
+# GCN-NEXT: S_NOP 3
+# GCN-NEXT: V_MFMA
+name:            vcmpx_write_exec_mfma
+body:             |
+  bb.0:
+    implicit $exec, implicit $vcc = V_CMPX_EQ_I32_e32 $vgpr0, $vgpr1, implicit $exec
+    $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 killed $agpr8, killed $vgpr1, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: valu_write_agpr_dgemm_mfma_read
+# GCN:      V_ACCVGPR_WRITE_B32_e64
+# GCN:      V_ACCVGPR_WRITE_B32_e64
+# GCN-NEXT: S_NOP 1
+# GCN-NEXT: V_MFMA
+name:            valu_write_agpr_dgemm_mfma_read
+body:             |
+  bb.0:
+    $agpr0 = V_ACCVGPR_WRITE_B32_e64 1, implicit $exec
+    $agpr1 = V_ACCVGPR_WRITE_B32_e64 1, implicit $exec
+    $agpr2_agpr3 = V_MFMA_F64_4X4X4F64_e64 $agpr0_agpr1, $agpr0_agpr1, $agpr0_agpr1, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: dgemm16x16_mfma_write_agpr_mfma_read_same_agpr_as_srcc
+# GCN:      V_MFMA
+# GCN-NEXT: V_MFMA
+name:            dgemm16x16_mfma_write_agpr_mfma_read_same_agpr_as_srcc
+body:             |
+  bb.0:
+    $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 = V_MFMA_F64_16X16X4F64_e64 $agpr10_agpr11, $agpr10_agpr11, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
+    $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 = V_MFMA_F64_16X16X4F64_e64 $agpr10_agpr11, $agpr10_agpr11, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: dgemm4x4_mfma_write_agpr_mfma_read_same_agpr_as_srcc
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 3
+# GCN-NEXT: V_MFMA
+name:            dgemm4x4_mfma_write_agpr_mfma_read_same_agpr_as_srcc
+body:             |
+  bb.0:
+    $agpr2_agpr3 = V_MFMA_F64_4X4X4F64_e64 $agpr0_agpr1, $vgpr0_vgpr1, $agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+    $agpr2_agpr3 = V_MFMA_F64_4X4X4F64_e64 $agpr0_agpr1, $vgpr0_vgpr1, $agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: dgemm16x16_mfma_write_agpr_mfma_read_overlap
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: V_MFMA
+name:            dgemm16x16_mfma_write_agpr_mfma_read_overlap
+body:             |
+  bb.0:
+    $agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9 = V_MFMA_F64_16X16X4F64_e64 $vgpr10_vgpr11, $vgpr10_vgpr11, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
+    $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 = V_MFMA_F64_16X16X4F64_e64 $vgpr10_vgpr11, $vgpr10_vgpr11, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: dgemm4x4_mfma_write_agpr_mfma_read_overlap
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 3
+# GCN-NEXT: V_MFMA
+name:            dgemm4x4_mfma_write_agpr_mfma_read_overlap
+body:             |
+  bb.0:
+    $agpr2_agpr3 = V_MFMA_F64_4X4X4F64_e64 $vgpr0_vgpr1, $vgpr0_vgpr1, $agpr0_agpr1, 0, 0, 0, implicit $mode, implicit $exec
+    $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 = V_MFMA_F64_16X16X4F64_e64 $vgpr10_vgpr11, $vgpr10_vgpr11, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: dgemm16x16_mfma_write_agpr_sgemm_mfma_read_overlap
+# GCN:      V_MFMA
+# GCN-NEXT: V_MFMA
+name:            dgemm16x16_mfma_write_agpr_sgemm_mfma_read_overlap
+body:             |
+  bb.0:
+    $agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9 = V_MFMA_F64_16X16X4F64_e64 $vgpr10_vgpr11, $vgpr10_vgpr11, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
+    $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr10, $vgpr11, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: sgemm4x4_mfma_write_agpr_dgemm_mfma_read_overlap
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 2
+# GCN-NEXT: V_MFMA
+name:            sgemm4x4_mfma_write_agpr_dgemm_mfma_read_overlap
+body:             |
+  bb.0:
+    $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr10, $vgpr11, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+    $agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9 = V_MFMA_F64_16X16X4F64_e64 $vgpr10_vgpr11, $vgpr10_vgpr11, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: sgemm16x16_mfma_write_sgpr_dgemm_mfma_read_overlap
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: V_MFMA
+name:            sgemm16x16_mfma_write_sgpr_dgemm_mfma_read_overlap
+body:             |
+  bb.0:
+    $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_16X16X1F32_e64 $vgpr26, $vgpr27, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+    $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 = V_MFMA_F64_16X16X4F64_e64 $vgpr20_vgpr21, $vgpr20_vgpr21, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: sgemm32x32_mfma_write_agpr_dgemm_mfma_read_overlap
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: V_MFMA
+name:            sgemm32x32_mfma_write_agpr_dgemm_mfma_read_overlap
+body:             |
+  bb.0:
+    $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr26, $vgpr27, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+    $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 = V_MFMA_F64_16X16X4F64_e64 $vgpr20_vgpr21, $vgpr20_vgpr21, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: sgemm4x4_mfma_write_agpr_dmfma4x4_srca_read_overlap
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 4
+# GCN-NEXT: V_MFMA
+name:            sgemm4x4_mfma_write_agpr_dmfma4x4_srca_read_overlap
+body:             |
+  bb.0:
+    $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr2_vgpr3 = V_MFMA_F64_4X4X4F64_vgprcd_e64 $agpr0_agpr1, $vgpr4_vgpr5, $vgpr4_vgpr5, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: sgemm4x4_mfma_write_agpr_dmfma16x16_srca_read_overlap
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 4
+# GCN-NEXT: V_MFMA
+name:            sgemm4x4_mfma_write_agpr_dmfma16x16_srca_read_overlap
+body:             |
+  bb.0:
+    $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr2_vgpr3 = V_MFMA_F64_4X4X4F64_vgprcd_e64 $agpr0_agpr1, $vgpr4_vgpr5, $vgpr4_vgpr5, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: dgemm4x4_mfma_write_agpr_mfma_srca_read_overlap
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 5
+# GCN-NEXT: V_MFMA
+name:            dgemm4x4_mfma_write_agpr_mfma_srca_read_overlap
+body:             |
+  bb.0:
+    $agpr2_agpr3 = V_MFMA_F64_4X4X4F64_e64 $vgpr0_vgpr1, $vgpr0_vgpr1, $agpr0_agpr1, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17 = V_MFMA_F64_16X16X4F64_vgprcd_e64 $agpr2_agpr3, $vgpr10_vgpr11, $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: dgemm16x16_mfma_write_agpr_mfma_srca_read_overlap
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 2
+# GCN-NEXT: V_MFMA
+name:            dgemm16x16_mfma_write_agpr_mfma_srca_read_overlap
+body:             |
+  bb.0:
+    $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 = V_MFMA_F64_16X16X4F64_e64 $vgpr10_vgpr11, $vgpr10_vgpr11, $agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_MFMA_F64_16X16X4F64_vgprcd_e64 $agpr0_agpr1, $vgpr10_vgpr11, $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: dgemm4x4_mfma_write_agpr_sgemm_mfma_srca_read_overlap
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 5
+# GCN-NEXT: V_MFMA
+name:            dgemm4x4_mfma_write_agpr_sgemm_mfma_srca_read_overlap
+body:             |
+  bb.0:
+    $agpr4_agpr5 = V_MFMA_F64_4X4X4F64_e64 $vgpr0_vgpr1, $vgpr0_vgpr1, $agpr0_agpr1, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr0_vgpr1_vgpr2_vgpr3 = V_MFMA_F32_4X4X1F32_vgprcd_e64 $agpr4, $vgpr0, $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: dgemm16x16_mfma_write_agpr_sgemm_mfma_srca_read_overlap
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 2
+# GCN-NEXT: V_MFMA
+name:            dgemm16x16_mfma_write_agpr_sgemm_mfma_srca_read_overlap
+body:             |
+  bb.0:
+    $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 = V_MFMA_F64_16X16X4F64_e64 $vgpr10_vgpr11, $vgpr10_vgpr11, $agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr0_vgpr1_vgpr2_vgpr3 = V_MFMA_F32_4X4X1F32_vgprcd_e64 $agpr4, $vgpr0, $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: sgemm4x4_mfma_write_agpr_dgemm_mfma_srca_read_overlap
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 4
+# GCN-NEXT: V_MFMA
+name:            sgemm4x4_mfma_write_agpr_dgemm_mfma_srca_read_overlap
+body:             |
+  bb.0:
+    $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr4, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr4_vgpr5 = V_MFMA_F64_4X4X4F64_vgprcd_e64 $agpr0_agpr1, $vgpr4_vgpr5, $vgpr4_vgpr5, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: dgemm4x4_mfma_write_agpr_mfma_srcb_read_overlap
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 5
+# GCN-NEXT: V_MFMA
+name:            dgemm4x4_mfma_write_agpr_mfma_srcb_read_overlap
+body:             |
+  bb.0:
+    $agpr2_agpr3 = V_MFMA_F64_4X4X4F64_e64 $vgpr0_vgpr1, $vgpr0_vgpr1, $agpr0_agpr1, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr2_vgpr3 = V_MFMA_F64_4X4X4F64_vgprcd_e64 $vgpr0_vgpr1, $agpr2_agpr3, $vgpr0_vgpr1, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: dgemm16x16_mfma_write_agpr_mfma_srcb_read_overlap
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 2
+# GCN-NEXT: V_MFMA
+name:            dgemm16x16_mfma_write_agpr_mfma_srcb_read_overlap
+body:             |
+  bb.0:
+    $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 = V_MFMA_F64_16X16X4F64_e64 $vgpr10_vgpr11, $vgpr10_vgpr11, $agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_MFMA_F64_16X16X4F64_vgprcd_e64 $vgpr10_vgpr11, $agpr0_agpr1, $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: dmfma4x4_write_agpr_flat_read_overlap
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: FLAT_STORE_DWORD
+name:            dmfma4x4_write_agpr_flat_read_overlap
+body:             |
+  bb.0:
+    $agpr3_agpr4 = V_MFMA_F64_4X4X4F64_e64 $vgpr0_vgpr1, $vgpr0_vgpr1, $agpr0_agpr1, 0, 0, 0, implicit $mode, implicit $exec
+    FLAT_STORE_DWORD $vgpr0_vgpr1, $agpr4, 0, 0, 0, 0, 0, implicit $mode, implicit $exec, implicit $flat_scr
+...
+# GCN-LABEL: name: dmfma4x4_write_agpr_flat_read_full
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: FLAT_STORE_DWORD
+name:            dmfma4x4_write_agpr_flat_read_full
+body:             |
+  bb.0:
+    $agpr3_agpr4 = V_MFMA_F64_4X4X4F64_e64 $vgpr0_vgpr1, $vgpr0_vgpr1, $agpr0_agpr1, 0, 0, 0, implicit $mode, implicit $exec
+    FLAT_STORE_DWORDX2 $vgpr0_vgpr1, $agpr3_agpr4, 0, 0, 0, 0, 0, implicit $mode, implicit $exec, implicit $flat_scr
+...
+# GCN-LABEL: name: dmfma16x16_write_agpr_flat_read
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 1
+# GCN-NEXT: FLAT_STORE_DWORD
+name:            dmfma16x16_write_agpr_flat_read
+body:             |
+  bb.0:
+    $agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9 = V_MFMA_F64_16X16X4F64_e64 $vgpr10_vgpr11, $vgpr10_vgpr11, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
+    FLAT_STORE_DWORD $vgpr0_vgpr1, $agpr4, 0, 0, 0, 0, 0, implicit $mode, implicit $exec, implicit $flat_scr
+...
+# GCN-LABEL: name: dmfma4x4_write_agpr_valu_read
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 5
+# GCN-NEXT: V_ACCVGPR_READ_B32_e64
+name:            dmfma4x4_write_agpr_valu_read
+body:             |
+  bb.0:
+    $agpr3_agpr4 = V_MFMA_F64_4X4X4F64_e64 $agpr0_agpr1, $vgpr0_vgpr1, $agpr0_agpr1, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr5 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec
+...
+# GCN-LABEL: name: dmfma16x16_write_agpr_valu_read
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 2
+# GCN-NEXT: V_ACCVGPR_READ_B32_e64
+name:            dmfma16x16_write_agpr_valu_read
+body:             |
+  bb.0:
+    $agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9 = V_MFMA_F64_16X16X4F64_e64 $vgpr10_vgpr11, $vgpr10_vgpr11, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr12 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec
+...
+# GCN-LABEL: name: dmfma4x4_write_agpr_valu_write
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 5
+# GCN-NEXT: V_ACCVGPR_WRITE_B32_e64
+name:            dmfma4x4_write_agpr_valu_write
+body:             |
+  bb.0:
+    $agpr3_agpr4 = V_MFMA_F64_4X4X4F64_e64 $vgpr0_vgpr1, $vgpr0_vgpr1, $agpr0_agpr1, 0, 0, 0, implicit $mode, implicit $exec
+    $agpr3 = V_ACCVGPR_WRITE_B32_e64 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: dmfma16x16_write_agpr_valu_write
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 7
+# GCN-NEXT: S_NOP 2
+# GCN-NEXT: V_ACCVGPR_WRITE_B32_e64
+name:            dmfma16x16_write_agpr_valu_write
+body:             |
+  bb.0:
+    $agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9 = V_MFMA_F64_16X16X4F64_e64 $vgpr10_vgpr11, $vgpr10_vgpr11, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
+    $agpr3 = V_ACCVGPR_WRITE_B32_e64 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: dmfma4x4_read_srcc_agpr_valu_write
+# GCN:      V_MFMA
+# GCN-NEXT: V_ACCVGPR_WRITE_B32_e64
+name:            dmfma4x4_read_srcc_agpr_valu_write
+body:             |
+  bb.0:
+    $agpr3_agpr4 = V_MFMA_F64_4X4X4F64_e64 $vgpr0_vgpr1, $vgpr0_vgpr1, $agpr0_agpr1, 0, 0, 0, implicit $mode, implicit $exec
+    $agpr1 = V_ACCVGPR_WRITE_B32_e64 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: dmfma16x16_read_srcc_agpr_valu_write
+# GCN:      V_MFMA
+# GCN-NEXT: V_ACCVGPR_WRITE_B32_e64
+name:            dmfma16x16_read_srcc_agpr_valu_write
+body:             |
+  bb.0:
+    $agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9 = V_MFMA_F64_16X16X4F64_e64 $vgpr10_vgpr11, $vgpr10_vgpr11, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
+    $agpr1 = V_ACCVGPR_WRITE_B32_e64 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: dgemm_accvgr_to_fma64
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 1
+# GCN-NEXT: V_FMA_F64_e64
+name:            dgemm_accvgr_to_fma64
+body:             |
+  bb.0:
+    $agpr0_agpr1 = V_MFMA_F64_4X4X4F64_e64 $vgpr0_vgpr1, $vgpr0_vgpr1, $agpr0_agpr1, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr4_vgpr5 = V_FMA_F64_e64 0, $vgpr4_vgpr5, 0, $vgpr4_vgpr5, 0, $vgpr4_vgpr5, 0, 0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: dgemm_accvgr_to_fmac64
+# GCN:      V_MFMA
+# GCN-NEXT: S_NOP 1
+# GCN-NEXT: V_FMAC_F64
+name:            dgemm_accvgr_to_fmac64
+body:             |
+  bb.0:
+    $agpr0_agpr1 = V_MFMA_F64_4X4X4F64_e64 $agpr0_agpr1, $agpr0_agpr1, $agpr0_agpr1, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr4_vgpr5 = V_FMAC_F64_e32 $vgpr4_vgpr5, $vgpr4_vgpr5, $vgpr4_vgpr5, implicit $mode, implicit $exec
+...

diff  --git a/llvm/test/CodeGen/AMDGPU/mai-hazards.mir b/llvm/test/CodeGen/AMDGPU/mai-hazards.mir
index 7ec57caf2640..694fd7657529 100644
--- a/llvm/test/CodeGen/AMDGPU/mai-hazards.mir
+++ b/llvm/test/CodeGen/AMDGPU/mai-hazards.mir
@@ -312,7 +312,7 @@ name:            accvgpr_read_write_vgpr_load
 body:             |
   bb.0:
     $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec
-    $vgpr3 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    $vgpr3 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
 ...
 ---
 
@@ -336,7 +336,7 @@ name:            accvgpr_read_write_vgpr_flat_load
 body:             |
   bb.0:
     $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec
-    $vgpr4 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    $vgpr4 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
 ...
 ---
 
@@ -348,7 +348,7 @@ name:            accvgpr_read_write_vgpr_buffer_store
 body:             |
   bb.0:
     $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec
-    BUFFER_STORE_DWORD_OFFSET $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    BUFFER_STORE_DWORD_OFFSET $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
 ...
 ---
 
@@ -373,7 +373,7 @@ body:             |
   bb.0:
     $vgpr0 = V_MOV_B32_e32 1, implicit $exec
     $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec
-    $vgpr4 = FLAT_LOAD_DWORD $vgpr2_vgpr3, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    $vgpr4 = FLAT_LOAD_DWORD $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
 ...
 ---
 
@@ -387,7 +387,7 @@ body:             |
   bb.0:
     $vgpr0 = V_MOV_B32_e32 1, implicit $exec
     $vgpr2 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec
-    $vgpr4 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    $vgpr4 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
 ...
 ---
 
@@ -401,7 +401,7 @@ body:             |
   bb.0:
     $vgpr0 = V_MOV_B32_e32 1, implicit $exec
     $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec
-    $vgpr4 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    $vgpr4 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
 ...
 ---
 
@@ -415,6 +415,6 @@ body:             |
   bb.0:
     $vgpr0 = V_MOV_B32_e32 1, implicit $exec
     $vgpr2 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec
-    $vgpr4 = FLAT_LOAD_DWORD $vgpr2_vgpr3, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    $vgpr4 = FLAT_LOAD_DWORD $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
 ...
 ---

diff  --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-atomic-insert-end.mir b/llvm/test/CodeGen/AMDGPU/memory-legalizer-atomic-insert-end.mir
index 93a03c4bd1f4..3a19ec60aacb 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-atomic-insert-end.mir
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-atomic-insert-end.mir
@@ -81,7 +81,7 @@ body:             |
     $sgpr7 = S_MOV_B32 61440
     $sgpr6 = S_MOV_B32 0
     S_WAITCNT 127
-    $vgpr1_vgpr2 = BUFFER_LOAD_DWORDX2_ADDR64 killed $vgpr1_vgpr2, $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 8 from %ir.tid.gep)
+    $vgpr1_vgpr2 = BUFFER_LOAD_DWORDX2_ADDR64 killed $vgpr1_vgpr2, $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 8 from %ir.tid.gep)
     $vgpr0 = V_XOR_B32_e32 1, killed $vgpr0, implicit $exec
     V_CMP_NE_U32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $exec
     $sgpr2_sgpr3 = S_AND_SAVEEXEC_B64 killed $vcc, implicit-def $exec, implicit-def $scc, implicit $exec

diff  --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll
index 85dfa08bc461..2a425ad9cfb3 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll
@@ -4,6 +4,8 @@
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-WGP %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-CU %s
 ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s
 
 define amdgpu_kernel void @singlethread_acquire_fence() {
 ; GFX6-LABEL: singlethread_acquire_fence:
@@ -25,6 +27,16 @@ define amdgpu_kernel void @singlethread_acquire_fence() {
 ; SKIP-CACHE-INV-LABEL: singlethread_acquire_fence:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: singlethread_acquire_fence:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: singlethread_acquire_fence:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
 entry:
   fence syncscope("singlethread") acquire
   ret void
@@ -50,6 +62,16 @@ define amdgpu_kernel void @singlethread_release_fence() {
 ; SKIP-CACHE-INV-LABEL: singlethread_release_fence:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: singlethread_release_fence:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: singlethread_release_fence:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
 entry:
   fence syncscope("singlethread") release
   ret void
@@ -75,6 +97,16 @@ define amdgpu_kernel void @singlethread_acq_rel_fence() {
 ; SKIP-CACHE-INV-LABEL: singlethread_acq_rel_fence:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: singlethread_acq_rel_fence:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: singlethread_acq_rel_fence:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
 entry:
   fence syncscope("singlethread") acq_rel
   ret void
@@ -100,6 +132,16 @@ define amdgpu_kernel void @singlethread_seq_cst_fence() {
 ; SKIP-CACHE-INV-LABEL: singlethread_seq_cst_fence:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: singlethread_seq_cst_fence:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: singlethread_seq_cst_fence:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
 entry:
   fence syncscope("singlethread") seq_cst
   ret void
@@ -125,6 +167,16 @@ define amdgpu_kernel void @singlethread_one_as_acquire_fence() {
 ; SKIP-CACHE-INV-LABEL: singlethread_one_as_acquire_fence:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: singlethread_one_as_acquire_fence:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: singlethread_one_as_acquire_fence:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
 entry:
   fence syncscope("singlethread-one-as") acquire
   ret void
@@ -150,6 +202,16 @@ define amdgpu_kernel void @singlethread_one_as_release_fence() {
 ; SKIP-CACHE-INV-LABEL: singlethread_one_as_release_fence:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: singlethread_one_as_release_fence:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: singlethread_one_as_release_fence:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
 entry:
   fence syncscope("singlethread-one-as") release
   ret void
@@ -175,6 +237,16 @@ define amdgpu_kernel void @singlethread_one_as_acq_rel_fence() {
 ; SKIP-CACHE-INV-LABEL: singlethread_one_as_acq_rel_fence:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: singlethread_one_as_acq_rel_fence:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: singlethread_one_as_acq_rel_fence:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
 entry:
   fence syncscope("singlethread-one-as") acq_rel
   ret void
@@ -200,6 +272,16 @@ define amdgpu_kernel void @singlethread_one_as_seq_cst_fence() {
 ; SKIP-CACHE-INV-LABEL: singlethread_one_as_seq_cst_fence:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: singlethread_one_as_seq_cst_fence:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: singlethread_one_as_seq_cst_fence:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
 entry:
   fence syncscope("singlethread-one-as") seq_cst
   ret void
@@ -225,6 +307,16 @@ define amdgpu_kernel void @wavefront_acquire_fence() {
 ; SKIP-CACHE-INV-LABEL: wavefront_acquire_fence:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: wavefront_acquire_fence:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: wavefront_acquire_fence:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
 entry:
   fence syncscope("wavefront") acquire
   ret void
@@ -250,6 +342,16 @@ define amdgpu_kernel void @wavefront_release_fence() {
 ; SKIP-CACHE-INV-LABEL: wavefront_release_fence:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: wavefront_release_fence:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: wavefront_release_fence:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
 entry:
   fence syncscope("wavefront") release
   ret void
@@ -275,6 +377,16 @@ define amdgpu_kernel void @wavefront_acq_rel_fence() {
 ; SKIP-CACHE-INV-LABEL: wavefront_acq_rel_fence:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: wavefront_acq_rel_fence:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: wavefront_acq_rel_fence:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
 entry:
   fence syncscope("wavefront") acq_rel
   ret void
@@ -300,6 +412,16 @@ define amdgpu_kernel void @wavefront_seq_cst_fence() {
 ; SKIP-CACHE-INV-LABEL: wavefront_seq_cst_fence:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: wavefront_seq_cst_fence:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: wavefront_seq_cst_fence:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
 entry:
   fence syncscope("wavefront") seq_cst
   ret void
@@ -325,6 +447,16 @@ define amdgpu_kernel void @wavefront_one_as_acquire_fence() {
 ; SKIP-CACHE-INV-LABEL: wavefront_one_as_acquire_fence:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: wavefront_one_as_acquire_fence:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: wavefront_one_as_acquire_fence:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
 entry:
   fence syncscope("wavefront-one-as") acquire
   ret void
@@ -350,6 +482,16 @@ define amdgpu_kernel void @wavefront_one_as_release_fence() {
 ; SKIP-CACHE-INV-LABEL: wavefront_one_as_release_fence:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: wavefront_one_as_release_fence:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: wavefront_one_as_release_fence:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
 entry:
   fence syncscope("wavefront-one-as") release
   ret void
@@ -375,6 +517,16 @@ define amdgpu_kernel void @wavefront_one_as_acq_rel_fence() {
 ; SKIP-CACHE-INV-LABEL: wavefront_one_as_acq_rel_fence:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: wavefront_one_as_acq_rel_fence:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: wavefront_one_as_acq_rel_fence:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
 entry:
   fence syncscope("wavefront-one-as") acq_rel
   ret void
@@ -400,6 +552,16 @@ define amdgpu_kernel void @wavefront_one_as_seq_cst_fence() {
 ; SKIP-CACHE-INV-LABEL: wavefront_one_as_seq_cst_fence:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: wavefront_one_as_seq_cst_fence:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: wavefront_one_as_seq_cst_fence:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
 entry:
   fence syncscope("wavefront-one-as") seq_cst
   ret void
@@ -432,6 +594,19 @@ define amdgpu_kernel void @workgroup_acquire_fence() {
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: workgroup_acquire_fence:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: workgroup_acquire_fence:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
 entry:
   fence syncscope("workgroup") acquire
   ret void
@@ -463,6 +638,18 @@ define amdgpu_kernel void @workgroup_release_fence() {
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: workgroup_release_fence:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: workgroup_release_fence:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
 entry:
   fence syncscope("workgroup") release
   ret void
@@ -495,6 +682,19 @@ define amdgpu_kernel void @workgroup_acq_rel_fence() {
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: workgroup_acq_rel_fence:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: workgroup_acq_rel_fence:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
 entry:
   fence syncscope("workgroup") acq_rel
   ret void
@@ -527,6 +727,19 @@ define amdgpu_kernel void @workgroup_seq_cst_fence() {
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: workgroup_seq_cst_fence:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: workgroup_seq_cst_fence:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
 entry:
   fence syncscope("workgroup") seq_cst
   ret void
@@ -555,6 +768,18 @@ define amdgpu_kernel void @workgroup_one_as_acquire_fence() {
 ; SKIP-CACHE-INV-LABEL: workgroup_one_as_acquire_fence:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: workgroup_one_as_acquire_fence:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: workgroup_one_as_acquire_fence:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
 entry:
   fence syncscope("workgroup-one-as") acquire
   ret void
@@ -582,6 +807,17 @@ define amdgpu_kernel void @workgroup_one_as_release_fence() {
 ; SKIP-CACHE-INV-LABEL: workgroup_one_as_release_fence:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: workgroup_one_as_release_fence:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: workgroup_one_as_release_fence:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
 entry:
   fence syncscope("workgroup-one-as") release
   ret void
@@ -610,6 +846,18 @@ define amdgpu_kernel void @workgroup_one_as_acq_rel_fence() {
 ; SKIP-CACHE-INV-LABEL: workgroup_one_as_acq_rel_fence:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: workgroup_one_as_acq_rel_fence:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: workgroup_one_as_acq_rel_fence:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
 entry:
   fence syncscope("workgroup-one-as") acq_rel
   ret void
@@ -638,6 +886,18 @@ define amdgpu_kernel void @workgroup_one_as_seq_cst_fence() {
 ; SKIP-CACHE-INV-LABEL: workgroup_one_as_seq_cst_fence:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: workgroup_one_as_seq_cst_fence:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: workgroup_one_as_seq_cst_fence:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
 entry:
   fence syncscope("workgroup-one-as") seq_cst
   ret void
@@ -676,6 +936,20 @@ define amdgpu_kernel void @agent_acquire_fence() {
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: agent_acquire_fence:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: agent_acquire_fence:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
 entry:
   fence syncscope("agent") acquire
   ret void
@@ -708,6 +982,18 @@ define amdgpu_kernel void @agent_release_fence() {
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: agent_release_fence:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: agent_release_fence:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
 entry:
   fence syncscope("agent") release
   ret void
@@ -746,6 +1032,20 @@ define amdgpu_kernel void @agent_acq_rel_fence() {
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: agent_acq_rel_fence:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: agent_acq_rel_fence:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
 entry:
   fence syncscope("agent") acq_rel
   ret void
@@ -784,6 +1084,20 @@ define amdgpu_kernel void @agent_seq_cst_fence() {
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: agent_seq_cst_fence:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: agent_seq_cst_fence:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
 entry:
   fence syncscope("agent") seq_cst
   ret void
@@ -822,6 +1136,20 @@ define amdgpu_kernel void @agent_one_as_acquire_fence() {
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: agent_one_as_acquire_fence:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: agent_one_as_acquire_fence:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
 entry:
   fence syncscope("agent-one-as") acquire
   ret void
@@ -854,6 +1182,18 @@ define amdgpu_kernel void @agent_one_as_release_fence() {
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: agent_one_as_release_fence:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: agent_one_as_release_fence:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
 entry:
   fence syncscope("agent-one-as") release
   ret void
@@ -892,6 +1232,20 @@ define amdgpu_kernel void @agent_one_as_acq_rel_fence() {
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: agent_one_as_acq_rel_fence:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: agent_one_as_acq_rel_fence:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
 entry:
   fence syncscope("agent-one-as") acq_rel
   ret void
@@ -930,6 +1284,20 @@ define amdgpu_kernel void @agent_one_as_seq_cst_fence() {
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: agent_one_as_seq_cst_fence:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: agent_one_as_seq_cst_fence:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
 entry:
   fence syncscope("agent-one-as") seq_cst
   ret void
@@ -968,6 +1336,26 @@ define amdgpu_kernel void @system_acquire_fence() {
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: system_acquire_fence:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: system_acquire_fence:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_invl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
 entry:
   fence acquire
   ret void
@@ -1000,6 +1388,20 @@ define amdgpu_kernel void @system_release_fence() {
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: system_release_fence:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: system_release_fence:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
 entry:
   fence release
   ret void
@@ -1038,6 +1440,26 @@ define amdgpu_kernel void @system_acq_rel_fence() {
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: system_acq_rel_fence:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: system_acq_rel_fence:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_invl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
 entry:
   fence acq_rel
   ret void
@@ -1076,6 +1498,26 @@ define amdgpu_kernel void @system_seq_cst_fence() {
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: system_seq_cst_fence:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: system_seq_cst_fence:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_invl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
 entry:
   fence seq_cst
   ret void
@@ -1114,6 +1556,26 @@ define amdgpu_kernel void @system_one_as_acquire_fence() {
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: system_one_as_acquire_fence:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: system_one_as_acquire_fence:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_invl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
 entry:
   fence syncscope("one-as") acquire
   ret void
@@ -1146,6 +1608,20 @@ define amdgpu_kernel void @system_one_as_release_fence() {
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: system_one_as_release_fence:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: system_one_as_release_fence:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
 entry:
   fence syncscope("one-as") release
   ret void
@@ -1184,6 +1660,26 @@ define amdgpu_kernel void @system_one_as_acq_rel_fence() {
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: system_one_as_acq_rel_fence:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: system_one_as_acq_rel_fence:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_invl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
 entry:
   fence syncscope("one-as") acq_rel
   ret void
@@ -1222,6 +1718,26 @@ define amdgpu_kernel void @system_one_as_seq_cst_fence() {
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: system_one_as_seq_cst_fence:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: system_one_as_seq_cst_fence:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_invl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
 entry:
   fence syncscope("one-as") seq_cst
   ret void

diff  --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll
index c43e579a94b1..5a1114c7e7e3 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll
@@ -3,6 +3,8 @@
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-WGP %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-CU %s
 ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s
 
 define amdgpu_kernel void @flat_agent_unordered_load(
 ; GFX7-LABEL: flat_agent_unordered_load:
@@ -56,6 +58,32 @@ define amdgpu_kernel void @flat_agent_unordered_load(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[2:3], v0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_agent_unordered_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v0, v[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, s3
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[2:3], v0
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_agent_unordered_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    flat_load_dword v0, v[0:1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, s3
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[2:3], v0
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %in, i32* %out) {
 entry:
   %val = load atomic i32, i32* %in syncscope("agent") unordered, align 4
@@ -115,6 +143,32 @@ define amdgpu_kernel void @flat_agent_monotonic_load(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[2:3], v0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v0, v[0:1] glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, s3
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[2:3], v0
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    flat_load_dword v0, v[0:1] glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, s3
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[2:3], v0
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %in, i32* %out) {
 entry:
   %val = load atomic i32, i32* %in syncscope("agent") monotonic, align 4
@@ -182,6 +236,35 @@ define amdgpu_kernel void @flat_agent_acquire_load(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[2:3], v0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v0, v[0:1] glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, s3
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[2:3], v0
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    flat_load_dword v0, v[0:1] glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, s3
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[2:3], v0
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %in, i32* %out) {
 entry:
   %val = load atomic i32, i32* %in syncscope("agent") acquire, align 4
@@ -255,6 +338,37 @@ define amdgpu_kernel void @flat_agent_seq_cst_load(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[2:3], v0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v0, v[0:1] glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, s3
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[2:3], v0
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_load_dword v0, v[0:1] glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, s3
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[2:3], v0
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %in, i32* %out) {
 entry:
   %val = load atomic i32, i32* %in syncscope("agent") seq_cst, align 4
@@ -308,6 +422,26 @@ define amdgpu_kernel void @flat_agent_unordered_store(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_agent_unordered_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_agent_unordered_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 %in, i32* %out) {
 entry:
   store atomic i32 %in, i32* %out syncscope("agent") unordered, align 4
@@ -360,6 +494,26 @@ define amdgpu_kernel void @flat_agent_monotonic_store(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 %in, i32* %out) {
 entry:
   store atomic i32 %in, i32* %out syncscope("agent") monotonic, align 4
@@ -418,6 +572,28 @@ define amdgpu_kernel void @flat_agent_release_store(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_agent_release_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_agent_release_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 %in, i32* %out) {
 entry:
   store atomic i32 %in, i32* %out syncscope("agent") release, align 4
@@ -476,6 +652,28 @@ define amdgpu_kernel void @flat_agent_seq_cst_store(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 %in, i32* %out) {
 entry:
   store atomic i32 %in, i32* %out syncscope("agent") seq_cst, align 4
@@ -528,6 +726,26 @@ define amdgpu_kernel void @flat_agent_monotonic_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent") monotonic
@@ -591,6 +809,30 @@ define amdgpu_kernel void @flat_agent_acquire_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent") acquire
@@ -649,6 +891,28 @@ define amdgpu_kernel void @flat_agent_release_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_agent_release_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_agent_release_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent") release
@@ -718,6 +982,32 @@ define amdgpu_kernel void @flat_agent_acq_rel_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acq_rel_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_agent_acq_rel_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent") acq_rel
@@ -787,6 +1077,32 @@ define amdgpu_kernel void @flat_agent_seq_cst_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent") seq_cst
@@ -855,6 +1171,33 @@ define amdgpu_kernel void @flat_agent_acquire_ret_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_ret_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_ret_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent") acquire
@@ -930,6 +1273,35 @@ define amdgpu_kernel void @flat_agent_acq_rel_ret_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acq_rel_ret_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_agent_acq_rel_ret_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent") acq_rel
@@ -1005,6 +1377,35 @@ define amdgpu_kernel void @flat_agent_seq_cst_ret_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_ret_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_ret_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent") seq_cst
@@ -1070,6 +1471,26 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -1146,6 +1567,30 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -1217,6 +1662,28 @@ define amdgpu_kernel void @flat_agent_release_monotonic_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_agent_release_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_agent_release_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -1299,6 +1766,32 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acq_rel_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_agent_acq_rel_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -1381,6 +1874,32 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -1457,6 +1976,30 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -1539,6 +2082,32 @@ define amdgpu_kernel void @flat_agent_release_acquire_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_agent_release_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_agent_release_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -1621,6 +2190,32 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acq_rel_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_agent_acq_rel_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -1703,6 +2298,32 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -1785,6 +2406,32 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -1874,6 +2521,33 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -1971,6 +2645,35 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -2068,6 +2771,35 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -2159,6 +2891,33 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -2256,6 +3015,35 @@ define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_agent_release_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_agent_release_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -2353,6 +3141,35 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -2450,6 +3267,35 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -2547,6 +3393,35 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -2608,6 +3483,32 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_load(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[2:3], v0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_unordered_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v0, v[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, s3
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[2:3], v0
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_unordered_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    flat_load_dword v0, v[0:1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, s3
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[2:3], v0
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %in, i32* %out) {
 entry:
   %val = load atomic i32, i32* %in syncscope("agent-one-as") unordered, align 4
@@ -2667,6 +3568,32 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_load(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[2:3], v0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v0, v[0:1] glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, s3
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[2:3], v0
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    flat_load_dword v0, v[0:1] glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, s3
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[2:3], v0
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %in, i32* %out) {
 entry:
   %val = load atomic i32, i32* %in syncscope("agent-one-as") monotonic, align 4
@@ -2735,6 +3662,35 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_load(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[2:3], v0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v0, v[0:1] glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, s3
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[2:3], v0
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    flat_load_dword v0, v[0:1] glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, s3
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[2:3], v0
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %in, i32* %out) {
 entry:
   %val = load atomic i32, i32* %in syncscope("agent-one-as") acquire, align 4
@@ -2809,6 +3765,37 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_load(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[2:3], v0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v0, v[0:1] glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, s3
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[2:3], v0
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_load_dword v0, v[0:1] glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, s3
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[2:3], v0
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %in, i32* %out) {
 entry:
   %val = load atomic i32, i32* %in syncscope("agent-one-as") seq_cst, align 4
@@ -2862,6 +3849,26 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_store(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_unordered_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_unordered_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 %in, i32* %out) {
 entry:
   store atomic i32 %in, i32* %out syncscope("agent-one-as") unordered, align 4
@@ -2914,6 +3921,26 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_store(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 %in, i32* %out) {
 entry:
   store atomic i32 %in, i32* %out syncscope("agent-one-as") monotonic, align 4
@@ -2972,6 +3999,28 @@ define amdgpu_kernel void @flat_agent_one_as_release_store(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 %in, i32* %out) {
 entry:
   store atomic i32 %in, i32* %out syncscope("agent-one-as") release, align 4
@@ -3030,6 +4079,28 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_store(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 %in, i32* %out) {
 entry:
   store atomic i32 %in, i32* %out syncscope("agent-one-as") seq_cst, align 4
@@ -3082,6 +4153,26 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent-one-as") monotonic
@@ -3143,6 +4234,30 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent-one-as") acquire
@@ -3201,6 +4316,28 @@ define amdgpu_kernel void @flat_agent_one_as_release_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent-one-as") release
@@ -3268,6 +4405,32 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent-one-as") acq_rel
@@ -3335,6 +4498,32 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent-one-as") seq_cst
@@ -3403,6 +4592,33 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_ret_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_ret_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_ret_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent-one-as") acquire
@@ -3478,6 +4694,35 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_ret_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent-one-as") acq_rel
@@ -3553,6 +4798,35 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_ret_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent-one-as") seq_cst
@@ -3618,6 +4892,26 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -3692,6 +4986,30 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -3763,6 +5081,28 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -3843,6 +5183,32 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -3923,6 +5289,32 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -3997,6 +5389,30 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -4077,6 +5493,32 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -4157,6 +5599,32 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -4237,6 +5705,32 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -4317,6 +5811,32 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -4407,6 +5927,33 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -4505,6 +6052,35 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -4603,6 +6179,35 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -4695,6 +6300,33 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -4793,6 +6425,35 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -4891,6 +6552,35 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -4989,6 +6679,35 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -5087,6 +6806,35 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4

diff  --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll
index 162b5a105614..e44810eb8983 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll
@@ -3,6 +3,8 @@
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-WGP %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-CU %s
 ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s
 
 define amdgpu_kernel void @flat_nontemporal_load_0(
 ; GFX7-LABEL: flat_nontemporal_load_0:
@@ -56,6 +58,34 @@ define amdgpu_kernel void @flat_nontemporal_load_0(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[2:3], v0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_nontemporal_load_0:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v0, v[0:1] glc slc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, s3
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[2:3], v0
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_nontemporal_load_0:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    flat_load_dword v0, v[0:1] glc slc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, s3
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[2:3], v0
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32* %in, i32* %out) {
 entry:
   %val = load i32, i32* %in, align 4, !nontemporal !0
@@ -121,6 +151,38 @@ define amdgpu_kernel void @flat_nontemporal_load_1(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_nontemporal_load_1:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v2
+; GFX90A-NOTTGSPLIT-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[2:3] glc slc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_nontemporal_load_1:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, s1
+; GFX90A-TGSPLIT-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v2
+; GFX90A-TGSPLIT-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX90A-TGSPLIT-NEXT:    flat_load_dword v2, v[2:3] glc slc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32* %in, i32* %out) {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -182,6 +244,34 @@ define amdgpu_kernel void @flat_nontemporal_store_0(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[2:3], v0 glc slc
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_nontemporal_store_0:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v0, v[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, s3
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[2:3], v0 glc slc
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_nontemporal_store_0:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    flat_load_dword v0, v[0:1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, s3
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[2:3], v0 glc slc
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32* %in, i32* %out) {
 entry:
   %val = load i32, i32* %in, align 4
@@ -247,6 +337,38 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2 glc slc
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_nontemporal_store_1:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, s1
+; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[2:3]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX90A-NOTTGSPLIT-NEXT:    v_add_co_u32_e32 v0, vcc, s2, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 glc slc
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_nontemporal_store_1:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, s1
+; GFX90A-TGSPLIT-NEXT:    flat_load_dword v2, v[2:3]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX90A-TGSPLIT-NEXT:    v_add_co_u32_e32 v0, vcc, s2, v0
+; GFX90A-TGSPLIT-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 glc slc
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32* %in, i32* %out) {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()

diff  --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll
index 8e6758739af6..1095ea34230d 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll
@@ -3,6 +3,8 @@
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-WGP %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-CU %s
 ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s
 
 define amdgpu_kernel void @flat_singlethread_unordered_load(
 ; GFX7-LABEL: flat_singlethread_unordered_load:
@@ -56,6 +58,34 @@ define amdgpu_kernel void @flat_singlethread_unordered_load(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[2:3], v0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_unordered_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v0, v[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, s3
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[2:3], v0
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_singlethread_unordered_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    flat_load_dword v0, v[0:1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, s3
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[2:3], v0
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32* %in, i32* %out) {
 entry:
   %val = load atomic i32, i32* %in syncscope("singlethread") unordered, align 4
@@ -115,6 +145,34 @@ define amdgpu_kernel void @flat_singlethread_monotonic_load(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[2:3], v0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v0, v[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, s3
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[2:3], v0
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    flat_load_dword v0, v[0:1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, s3
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[2:3], v0
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32* %in, i32* %out) {
 entry:
   %val = load atomic i32, i32* %in syncscope("singlethread") monotonic, align 4
@@ -174,6 +232,34 @@ define amdgpu_kernel void @flat_singlethread_acquire_load(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[2:3], v0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v0, v[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, s3
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[2:3], v0
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    flat_load_dword v0, v[0:1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, s3
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[2:3], v0
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32* %in, i32* %out) {
 entry:
   %val = load atomic i32, i32* %in syncscope("singlethread") acquire, align 4
@@ -233,6 +319,34 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_load(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[2:3], v0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v0, v[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, s3
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[2:3], v0
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    flat_load_dword v0, v[0:1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, s3
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[2:3], v0
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32* %in, i32* %out) {
 entry:
   %val = load atomic i32, i32* %in syncscope("singlethread") seq_cst, align 4
@@ -286,6 +400,28 @@ define amdgpu_kernel void @flat_singlethread_unordered_store(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_unordered_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_singlethread_unordered_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 %in, i32* %out) {
 entry:
   store atomic i32 %in, i32* %out syncscope("singlethread") unordered, align 4
@@ -338,6 +474,28 @@ define amdgpu_kernel void @flat_singlethread_monotonic_store(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 %in, i32* %out) {
 entry:
   store atomic i32 %in, i32* %out syncscope("singlethread") monotonic, align 4
@@ -390,6 +548,28 @@ define amdgpu_kernel void @flat_singlethread_release_store(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 %in, i32* %out) {
 entry:
   store atomic i32 %in, i32* %out syncscope("singlethread") release, align 4
@@ -442,6 +622,28 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_store(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 %in, i32* %out) {
 entry:
   store atomic i32 %in, i32* %out syncscope("singlethread") seq_cst, align 4
@@ -494,6 +696,28 @@ define amdgpu_kernel void @flat_singlethread_monotonic_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread") monotonic
@@ -546,6 +770,28 @@ define amdgpu_kernel void @flat_singlethread_acquire_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread") acquire
@@ -598,6 +844,28 @@ define amdgpu_kernel void @flat_singlethread_release_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread") release
@@ -650,6 +918,28 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread") acq_rel
@@ -702,6 +992,28 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread") seq_cst
@@ -762,6 +1074,32 @@ define amdgpu_kernel void @flat_singlethread_acquire_ret_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_ret_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_ret_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread") acquire
@@ -823,6 +1161,32 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_ret_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_ret_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_ret_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread") acq_rel
@@ -884,6 +1248,32 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_ret_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_ret_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_ret_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread") seq_cst
@@ -949,6 +1339,28 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -1014,6 +1426,28 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -1079,6 +1513,28 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -1144,6 +1600,28 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -1209,6 +1687,28 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -1274,6 +1774,28 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -1339,6 +1861,28 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -1404,6 +1948,28 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -1469,6 +2035,28 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -1534,6 +2122,28 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -1615,6 +2225,32 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -1698,6 +2334,32 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -1781,6 +2443,32 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -1864,6 +2552,32 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -1947,6 +2661,32 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -2030,6 +2770,32 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -2113,6 +2879,32 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -2196,6 +2988,32 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -2257,6 +3075,34 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_load(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[2:3], v0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_unordered_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v0, v[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, s3
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[2:3], v0
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_unordered_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    flat_load_dword v0, v[0:1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, s3
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[2:3], v0
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32* %in, i32* %out) {
 entry:
   %val = load atomic i32, i32* %in syncscope("singlethread-one-as") unordered, align 4
@@ -2316,6 +3162,34 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_load(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[2:3], v0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v0, v[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, s3
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[2:3], v0
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    flat_load_dword v0, v[0:1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, s3
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[2:3], v0
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32* %in, i32* %out) {
 entry:
   %val = load atomic i32, i32* %in syncscope("singlethread-one-as") monotonic, align 4
@@ -2375,6 +3249,34 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_load(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[2:3], v0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v0, v[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, s3
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[2:3], v0
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    flat_load_dword v0, v[0:1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, s3
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[2:3], v0
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32* %in, i32* %out) {
 entry:
   %val = load atomic i32, i32* %in syncscope("singlethread-one-as") acquire, align 4
@@ -2434,6 +3336,34 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_load(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[2:3], v0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v0, v[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, s3
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[2:3], v0
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    flat_load_dword v0, v[0:1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, s3
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[2:3], v0
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32* %in, i32* %out) {
 entry:
   %val = load atomic i32, i32* %in syncscope("singlethread-one-as") seq_cst, align 4
@@ -2487,6 +3417,28 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_store(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_unordered_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_unordered_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 %in, i32* %out) {
 entry:
   store atomic i32 %in, i32* %out syncscope("singlethread-one-as") unordered, align 4
@@ -2539,6 +3491,28 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_store(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 %in, i32* %out) {
 entry:
   store atomic i32 %in, i32* %out syncscope("singlethread-one-as") monotonic, align 4
@@ -2591,6 +3565,28 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_store(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 %in, i32* %out) {
 entry:
   store atomic i32 %in, i32* %out syncscope("singlethread-one-as") release, align 4
@@ -2643,6 +3639,28 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_store(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 %in, i32* %out) {
 entry:
   store atomic i32 %in, i32* %out syncscope("singlethread-one-as") seq_cst, align 4
@@ -2695,6 +3713,28 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread-one-as") monotonic
@@ -2747,6 +3787,28 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread-one-as") acquire
@@ -2799,6 +3861,28 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread-one-as") release
@@ -2851,6 +3935,28 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread-one-as") acq_rel
@@ -2903,6 +4009,28 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread-one-as") seq_cst
@@ -2963,6 +4091,32 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_ret_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread-one-as") acquire
@@ -3024,6 +4178,32 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_ret_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread-one-as") acq_rel
@@ -3085,6 +4265,32 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_ret_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread-one-as") seq_cst
@@ -3150,6 +4356,28 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -3215,6 +4443,28 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -3280,6 +4530,28 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -3345,6 +4617,28 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -3410,6 +4704,28 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -3475,6 +4791,28 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -3540,6 +4878,28 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -3605,6 +4965,28 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -3670,6 +5052,28 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -3735,6 +5139,28 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -3816,6 +5242,32 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxch
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -3899,6 +5351,32 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxch
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -3982,6 +5460,32 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxch
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -4065,6 +5569,32 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -4148,6 +5678,32 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -4231,6 +5787,32 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -4314,6 +5896,32 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -4397,6 +6005,32 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4

diff  --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll
index e80b2c9a4ced..517c345b56bb 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll
@@ -3,6 +3,8 @@
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-WGP %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-CU %s
 ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s
 
 define amdgpu_kernel void @flat_system_unordered_load(
 ; GFX7-LABEL: flat_system_unordered_load:
@@ -56,6 +58,32 @@ define amdgpu_kernel void @flat_system_unordered_load(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[2:3], v0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_system_unordered_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v0, v[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, s3
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[2:3], v0
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_system_unordered_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    flat_load_dword v0, v[0:1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, s3
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[2:3], v0
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %in, i32* %out) {
 entry:
   %val = load atomic i32, i32* %in unordered, align 4
@@ -115,6 +143,32 @@ define amdgpu_kernel void @flat_system_monotonic_load(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[2:3], v0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v0, v[0:1] glc scc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, s3
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[2:3], v0
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    flat_load_dword v0, v[0:1] glc scc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, s3
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[2:3], v0
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %in, i32* %out) {
 entry:
   %val = load atomic i32, i32* %in monotonic, align 4
@@ -182,6 +236,38 @@ define amdgpu_kernel void @flat_system_acquire_load(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[2:3], v0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v0, v[0:1] glc scc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, s3
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[2:3], v0
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_system_acquire_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    flat_load_dword v0, v[0:1] glc scc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_invl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, s3
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[2:3], v0
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %in, i32* %out) {
 entry:
   %val = load atomic i32, i32* %in acquire, align 4
@@ -255,6 +341,40 @@ define amdgpu_kernel void @flat_system_seq_cst_load(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[2:3], v0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v0, v[0:1] glc scc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, s3
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[2:3], v0
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_load_dword v0, v[0:1] glc scc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_invl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, s3
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[2:3], v0
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %in, i32* %out) {
 entry:
   %val = load atomic i32, i32* %in seq_cst, align 4
@@ -308,6 +428,26 @@ define amdgpu_kernel void @flat_system_unordered_store(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_system_unordered_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_system_unordered_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 %in, i32* %out) {
 entry:
   store atomic i32 %in, i32* %out unordered, align 4
@@ -360,6 +500,26 @@ define amdgpu_kernel void @flat_system_monotonic_store(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 scc
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 scc
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 %in, i32* %out) {
 entry:
   store atomic i32 %in, i32* %out monotonic, align 4
@@ -418,6 +578,30 @@ define amdgpu_kernel void @flat_system_release_store(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 scc
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_system_release_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 scc
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 %in, i32* %out) {
 entry:
   store atomic i32 %in, i32* %out release, align 4
@@ -476,6 +660,30 @@ define amdgpu_kernel void @flat_system_seq_cst_store(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 scc
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 scc
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 %in, i32* %out) {
 entry:
   store atomic i32 %in, i32* %out seq_cst, align 4
@@ -528,6 +736,26 @@ define amdgpu_kernel void @flat_system_monotonic_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2 scc
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2 scc
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32* %out, i32 %in monotonic
@@ -591,6 +819,34 @@ define amdgpu_kernel void @flat_system_acquire_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2 scc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_system_acquire_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2 scc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_invl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32* %out, i32 %in acquire
@@ -649,6 +905,30 @@ define amdgpu_kernel void @flat_system_release_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2 scc
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_system_release_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2 scc
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32* %out, i32 %in release
@@ -718,6 +998,38 @@ define amdgpu_kernel void @flat_system_acq_rel_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2 scc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2 scc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_invl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32* %out, i32 %in acq_rel
@@ -787,6 +1099,38 @@ define amdgpu_kernel void @flat_system_seq_cst_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2 scc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2 scc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_invl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32* %out, i32 %in seq_cst
@@ -855,6 +1199,36 @@ define amdgpu_kernel void @flat_system_acquire_ret_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_ret_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc scc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_system_acquire_ret_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc scc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_invl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32* %out, i32 %in acquire
@@ -930,6 +1304,40 @@ define amdgpu_kernel void @flat_system_acq_rel_ret_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_ret_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc scc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_ret_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc scc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_invl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32* %out, i32 %in acq_rel
@@ -1005,6 +1413,40 @@ define amdgpu_kernel void @flat_system_seq_cst_ret_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_ret_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc scc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_ret_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc scc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_invl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32* %out, i32 %in seq_cst
@@ -1070,6 +1512,26 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 scc
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 scc
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -1146,6 +1608,34 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 scc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_system_acquire_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 scc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_invl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -1217,6 +1707,30 @@ define amdgpu_kernel void @flat_system_release_monotonic_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 scc
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_system_release_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 scc
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -1299,6 +1813,38 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 scc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 scc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_invl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -1381,6 +1927,38 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 scc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 scc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_invl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -1457,6 +2035,34 @@ define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 scc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_system_acquire_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 scc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_invl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -1539,6 +2145,38 @@ define amdgpu_kernel void @flat_system_release_acquire_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 scc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_system_release_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 scc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_invl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -1621,6 +2259,38 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 scc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 scc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_invl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -1703,6 +2373,38 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 scc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 scc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_invl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -1785,6 +2487,38 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_seq_cst_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 scc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_seq_cst_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 scc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_invl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -1874,6 +2608,36 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc scc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_system_acquire_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc scc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_invl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -1971,6 +2735,40 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc scc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc scc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_invl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -2068,6 +2866,40 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc scc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc scc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_invl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -2159,6 +2991,36 @@ define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc scc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_system_acquire_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc scc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_invl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -2256,6 +3118,40 @@ define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc scc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_system_release_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc scc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_invl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -2353,6 +3249,40 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc scc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc scc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_invl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -2450,6 +3380,40 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc scc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc scc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_invl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -2547,6 +3511,40 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc scc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc scc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_invl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -2608,6 +3606,32 @@ define amdgpu_kernel void @flat_system_one_as_unordered_load(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[2:3], v0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_unordered_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v0, v[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, s3
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[2:3], v0
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_system_one_as_unordered_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    flat_load_dword v0, v[0:1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, s3
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[2:3], v0
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %in, i32* %out) {
 entry:
   %val = load atomic i32, i32* %in syncscope("one-as") unordered, align 4
@@ -2667,6 +3691,32 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_load(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[2:3], v0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v0, v[0:1] glc scc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, s3
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[2:3], v0
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    flat_load_dword v0, v[0:1] glc scc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, s3
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[2:3], v0
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %in, i32* %out) {
 entry:
   %val = load atomic i32, i32* %in syncscope("one-as") monotonic, align 4
@@ -2735,6 +3785,39 @@ define amdgpu_kernel void @flat_system_one_as_acquire_load(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[2:3], v0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v0, v[0:1] glc scc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, s3
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[2:3], v0
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    flat_load_dword v0, v[0:1] glc scc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_invl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, s3
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[2:3], v0
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %in, i32* %out) {
 entry:
   %val = load atomic i32, i32* %in syncscope("one-as") acquire, align 4
@@ -2809,6 +3892,41 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_load(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[2:3], v0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v0, v[0:1] glc scc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, s3
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[2:3], v0
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_load_dword v0, v[0:1] glc scc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_invl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, s3
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[2:3], v0
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %in, i32* %out) {
 entry:
   %val = load atomic i32, i32* %in syncscope("one-as") seq_cst, align 4
@@ -2862,6 +3980,26 @@ define amdgpu_kernel void @flat_system_one_as_unordered_store(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_unordered_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_system_one_as_unordered_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 %in, i32* %out) {
 entry:
   store atomic i32 %in, i32* %out syncscope("one-as") unordered, align 4
@@ -2914,6 +4052,26 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_store(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 scc
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 scc
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 %in, i32* %out) {
 entry:
   store atomic i32 %in, i32* %out syncscope("one-as") monotonic, align 4
@@ -2972,6 +4130,30 @@ define amdgpu_kernel void @flat_system_one_as_release_store(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 scc
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 scc
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 %in, i32* %out) {
 entry:
   store atomic i32 %in, i32* %out syncscope("one-as") release, align 4
@@ -3030,6 +4212,30 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_store(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 scc
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 scc
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 %in, i32* %out) {
 entry:
   store atomic i32 %in, i32* %out syncscope("one-as") seq_cst, align 4
@@ -3082,6 +4288,26 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2 scc
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2 scc
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("one-as") monotonic
@@ -3143,6 +4369,34 @@ define amdgpu_kernel void @flat_system_one_as_acquire_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2 scc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2 scc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_invl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("one-as") acquire
@@ -3201,6 +4455,30 @@ define amdgpu_kernel void @flat_system_one_as_release_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2 scc
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2 scc
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("one-as") release
@@ -3268,6 +4546,38 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2 scc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2 scc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_invl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("one-as") acq_rel
@@ -3335,6 +4645,38 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2 scc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2 scc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_invl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("one-as") seq_cst
@@ -3403,6 +4745,37 @@ define amdgpu_kernel void @flat_system_one_as_acquire_ret_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_ret_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc scc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_ret_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc scc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_invl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("one-as") acquire
@@ -3478,6 +4851,41 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_ret_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc scc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc scc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_invl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("one-as") acq_rel
@@ -3553,6 +4961,41 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_ret_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc scc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc scc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_invl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("one-as") seq_cst
@@ -3618,6 +5061,26 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 scc
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 scc
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -3692,6 +5155,34 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 scc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 scc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_invl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -3763,6 +5254,30 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 scc
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 scc
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -3843,6 +5358,38 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 scc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 scc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_invl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -3923,6 +5470,38 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 scc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 scc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_invl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -3997,6 +5576,34 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 scc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 scc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_invl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -4077,6 +5684,38 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 scc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 scc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_invl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -4157,6 +5796,38 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 scc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 scc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_invl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -4237,6 +5908,38 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 scc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 scc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_invl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -4317,6 +6020,38 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 scc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 scc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_invl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -4407,6 +6142,37 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc scc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc scc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_invl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -4505,6 +6271,41 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc scc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc scc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_invl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -4603,6 +6404,41 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc scc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc scc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_invl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -4695,6 +6531,37 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc scc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc scc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_invl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -4793,6 +6660,41 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc scc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc scc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_invl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -4891,6 +6793,41 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc scc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc scc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_invl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -4989,6 +6926,41 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc scc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc scc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_invl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -5087,6 +7059,41 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc scc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc scc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_invl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4

diff  --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll
index fc43c931043e..1dbc84808389 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll
@@ -3,6 +3,8 @@
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-WGP %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-CU %s
 ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s
 
 define amdgpu_kernel void @flat_wavefront_unordered_load(
 ; GFX7-LABEL: flat_wavefront_unordered_load:
@@ -56,6 +58,34 @@ define amdgpu_kernel void @flat_wavefront_unordered_load(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[2:3], v0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_unordered_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v0, v[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, s3
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[2:3], v0
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_wavefront_unordered_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    flat_load_dword v0, v[0:1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, s3
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[2:3], v0
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32* %in, i32* %out) {
 entry:
   %val = load atomic i32, i32* %in syncscope("wavefront") unordered, align 4
@@ -115,6 +145,34 @@ define amdgpu_kernel void @flat_wavefront_monotonic_load(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[2:3], v0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v0, v[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, s3
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[2:3], v0
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    flat_load_dword v0, v[0:1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, s3
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[2:3], v0
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32* %in, i32* %out) {
 entry:
   %val = load atomic i32, i32* %in syncscope("wavefront") monotonic, align 4
@@ -174,6 +232,34 @@ define amdgpu_kernel void @flat_wavefront_acquire_load(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[2:3], v0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v0, v[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, s3
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[2:3], v0
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    flat_load_dword v0, v[0:1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, s3
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[2:3], v0
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32* %in, i32* %out) {
 entry:
   %val = load atomic i32, i32* %in syncscope("wavefront") acquire, align 4
@@ -233,6 +319,34 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_load(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[2:3], v0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v0, v[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, s3
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[2:3], v0
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    flat_load_dword v0, v[0:1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, s3
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[2:3], v0
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32* %in, i32* %out) {
 entry:
   %val = load atomic i32, i32* %in syncscope("wavefront") seq_cst, align 4
@@ -286,6 +400,28 @@ define amdgpu_kernel void @flat_wavefront_unordered_store(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_unordered_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_wavefront_unordered_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 %in, i32* %out) {
 entry:
   store atomic i32 %in, i32* %out syncscope("wavefront") unordered, align 4
@@ -338,6 +474,28 @@ define amdgpu_kernel void @flat_wavefront_monotonic_store(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 %in, i32* %out) {
 entry:
   store atomic i32 %in, i32* %out syncscope("wavefront") monotonic, align 4
@@ -390,6 +548,28 @@ define amdgpu_kernel void @flat_wavefront_release_store(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 %in, i32* %out) {
 entry:
   store atomic i32 %in, i32* %out syncscope("wavefront") release, align 4
@@ -442,6 +622,28 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_store(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 %in, i32* %out) {
 entry:
   store atomic i32 %in, i32* %out syncscope("wavefront") seq_cst, align 4
@@ -494,6 +696,28 @@ define amdgpu_kernel void @flat_wavefront_monotonic_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront") monotonic
@@ -546,6 +770,28 @@ define amdgpu_kernel void @flat_wavefront_acquire_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront") acquire
@@ -598,6 +844,28 @@ define amdgpu_kernel void @flat_wavefront_release_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront") release
@@ -650,6 +918,28 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront") acq_rel
@@ -702,6 +992,28 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront") seq_cst
@@ -762,6 +1074,32 @@ define amdgpu_kernel void @flat_wavefront_acquire_ret_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_ret_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_ret_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront") acquire
@@ -823,6 +1161,32 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_ret_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_ret_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_ret_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront") acq_rel
@@ -884,6 +1248,32 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_ret_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_ret_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_ret_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront") seq_cst
@@ -949,6 +1339,28 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -1014,6 +1426,28 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -1079,6 +1513,28 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -1144,6 +1600,28 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -1209,6 +1687,28 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -1274,6 +1774,28 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -1339,6 +1861,28 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -1404,6 +1948,28 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -1469,6 +2035,28 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -1534,6 +2122,28 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -1615,6 +2225,32 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -1698,6 +2334,32 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -1781,6 +2443,32 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -1864,6 +2552,32 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -1947,6 +2661,32 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -2030,6 +2770,32 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -2113,6 +2879,32 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -2196,6 +2988,32 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -2257,6 +3075,34 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_load(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[2:3], v0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_unordered_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v0, v[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, s3
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[2:3], v0
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_unordered_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    flat_load_dword v0, v[0:1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, s3
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[2:3], v0
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32* %in, i32* %out) {
 entry:
   %val = load atomic i32, i32* %in syncscope("wavefront-one-as") unordered, align 4
@@ -2316,6 +3162,34 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_load(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[2:3], v0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v0, v[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, s3
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[2:3], v0
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    flat_load_dword v0, v[0:1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, s3
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[2:3], v0
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32* %in, i32* %out) {
 entry:
   %val = load atomic i32, i32* %in syncscope("wavefront-one-as") monotonic, align 4
@@ -2375,6 +3249,34 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_load(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[2:3], v0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v0, v[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, s3
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[2:3], v0
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    flat_load_dword v0, v[0:1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, s3
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[2:3], v0
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32* %in, i32* %out) {
 entry:
   %val = load atomic i32, i32* %in syncscope("wavefront-one-as") acquire, align 4
@@ -2434,6 +3336,34 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_load(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[2:3], v0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v0, v[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, s3
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[2:3], v0
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    flat_load_dword v0, v[0:1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, s3
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[2:3], v0
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32* %in, i32* %out) {
 entry:
   %val = load atomic i32, i32* %in syncscope("wavefront-one-as") seq_cst, align 4
@@ -2487,6 +3417,28 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_store(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_unordered_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_unordered_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 %in, i32* %out) {
 entry:
   store atomic i32 %in, i32* %out syncscope("wavefront-one-as") unordered, align 4
@@ -2539,6 +3491,28 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_store(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 %in, i32* %out) {
 entry:
   store atomic i32 %in, i32* %out syncscope("wavefront-one-as") monotonic, align 4
@@ -2591,6 +3565,28 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_store(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 %in, i32* %out) {
 entry:
   store atomic i32 %in, i32* %out syncscope("wavefront-one-as") release, align 4
@@ -2643,6 +3639,28 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_store(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 %in, i32* %out) {
 entry:
   store atomic i32 %in, i32* %out syncscope("wavefront-one-as") seq_cst, align 4
@@ -2695,6 +3713,28 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront-one-as") monotonic
@@ -2747,6 +3787,28 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront-one-as") acquire
@@ -2799,6 +3861,28 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront-one-as") release
@@ -2851,6 +3935,28 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront-one-as") acq_rel
@@ -2903,6 +4009,28 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront-one-as") seq_cst
@@ -2963,6 +4091,32 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_ret_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront-one-as") acquire
@@ -3024,6 +4178,32 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_ret_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront-one-as") acq_rel
@@ -3085,6 +4265,32 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_ret_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront-one-as") seq_cst
@@ -3150,6 +4356,28 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -3215,6 +4443,28 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -3280,6 +4530,28 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -3345,6 +4617,28 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -3410,6 +4704,28 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -3475,6 +4791,28 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -3540,6 +4878,28 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -3605,6 +4965,28 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -3670,6 +5052,28 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -3735,6 +5139,28 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -3816,6 +5242,32 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -3899,6 +5351,32 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -3982,6 +5460,32 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -4065,6 +5569,32 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -4148,6 +5678,32 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -4231,6 +5787,32 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -4314,6 +5896,32 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -4397,6 +6005,32 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4

diff  --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll
index 942f574c9862..f29b5abaf099 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll
@@ -3,6 +3,8 @@
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-WGP %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-CU %s
 ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s
 
 define amdgpu_kernel void @flat_workgroup_unordered_load(
 ; GFX7-LABEL: flat_workgroup_unordered_load:
@@ -56,6 +58,32 @@ define amdgpu_kernel void @flat_workgroup_unordered_load(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[2:3], v0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_unordered_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v0, v[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, s3
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[2:3], v0
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_workgroup_unordered_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    flat_load_dword v0, v[0:1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, s3
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[2:3], v0
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %in, i32* %out) {
 entry:
   %val = load atomic i32, i32* %in syncscope("workgroup") unordered, align 4
@@ -115,6 +143,32 @@ define amdgpu_kernel void @flat_workgroup_monotonic_load(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[2:3], v0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v0, v[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, s3
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[2:3], v0
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    flat_load_dword v0, v[0:1] glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, s3
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[2:3], v0
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %in, i32* %out) {
 entry:
   %val = load atomic i32, i32* %in syncscope("workgroup") monotonic, align 4
@@ -179,6 +233,34 @@ define amdgpu_kernel void @flat_workgroup_acquire_load(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[2:3], v0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v0, v[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, s3
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[2:3], v0
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    flat_load_dword v0, v[0:1] glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, s3
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[2:3], v0
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %in, i32* %out) {
 entry:
   %val = load atomic i32, i32* %in syncscope("workgroup") acquire, align 4
@@ -248,6 +330,36 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_load(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[2:3], v0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v0, v[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, s3
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[2:3], v0
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_load_dword v0, v[0:1] glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, s3
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[2:3], v0
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %in, i32* %out) {
 entry:
   %val = load atomic i32, i32* %in syncscope("workgroup") seq_cst, align 4
@@ -301,6 +413,26 @@ define amdgpu_kernel void @flat_workgroup_unordered_store(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_unordered_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_workgroup_unordered_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 %in, i32* %out) {
 entry:
   store atomic i32 %in, i32* %out syncscope("workgroup") unordered, align 4
@@ -353,6 +485,26 @@ define amdgpu_kernel void @flat_workgroup_monotonic_store(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 %in, i32* %out) {
 entry:
   store atomic i32 %in, i32* %out syncscope("workgroup") monotonic, align 4
@@ -410,6 +562,28 @@ define amdgpu_kernel void @flat_workgroup_release_store(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 %in, i32* %out) {
 entry:
   store atomic i32 %in, i32* %out syncscope("workgroup") release, align 4
@@ -467,6 +641,28 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_store(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 %in, i32* %out) {
 entry:
   store atomic i32 %in, i32* %out syncscope("workgroup") seq_cst, align 4
@@ -519,6 +715,26 @@ define amdgpu_kernel void @flat_workgroup_monotonic_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup") monotonic
@@ -577,6 +793,29 @@ define amdgpu_kernel void @flat_workgroup_acquire_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup") acquire
@@ -634,6 +873,28 @@ define amdgpu_kernel void @flat_workgroup_release_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup") release
@@ -697,6 +958,31 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup") acq_rel
@@ -760,6 +1046,31 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup") seq_cst
@@ -822,6 +1133,31 @@ define amdgpu_kernel void @flat_workgroup_acquire_ret_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_ret_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_ret_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup") acquire
@@ -890,6 +1226,33 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_ret_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_ret_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup") acq_rel
@@ -958,6 +1321,33 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_ret_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_ret_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_ret_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup") seq_cst
@@ -1023,6 +1413,26 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -1094,6 +1504,29 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -1164,6 +1597,28 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -1240,6 +1695,31 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -1316,6 +1796,31 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -1387,6 +1892,29 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -1463,6 +1991,31 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -1539,6 +2092,31 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -1615,6 +2193,31 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -1691,6 +2294,31 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -1777,6 +2405,31 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -1870,6 +2523,33 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -1963,6 +2643,33 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -2051,6 +2758,31 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -2144,6 +2876,33 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -2237,6 +2996,33 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -2330,6 +3116,33 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -2423,6 +3236,33 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -2484,6 +3324,32 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_load(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[2:3], v0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_unordered_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v0, v[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, s3
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[2:3], v0
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_unordered_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    flat_load_dword v0, v[0:1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, s3
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[2:3], v0
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %in, i32* %out) {
 entry:
   %val = load atomic i32, i32* %in syncscope("workgroup-one-as") unordered, align 4
@@ -2543,6 +3409,32 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_load(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[2:3], v0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v0, v[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, s3
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[2:3], v0
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    flat_load_dword v0, v[0:1] glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, s3
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[2:3], v0
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %in, i32* %out) {
 entry:
   %val = load atomic i32, i32* %in syncscope("workgroup-one-as") monotonic, align 4
@@ -2604,6 +3496,33 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_load(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[2:3], v0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v0, v[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, s3
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[2:3], v0
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    flat_load_dword v0, v[0:1] glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, s3
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[2:3], v0
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %in, i32* %out) {
 entry:
   %val = load atomic i32, i32* %in syncscope("workgroup-one-as") acquire, align 4
@@ -2667,6 +3586,34 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[2:3], v0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v0, v[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, s3
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[2:3], v0
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_load_dword v0, v[0:1] glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, s3
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[2:3], v0
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %in, i32* %out) {
 entry:
   %val = load atomic i32, i32* %in syncscope("workgroup-one-as") seq_cst, align 4
@@ -2720,6 +3667,26 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_store(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_unordered_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_unordered_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 %in, i32* %out) {
 entry:
   store atomic i32 %in, i32* %out syncscope("workgroup-one-as") unordered, align 4
@@ -2772,6 +3739,26 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_store(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 %in, i32* %out) {
 entry:
   store atomic i32 %in, i32* %out syncscope("workgroup-one-as") monotonic, align 4
@@ -2826,6 +3813,27 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_store(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 %in, i32* %out) {
 entry:
   store atomic i32 %in, i32* %out syncscope("workgroup-one-as") release, align 4
@@ -2880,6 +3888,27 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 %in, i32* %out) {
 entry:
   store atomic i32 %in, i32* %out syncscope("workgroup-one-as") seq_cst, align 4
@@ -2932,6 +3961,26 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup-one-as") monotonic
@@ -2986,6 +4035,28 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup-one-as") acquire
@@ -3040,6 +4111,27 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup-one-as") release
@@ -3096,6 +4188,29 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup-one-as") acq_rel
@@ -3152,6 +4267,29 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup-one-as") seq_cst
@@ -3214,6 +4352,31 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_ret_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup-one-as") acquire
@@ -3279,6 +4442,32 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup-one-as") acq_rel
@@ -3344,6 +4533,32 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup-one-as") seq_cst
@@ -3409,6 +4624,26 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_monotonic_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -3476,6 +4711,28 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -3543,6 +4800,27 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -3612,6 +4890,29 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -3681,6 +4982,29 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -3748,6 +5072,28 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -3817,6 +5163,29 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -3886,6 +5255,29 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -3955,6 +5347,29 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -4024,6 +5439,29 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -4107,6 +5545,31 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -4194,6 +5657,32 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -4281,6 +5770,32 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -4366,6 +5881,31 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -4453,6 +5993,32 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -4540,6 +6106,32 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -4627,6 +6219,32 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4
@@ -4714,6 +6332,32 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32* %out, i32 4

diff  --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll
index 5e96e4d191db..fb2b7d8b10fc 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll
@@ -4,6 +4,8 @@
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-WGP %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-CU %s
 ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s
 
 define amdgpu_kernel void @global_agent_unordered_load(
 ; GFX6-LABEL: global_agent_unordered_load:
@@ -72,6 +74,26 @@ define amdgpu_kernel void @global_agent_unordered_load(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_agent_unordered_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_agent_unordered_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
   %val = load atomic i32, i32 addrspace(1)* %in syncscope("agent") unordered, align 4
@@ -146,6 +168,26 @@ define amdgpu_kernel void @global_agent_monotonic_load(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_agent_monotonic_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1] glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_agent_monotonic_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1] glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
   %val = load atomic i32, i32 addrspace(1)* %in syncscope("agent") monotonic, align 4
@@ -226,6 +268,28 @@ define amdgpu_kernel void @global_agent_acquire_load(
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s3
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_agent_acquire_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1] glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_agent_acquire_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1] glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
   %val = load atomic i32, i32 addrspace(1)* %in syncscope("agent") acquire, align 4
@@ -311,6 +375,28 @@ define amdgpu_kernel void @global_agent_seq_cst_load(
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s3
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_agent_seq_cst_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1] glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_agent_seq_cst_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1] glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
   %val = load atomic i32, i32 addrspace(1)* %in syncscope("agent") seq_cst, align 4
@@ -373,6 +459,26 @@ define amdgpu_kernel void @global_agent_unordered_store(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_agent_unordered_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_agent_unordered_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 %in, i32 addrspace(1)* %out) {
 entry:
   store atomic i32 %in, i32 addrspace(1)* %out syncscope("agent") unordered, align 4
@@ -434,6 +540,26 @@ define amdgpu_kernel void @global_agent_monotonic_store(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_agent_monotonic_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_agent_monotonic_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 %in, i32 addrspace(1)* %out) {
 entry:
   store atomic i32 %in, i32 addrspace(1)* %out syncscope("agent") monotonic, align 4
@@ -502,6 +628,28 @@ define amdgpu_kernel void @global_agent_release_store(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_agent_release_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_agent_release_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 %in, i32 addrspace(1)* %out) {
 entry:
   store atomic i32 %in, i32 addrspace(1)* %out syncscope("agent") release, align 4
@@ -570,6 +718,28 @@ define amdgpu_kernel void @global_agent_seq_cst_store(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_agent_seq_cst_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_agent_seq_cst_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 %in, i32 addrspace(1)* %out) {
 entry:
   store atomic i32 %in, i32 addrspace(1)* %out syncscope("agent") seq_cst, align 4
@@ -631,6 +801,26 @@ define amdgpu_kernel void @global_agent_monotonic_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_agent_monotonic_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_agent_monotonic_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("agent") monotonic
@@ -703,6 +893,30 @@ define amdgpu_kernel void @global_agent_acquire_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_agent_acquire_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_agent_acquire_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("agent") acquire
@@ -771,6 +985,28 @@ define amdgpu_kernel void @global_agent_release_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_agent_release_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_agent_release_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("agent") release
@@ -850,6 +1086,32 @@ define amdgpu_kernel void @global_agent_acq_rel_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_agent_acq_rel_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_agent_acq_rel_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("agent") acq_rel
@@ -929,6 +1191,32 @@ define amdgpu_kernel void @global_agent_seq_cst_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_agent_seq_cst_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_agent_seq_cst_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("agent") seq_cst
@@ -1006,6 +1294,32 @@ define amdgpu_kernel void @global_agent_acquire_ret_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_agent_acquire_ret_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_agent_acquire_ret_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("agent") acquire
@@ -1091,6 +1405,34 @@ define amdgpu_kernel void @global_agent_acq_rel_ret_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_agent_acq_rel_ret_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_agent_acq_rel_ret_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("agent") acq_rel
@@ -1176,6 +1518,34 @@ define amdgpu_kernel void @global_agent_seq_cst_ret_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_agent_seq_cst_ret_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_agent_seq_cst_ret_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("agent") seq_cst
@@ -1245,6 +1615,26 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_agent_monotonic_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_agent_monotonic_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -1325,6 +1715,30 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_agent_acquire_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_agent_acquire_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -1401,6 +1815,28 @@ define amdgpu_kernel void @global_agent_release_monotonic_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_agent_release_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_agent_release_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -1488,6 +1924,32 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_agent_acq_rel_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_agent_acq_rel_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -1575,6 +2037,32 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_agent_seq_cst_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_agent_seq_cst_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -1655,6 +2143,30 @@ define amdgpu_kernel void @global_agent_acquire_acquire_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_agent_acquire_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_agent_acquire_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -1742,6 +2254,32 @@ define amdgpu_kernel void @global_agent_release_acquire_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_agent_release_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_agent_release_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -1829,6 +2367,32 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_agent_acq_rel_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_agent_acq_rel_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -1916,6 +2480,32 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_agent_seq_cst_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_agent_seq_cst_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -2003,6 +2593,32 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_agent_seq_cst_seq_cst_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_agent_seq_cst_seq_cst_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -2090,6 +2706,32 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_agent_acquire_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_agent_acquire_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -2186,6 +2828,34 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_agent_acq_rel_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_agent_acq_rel_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -2282,6 +2952,34 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_agent_seq_cst_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_agent_seq_cst_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -2371,6 +3069,32 @@ define amdgpu_kernel void @global_agent_acquire_acquire_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_agent_acquire_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_agent_acquire_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -2467,6 +3191,34 @@ define amdgpu_kernel void @global_agent_release_acquire_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_agent_release_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_agent_release_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -2563,6 +3315,34 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_agent_acq_rel_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_agent_acq_rel_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -2659,6 +3439,34 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_agent_seq_cst_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_agent_seq_cst_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -2755,6 +3563,34 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_agent_seq_cst_seq_cst_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_agent_seq_cst_seq_cst_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -2831,6 +3667,26 @@ define amdgpu_kernel void @global_agent_one_as_unordered_load(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_unordered_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_agent_one_as_unordered_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
   %val = load atomic i32, i32 addrspace(1)* %in syncscope("agent-one-as") unordered, align 4
@@ -2905,6 +3761,26 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_load(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_monotonic_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1] glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_agent_one_as_monotonic_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1] glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
   %val = load atomic i32, i32 addrspace(1)* %in syncscope("agent-one-as") monotonic, align 4
@@ -2985,6 +3861,28 @@ define amdgpu_kernel void @global_agent_one_as_acquire_load(
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s3
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acquire_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1] glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acquire_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1] glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
   %val = load atomic i32, i32 addrspace(1)* %in syncscope("agent-one-as") acquire, align 4
@@ -3070,6 +3968,28 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_load(
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s3
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1] glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_agent_one_as_seq_cst_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1] glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
   %val = load atomic i32, i32 addrspace(1)* %in syncscope("agent-one-as") seq_cst, align 4
@@ -3132,6 +4052,26 @@ define amdgpu_kernel void @global_agent_one_as_unordered_store(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_unordered_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_agent_one_as_unordered_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 %in, i32 addrspace(1)* %out) {
 entry:
   store atomic i32 %in, i32 addrspace(1)* %out syncscope("agent-one-as") unordered, align 4
@@ -3193,6 +4133,26 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_store(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_monotonic_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_agent_one_as_monotonic_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 %in, i32 addrspace(1)* %out) {
 entry:
   store atomic i32 %in, i32 addrspace(1)* %out syncscope("agent-one-as") monotonic, align 4
@@ -3261,6 +4221,28 @@ define amdgpu_kernel void @global_agent_one_as_release_store(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_release_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_agent_one_as_release_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 %in, i32 addrspace(1)* %out) {
 entry:
   store atomic i32 %in, i32 addrspace(1)* %out syncscope("agent-one-as") release, align 4
@@ -3329,6 +4311,28 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_store(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_agent_one_as_seq_cst_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 %in, i32 addrspace(1)* %out) {
 entry:
   store atomic i32 %in, i32 addrspace(1)* %out syncscope("agent-one-as") seq_cst, align 4
@@ -3390,6 +4394,26 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_monotonic_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_agent_one_as_monotonic_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("agent-one-as") monotonic
@@ -3462,6 +4486,30 @@ define amdgpu_kernel void @global_agent_one_as_acquire_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acquire_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acquire_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("agent-one-as") acquire
@@ -3530,6 +4578,28 @@ define amdgpu_kernel void @global_agent_one_as_release_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_release_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_agent_one_as_release_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("agent-one-as") release
@@ -3609,6 +4679,32 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acq_rel_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acq_rel_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("agent-one-as") acq_rel
@@ -3688,6 +4784,32 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_agent_one_as_seq_cst_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("agent-one-as") seq_cst
@@ -3765,6 +4887,32 @@ define amdgpu_kernel void @global_agent_one_as_acquire_ret_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acquire_ret_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acquire_ret_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("agent-one-as") acquire
@@ -3850,6 +4998,34 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_ret_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acq_rel_ret_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acq_rel_ret_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("agent-one-as") acq_rel
@@ -3935,6 +5111,34 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_ret_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_ret_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_agent_one_as_seq_cst_ret_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("agent-one-as") seq_cst
@@ -4004,6 +5208,26 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_monotonic_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_agent_one_as_monotonic_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -4084,6 +5308,30 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acquire_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acquire_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -4160,6 +5408,28 @@ define amdgpu_kernel void @global_agent_one_as_release_monotonic_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_release_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_agent_one_as_release_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -4247,6 +5517,32 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acq_rel_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acq_rel_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -4334,6 +5630,32 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_agent_one_as_seq_cst_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -4414,6 +5736,30 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acquire_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acquire_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -4501,6 +5847,32 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_release_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_agent_one_as_release_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -4588,6 +5960,32 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acq_rel_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acq_rel_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -4675,6 +6073,32 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_agent_one_as_seq_cst_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -4762,6 +6186,32 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_agent_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -4849,6 +6299,32 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -4945,6 +6421,34 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -5041,6 +6545,34 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -5130,6 +6662,32 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acquire_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acquire_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -5226,6 +6784,34 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_release_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_agent_one_as_release_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -5322,6 +6908,34 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -5418,6 +7032,34 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_agent_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -5514,6 +7156,34 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4

diff  --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll
index d88b87626f08..7ac4b3711328 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll
@@ -4,6 +4,8 @@
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-WGP %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-CU %s
 ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s
 
 define amdgpu_kernel void @global_nontemporal_load_0(
 ; GFX6-LABEL: global_nontemporal_load_0:
@@ -67,6 +69,30 @@ define amdgpu_kernel void @global_nontemporal_load_0(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_nontemporal_load_0:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_nontemporal_load_0:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
   %val = load i32, i32 addrspace(1)* %in, align 4, !nontemporal !0
@@ -145,6 +171,30 @@ define amdgpu_kernel void @global_nontemporal_load_1(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_nontemporal_load_1:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v0, v0, s[0:1] glc slc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v1, v0, s[2:3]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_nontemporal_load_1:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_load_dword v0, v0, s[0:1] glc slc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v1, v0, s[2:3]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -216,6 +266,30 @@ define amdgpu_kernel void @global_nontemporal_store_0(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0 glc slc
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_nontemporal_store_0:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] glc slc
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_nontemporal_store_0:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] glc slc
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
   %val = load i32, i32 addrspace(1)* %in, align 4
@@ -289,6 +363,30 @@ define amdgpu_kernel void @global_nontemporal_store_1(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 glc slc
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_nontemporal_store_1:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] glc slc
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_nontemporal_store_1:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] glc slc
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()

diff  --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll
index 426d8aa7d631..1b16cca31c61 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll
@@ -4,6 +4,8 @@
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-WGP %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-CU %s
 ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s
 
 define amdgpu_kernel void @global_singlethread_unordered_load(
 ; GFX6-LABEL: global_singlethread_unordered_load:
@@ -72,6 +74,28 @@ define amdgpu_kernel void @global_singlethread_unordered_load(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_unordered_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_singlethread_unordered_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
   %val = load atomic i32, i32 addrspace(1)* %in syncscope("singlethread") unordered, align 4
@@ -146,6 +170,28 @@ define amdgpu_kernel void @global_singlethread_monotonic_load(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_monotonic_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_singlethread_monotonic_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
   %val = load atomic i32, i32 addrspace(1)* %in syncscope("singlethread") monotonic, align 4
@@ -220,6 +266,28 @@ define amdgpu_kernel void @global_singlethread_acquire_load(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_acquire_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_singlethread_acquire_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
   %val = load atomic i32, i32 addrspace(1)* %in syncscope("singlethread") acquire, align 4
@@ -294,6 +362,28 @@ define amdgpu_kernel void @global_singlethread_seq_cst_load(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_seq_cst_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_singlethread_seq_cst_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
   %val = load atomic i32, i32 addrspace(1)* %in syncscope("singlethread") seq_cst, align 4
@@ -356,6 +446,28 @@ define amdgpu_kernel void @global_singlethread_unordered_store(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_unordered_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_singlethread_unordered_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 %in, i32 addrspace(1)* %out) {
 entry:
   store atomic i32 %in, i32 addrspace(1)* %out syncscope("singlethread") unordered, align 4
@@ -417,6 +529,28 @@ define amdgpu_kernel void @global_singlethread_monotonic_store(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_monotonic_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_singlethread_monotonic_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 %in, i32 addrspace(1)* %out) {
 entry:
   store atomic i32 %in, i32 addrspace(1)* %out syncscope("singlethread") monotonic, align 4
@@ -478,6 +612,28 @@ define amdgpu_kernel void @global_singlethread_release_store(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_release_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_singlethread_release_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 %in, i32 addrspace(1)* %out) {
 entry:
   store atomic i32 %in, i32 addrspace(1)* %out syncscope("singlethread") release, align 4
@@ -539,6 +695,28 @@ define amdgpu_kernel void @global_singlethread_seq_cst_store(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_seq_cst_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_singlethread_seq_cst_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 %in, i32 addrspace(1)* %out) {
 entry:
   store atomic i32 %in, i32 addrspace(1)* %out syncscope("singlethread") seq_cst, align 4
@@ -600,6 +778,28 @@ define amdgpu_kernel void @global_singlethread_monotonic_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_monotonic_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_singlethread_monotonic_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(1)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("singlethread") monotonic
@@ -661,6 +861,28 @@ define amdgpu_kernel void @global_singlethread_acquire_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_acquire_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_singlethread_acquire_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(1)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("singlethread") acquire
@@ -722,6 +944,28 @@ define amdgpu_kernel void @global_singlethread_release_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_release_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_singlethread_release_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(1)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("singlethread") release
@@ -783,6 +1027,28 @@ define amdgpu_kernel void @global_singlethread_acq_rel_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_acq_rel_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_singlethread_acq_rel_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(1)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("singlethread") acq_rel
@@ -844,6 +1110,28 @@ define amdgpu_kernel void @global_singlethread_seq_cst_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_seq_cst_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_singlethread_seq_cst_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(1)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("singlethread") seq_cst
@@ -915,6 +1203,32 @@ define amdgpu_kernel void @global_singlethread_acquire_ret_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_acquire_ret_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_singlethread_acquire_ret_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(1)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("singlethread") acquire
@@ -987,6 +1301,32 @@ define amdgpu_kernel void @global_singlethread_acq_rel_ret_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_acq_rel_ret_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_singlethread_acq_rel_ret_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(1)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("singlethread") acq_rel
@@ -1059,6 +1399,32 @@ define amdgpu_kernel void @global_singlethread_seq_cst_ret_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_seq_cst_ret_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_singlethread_seq_cst_ret_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(1)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("singlethread") seq_cst
@@ -1128,6 +1494,28 @@ define amdgpu_kernel void @global_singlethread_monotonic_monotonic_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_monotonic_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_singlethread_monotonic_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -1197,6 +1585,28 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_acquire_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_singlethread_acquire_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -1266,6 +1676,28 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_release_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_singlethread_release_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -1335,6 +1767,28 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_acq_rel_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_singlethread_acq_rel_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -1404,6 +1858,28 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_seq_cst_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_singlethread_seq_cst_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -1473,6 +1949,28 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_acquire_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_singlethread_acquire_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -1542,6 +2040,28 @@ define amdgpu_kernel void @global_singlethread_release_acquire_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_release_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_singlethread_release_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -1611,6 +2131,28 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_acq_rel_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_singlethread_acq_rel_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -1680,6 +2222,28 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_seq_cst_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_singlethread_seq_cst_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -1749,6 +2313,28 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -1830,6 +2416,32 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_acquire_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_singlethread_acquire_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -1913,6 +2525,32 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_acq_rel_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_singlethread_acq_rel_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -1996,6 +2634,32 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_seq_cst_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_singlethread_seq_cst_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -2079,6 +2743,32 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_acquire_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_singlethread_acquire_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -2162,6 +2852,32 @@ define amdgpu_kernel void @global_singlethread_release_acquire_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_release_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_singlethread_release_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -2245,6 +2961,32 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_acq_rel_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_singlethread_acq_rel_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -2328,6 +3070,32 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_seq_cst_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_singlethread_seq_cst_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -2411,6 +3179,32 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_seq_cst_seq_cst_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_singlethread_seq_cst_seq_cst_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -2487,6 +3281,28 @@ define amdgpu_kernel void @global_singlethread_one_as_unordered_load(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_unordered_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_unordered_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
   %val = load atomic i32, i32 addrspace(1)* %in syncscope("singlethread-one-as") unordered, align 4
@@ -2561,6 +3377,28 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_load(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_monotonic_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_monotonic_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
   %val = load atomic i32, i32 addrspace(1)* %in syncscope("singlethread-one-as") monotonic, align 4
@@ -2635,6 +3473,28 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_load(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_acquire_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acquire_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
   %val = load atomic i32, i32 addrspace(1)* %in syncscope("singlethread-one-as") acquire, align 4
@@ -2709,6 +3569,28 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_load(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
   %val = load atomic i32, i32 addrspace(1)* %in syncscope("singlethread-one-as") seq_cst, align 4
@@ -2771,6 +3653,28 @@ define amdgpu_kernel void @global_singlethread_one_as_unordered_store(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_unordered_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_unordered_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 %in, i32 addrspace(1)* %out) {
 entry:
   store atomic i32 %in, i32 addrspace(1)* %out syncscope("singlethread-one-as") unordered, align 4
@@ -2832,6 +3736,28 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_store(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_monotonic_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_monotonic_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 %in, i32 addrspace(1)* %out) {
 entry:
   store atomic i32 %in, i32 addrspace(1)* %out syncscope("singlethread-one-as") monotonic, align 4
@@ -2893,6 +3819,28 @@ define amdgpu_kernel void @global_singlethread_one_as_release_store(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_release_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_release_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 %in, i32 addrspace(1)* %out) {
 entry:
   store atomic i32 %in, i32 addrspace(1)* %out syncscope("singlethread-one-as") release, align 4
@@ -2954,6 +3902,28 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_store(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 %in, i32 addrspace(1)* %out) {
 entry:
   store atomic i32 %in, i32 addrspace(1)* %out syncscope("singlethread-one-as") seq_cst, align 4
@@ -3015,6 +3985,28 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_monotonic_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_monotonic_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(1)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("singlethread-one-as") monotonic
@@ -3076,6 +4068,28 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_acquire_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acquire_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(1)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("singlethread-one-as") acquire
@@ -3137,6 +4151,28 @@ define amdgpu_kernel void @global_singlethread_one_as_release_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_release_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_release_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(1)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("singlethread-one-as") release
@@ -3198,6 +4234,28 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_acq_rel_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acq_rel_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(1)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("singlethread-one-as") acq_rel
@@ -3259,6 +4317,28 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(1)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("singlethread-one-as") seq_cst
@@ -3330,6 +4410,32 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_ret_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_acquire_ret_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acquire_ret_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(1)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("singlethread-one-as") acquire
@@ -3402,6 +4508,32 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_ret_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_acq_rel_ret_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acq_rel_ret_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(1)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("singlethread-one-as") acq_rel
@@ -3474,6 +4606,32 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_ret_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_ret_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_ret_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(1)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("singlethread-one-as") seq_cst
@@ -3543,6 +4701,28 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_monotonic_cmpxch
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_monotonic_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_monotonic_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -3612,6 +4792,28 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -3681,6 +4883,28 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_release_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_release_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -3750,6 +4974,28 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -3819,6 +5065,28 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -3888,6 +5156,28 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -3957,6 +5247,28 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_release_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_release_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -4026,6 +5338,28 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_acq_rel_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acq_rel_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -4095,6 +5429,28 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -4164,6 +5520,28 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -4245,6 +5623,32 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_ret_cmpx
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -4328,6 +5732,32 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_ret_cmpx
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -4411,6 +5841,32 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_ret_cmpx
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -4494,6 +5950,32 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_ret_cmpxch
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_acquire_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acquire_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -4577,6 +6059,32 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_ret_cmpxch
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_release_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_release_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -4660,6 +6168,32 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_ret_cmpxch
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -4743,6 +6277,32 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_ret_cmpxch
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -4826,6 +6386,32 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxch
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4

diff  --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll
index 7ed129cb77b7..3b128cc9c456 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll
@@ -4,6 +4,8 @@
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-WGP %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-CU %s
 ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s
 
 define amdgpu_kernel void @global_system_unordered_load(
 ; GFX6-LABEL: global_system_unordered_load:
@@ -72,6 +74,26 @@ define amdgpu_kernel void @global_system_unordered_load(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_system_unordered_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_system_unordered_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
   %val = load atomic i32, i32 addrspace(1)* %in unordered, align 4
@@ -146,6 +168,26 @@ define amdgpu_kernel void @global_system_monotonic_load(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_system_monotonic_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1] glc scc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_system_monotonic_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1] glc scc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
   %val = load atomic i32, i32 addrspace(1)* %in monotonic, align 4
@@ -226,6 +268,32 @@ define amdgpu_kernel void @global_system_acquire_load(
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s3
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_system_acquire_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1] glc scc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_system_acquire_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1] glc scc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_invl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
   %val = load atomic i32, i32 addrspace(1)* %in acquire, align 4
@@ -311,6 +379,32 @@ define amdgpu_kernel void @global_system_seq_cst_load(
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s3
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_system_seq_cst_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1] glc scc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_system_seq_cst_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1] glc scc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_invl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
   %val = load atomic i32, i32 addrspace(1)* %in seq_cst, align 4
@@ -373,6 +467,26 @@ define amdgpu_kernel void @global_system_unordered_store(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_system_unordered_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_system_unordered_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 %in, i32 addrspace(1)* %out) {
 entry:
   store atomic i32 %in, i32 addrspace(1)* %out unordered, align 4
@@ -434,6 +548,26 @@ define amdgpu_kernel void @global_system_monotonic_store(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_system_monotonic_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] scc
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_system_monotonic_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] scc
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 %in, i32 addrspace(1)* %out) {
 entry:
   store atomic i32 %in, i32 addrspace(1)* %out monotonic, align 4
@@ -502,6 +636,30 @@ define amdgpu_kernel void @global_system_release_store(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_system_release_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] scc
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_system_release_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] scc
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 %in, i32 addrspace(1)* %out) {
 entry:
   store atomic i32 %in, i32 addrspace(1)* %out release, align 4
@@ -570,6 +728,30 @@ define amdgpu_kernel void @global_system_seq_cst_store(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_system_seq_cst_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] scc
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_system_seq_cst_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] scc
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 %in, i32 addrspace(1)* %out) {
 entry:
   store atomic i32 %in, i32 addrspace(1)* %out seq_cst, align 4
@@ -631,6 +813,26 @@ define amdgpu_kernel void @global_system_monotonic_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_system_monotonic_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1] scc
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_system_monotonic_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1] scc
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in monotonic
@@ -703,6 +905,34 @@ define amdgpu_kernel void @global_system_acquire_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_system_acquire_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1] scc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_system_acquire_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1] scc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_invl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in acquire
@@ -771,6 +1001,30 @@ define amdgpu_kernel void @global_system_release_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_system_release_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1] scc
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_system_release_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1] scc
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in release
@@ -850,6 +1104,38 @@ define amdgpu_kernel void @global_system_acq_rel_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_system_acq_rel_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1] scc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_system_acq_rel_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1] scc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_invl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in acq_rel
@@ -929,6 +1215,38 @@ define amdgpu_kernel void @global_system_seq_cst_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_system_seq_cst_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1] scc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_system_seq_cst_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1] scc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_invl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in seq_cst
@@ -1006,6 +1324,36 @@ define amdgpu_kernel void @global_system_acquire_ret_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_system_acquire_ret_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc scc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_system_acquire_ret_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc scc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_invl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in acquire
@@ -1091,6 +1439,40 @@ define amdgpu_kernel void @global_system_acq_rel_ret_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_system_acq_rel_ret_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc scc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_system_acq_rel_ret_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc scc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_invl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in acq_rel
@@ -1176,6 +1558,40 @@ define amdgpu_kernel void @global_system_seq_cst_ret_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_system_seq_cst_ret_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc scc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_system_seq_cst_ret_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc scc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_invl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in seq_cst
@@ -1245,6 +1661,26 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_system_monotonic_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 scc
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_system_monotonic_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 scc
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -1325,6 +1761,34 @@ define amdgpu_kernel void @global_system_acquire_monotonic_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_system_acquire_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 scc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_system_acquire_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 scc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_invl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -1401,6 +1865,30 @@ define amdgpu_kernel void @global_system_release_monotonic_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_system_release_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 scc
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_system_release_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 scc
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -1488,6 +1976,38 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_system_acq_rel_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 scc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_system_acq_rel_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 scc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_invl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -1575,6 +2095,38 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_system_seq_cst_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 scc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_system_seq_cst_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 scc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_invl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -1655,6 +2207,34 @@ define amdgpu_kernel void @global_system_acquire_acquire_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_system_acquire_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 scc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_system_acquire_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 scc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_invl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -1742,6 +2322,38 @@ define amdgpu_kernel void @global_system_release_acquire_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_system_release_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 scc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_system_release_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 scc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_invl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -1829,6 +2441,38 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_system_acq_rel_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 scc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_system_acq_rel_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 scc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_invl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -1916,6 +2560,38 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_system_seq_cst_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 scc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_system_seq_cst_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 scc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_invl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -2003,6 +2679,38 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_system_seq_cst_seq_cst_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 scc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_system_seq_cst_seq_cst_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 scc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_invl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -2090,6 +2798,36 @@ define amdgpu_kernel void @global_system_acquire_monotonic_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_system_acquire_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc scc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_system_acquire_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc scc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_invl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -2186,6 +2924,40 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc scc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc scc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_invl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -2282,6 +3054,40 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc scc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc scc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_invl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -2371,6 +3177,36 @@ define amdgpu_kernel void @global_system_acquire_acquire_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_system_acquire_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc scc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_system_acquire_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc scc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_invl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -2467,6 +3303,40 @@ define amdgpu_kernel void @global_system_release_acquire_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_system_release_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc scc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_system_release_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc scc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_invl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -2563,6 +3433,40 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_system_acq_rel_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc scc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_system_acq_rel_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc scc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_invl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -2659,6 +3563,40 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_system_seq_cst_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc scc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_system_seq_cst_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc scc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_invl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -2755,6 +3693,40 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc scc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc scc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_invl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -2831,6 +3803,26 @@ define amdgpu_kernel void @global_system_one_as_unordered_load(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_unordered_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_system_one_as_unordered_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
   %val = load atomic i32, i32 addrspace(1)* %in syncscope("one-as") unordered, align 4
@@ -2905,6 +3897,26 @@ define amdgpu_kernel void @global_system_one_as_monotonic_load(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_monotonic_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1] glc scc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_system_one_as_monotonic_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1] glc scc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
   %val = load atomic i32, i32 addrspace(1)* %in syncscope("one-as") monotonic, align 4
@@ -2985,6 +3997,32 @@ define amdgpu_kernel void @global_system_one_as_acquire_load(
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s3
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acquire_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1] glc scc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_system_one_as_acquire_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1] glc scc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_invl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
   %val = load atomic i32, i32 addrspace(1)* %in syncscope("one-as") acquire, align 4
@@ -3070,6 +4108,32 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_load(
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s3
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1] glc scc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_system_one_as_seq_cst_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1] glc scc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_invl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
   %val = load atomic i32, i32 addrspace(1)* %in syncscope("one-as") seq_cst, align 4
@@ -3132,6 +4196,26 @@ define amdgpu_kernel void @global_system_one_as_unordered_store(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_unordered_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_system_one_as_unordered_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 %in, i32 addrspace(1)* %out) {
 entry:
   store atomic i32 %in, i32 addrspace(1)* %out syncscope("one-as") unordered, align 4
@@ -3193,6 +4277,26 @@ define amdgpu_kernel void @global_system_one_as_monotonic_store(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_monotonic_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] scc
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_system_one_as_monotonic_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] scc
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 %in, i32 addrspace(1)* %out) {
 entry:
   store atomic i32 %in, i32 addrspace(1)* %out syncscope("one-as") monotonic, align 4
@@ -3261,6 +4365,30 @@ define amdgpu_kernel void @global_system_one_as_release_store(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_release_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] scc
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_system_one_as_release_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] scc
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 %in, i32 addrspace(1)* %out) {
 entry:
   store atomic i32 %in, i32 addrspace(1)* %out syncscope("one-as") release, align 4
@@ -3329,6 +4457,30 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_store(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] scc
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_system_one_as_seq_cst_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] scc
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 %in, i32 addrspace(1)* %out) {
 entry:
   store atomic i32 %in, i32 addrspace(1)* %out syncscope("one-as") seq_cst, align 4
@@ -3390,6 +4542,26 @@ define amdgpu_kernel void @global_system_one_as_monotonic_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_monotonic_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1] scc
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_system_one_as_monotonic_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1] scc
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("one-as") monotonic
@@ -3462,6 +4634,34 @@ define amdgpu_kernel void @global_system_one_as_acquire_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acquire_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1] scc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_system_one_as_acquire_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1] scc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_invl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("one-as") acquire
@@ -3530,6 +4730,30 @@ define amdgpu_kernel void @global_system_one_as_release_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_release_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1] scc
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_system_one_as_release_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1] scc
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("one-as") release
@@ -3609,6 +4833,38 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acq_rel_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1] scc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_system_one_as_acq_rel_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1] scc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_invl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("one-as") acq_rel
@@ -3688,6 +4944,38 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1] scc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_system_one_as_seq_cst_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1] scc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_invl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("one-as") seq_cst
@@ -3765,6 +5053,36 @@ define amdgpu_kernel void @global_system_one_as_acquire_ret_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acquire_ret_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc scc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_system_one_as_acquire_ret_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc scc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_invl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("one-as") acquire
@@ -3850,6 +5168,40 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_ret_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acq_rel_ret_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc scc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_system_one_as_acq_rel_ret_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc scc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_invl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("one-as") acq_rel
@@ -3935,6 +5287,40 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_ret_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_ret_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc scc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_system_one_as_seq_cst_ret_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc scc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_invl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("one-as") seq_cst
@@ -4004,6 +5390,26 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 scc
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 scc
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -4084,6 +5490,34 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acquire_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 scc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_system_one_as_acquire_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 scc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_invl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -4160,6 +5594,30 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_release_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 scc
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_system_one_as_release_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 scc
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -4247,6 +5705,38 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 scc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 scc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_invl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -4334,6 +5824,38 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 scc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 scc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_invl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -4414,6 +5936,34 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acquire_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 scc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_system_one_as_acquire_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 scc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_invl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -4501,6 +6051,38 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_release_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 scc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_system_one_as_release_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 scc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_invl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -4588,6 +6170,38 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 scc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 scc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_invl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -4675,6 +6289,38 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 scc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 scc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_invl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -4762,6 +6408,38 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 scc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 scc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_invl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -4849,6 +6527,36 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc scc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc scc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_invl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -4945,6 +6653,40 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc scc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc scc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_invl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -5041,6 +6783,40 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc scc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc scc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_invl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -5130,6 +6906,36 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc scc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc scc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_invl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -5226,6 +7032,40 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_release_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc scc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_system_one_as_release_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc scc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_invl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -5322,6 +7162,40 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc scc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc scc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_invl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -5418,6 +7292,40 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc scc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc scc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_invl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -5514,6 +7422,40 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc scc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc scc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_invl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4

diff  --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll
index c86d05b1143e..e763e9d91572 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll
@@ -4,6 +4,8 @@
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-WGP %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-CU %s
 ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s
 
 define amdgpu_kernel void @global_wavefront_unordered_load(
 ; GFX6-LABEL: global_wavefront_unordered_load:
@@ -72,6 +74,28 @@ define amdgpu_kernel void @global_wavefront_unordered_load(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_unordered_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_wavefront_unordered_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
   %val = load atomic i32, i32 addrspace(1)* %in syncscope("wavefront") unordered, align 4
@@ -146,6 +170,28 @@ define amdgpu_kernel void @global_wavefront_monotonic_load(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_monotonic_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_wavefront_monotonic_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
   %val = load atomic i32, i32 addrspace(1)* %in syncscope("wavefront") monotonic, align 4
@@ -220,6 +266,28 @@ define amdgpu_kernel void @global_wavefront_acquire_load(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_acquire_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_wavefront_acquire_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
   %val = load atomic i32, i32 addrspace(1)* %in syncscope("wavefront") acquire, align 4
@@ -294,6 +362,28 @@ define amdgpu_kernel void @global_wavefront_seq_cst_load(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_wavefront_seq_cst_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
   %val = load atomic i32, i32 addrspace(1)* %in syncscope("wavefront") seq_cst, align 4
@@ -356,6 +446,28 @@ define amdgpu_kernel void @global_wavefront_unordered_store(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_unordered_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_wavefront_unordered_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 %in, i32 addrspace(1)* %out) {
 entry:
   store atomic i32 %in, i32 addrspace(1)* %out syncscope("wavefront") unordered, align 4
@@ -417,6 +529,28 @@ define amdgpu_kernel void @global_wavefront_monotonic_store(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_monotonic_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_wavefront_monotonic_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 %in, i32 addrspace(1)* %out) {
 entry:
   store atomic i32 %in, i32 addrspace(1)* %out syncscope("wavefront") monotonic, align 4
@@ -478,6 +612,28 @@ define amdgpu_kernel void @global_wavefront_release_store(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_release_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_wavefront_release_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 %in, i32 addrspace(1)* %out) {
 entry:
   store atomic i32 %in, i32 addrspace(1)* %out syncscope("wavefront") release, align 4
@@ -539,6 +695,28 @@ define amdgpu_kernel void @global_wavefront_seq_cst_store(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_wavefront_seq_cst_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 %in, i32 addrspace(1)* %out) {
 entry:
   store atomic i32 %in, i32 addrspace(1)* %out syncscope("wavefront") seq_cst, align 4
@@ -600,6 +778,28 @@ define amdgpu_kernel void @global_wavefront_monotonic_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_monotonic_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_wavefront_monotonic_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(1)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("wavefront") monotonic
@@ -661,6 +861,28 @@ define amdgpu_kernel void @global_wavefront_acquire_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_acquire_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_wavefront_acquire_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(1)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("wavefront") acquire
@@ -722,6 +944,28 @@ define amdgpu_kernel void @global_wavefront_release_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_release_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_wavefront_release_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(1)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("wavefront") release
@@ -783,6 +1027,28 @@ define amdgpu_kernel void @global_wavefront_acq_rel_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_acq_rel_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_wavefront_acq_rel_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(1)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("wavefront") acq_rel
@@ -844,6 +1110,28 @@ define amdgpu_kernel void @global_wavefront_seq_cst_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_wavefront_seq_cst_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(1)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("wavefront") seq_cst
@@ -915,6 +1203,32 @@ define amdgpu_kernel void @global_wavefront_acquire_ret_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_acquire_ret_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_wavefront_acquire_ret_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(1)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("wavefront") acquire
@@ -987,6 +1301,32 @@ define amdgpu_kernel void @global_wavefront_acq_rel_ret_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_acq_rel_ret_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_wavefront_acq_rel_ret_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(1)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("wavefront") acq_rel
@@ -1059,6 +1399,32 @@ define amdgpu_kernel void @global_wavefront_seq_cst_ret_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_ret_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_wavefront_seq_cst_ret_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(1)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("wavefront") seq_cst
@@ -1128,6 +1494,28 @@ define amdgpu_kernel void @global_wavefront_monotonic_monotonic_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_monotonic_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_wavefront_monotonic_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -1197,6 +1585,28 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_acquire_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_wavefront_acquire_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -1266,6 +1676,28 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_release_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_wavefront_release_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -1335,6 +1767,28 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_acq_rel_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_wavefront_acq_rel_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -1404,6 +1858,28 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -1473,6 +1949,28 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_acquire_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_wavefront_acquire_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -1542,6 +2040,28 @@ define amdgpu_kernel void @global_wavefront_release_acquire_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_release_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_wavefront_release_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -1611,6 +2131,28 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_acq_rel_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_wavefront_acq_rel_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -1680,6 +2222,28 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_wavefront_seq_cst_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -1749,6 +2313,28 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -1830,6 +2416,32 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_acquire_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_wavefront_acquire_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -1913,6 +2525,32 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_acq_rel_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_wavefront_acq_rel_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -1996,6 +2634,32 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_wavefront_seq_cst_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -2079,6 +2743,32 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_acquire_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_wavefront_acquire_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -2162,6 +2852,32 @@ define amdgpu_kernel void @global_wavefront_release_acquire_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_release_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_wavefront_release_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -2245,6 +2961,32 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_acq_rel_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_wavefront_acq_rel_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -2328,6 +3070,32 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_wavefront_seq_cst_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -2411,6 +3179,32 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_seq_cst_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_wavefront_seq_cst_seq_cst_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -2487,6 +3281,28 @@ define amdgpu_kernel void @global_wavefront_one_as_unordered_load(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_unordered_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_unordered_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
   %val = load atomic i32, i32 addrspace(1)* %in syncscope("wavefront-one-as") unordered, align 4
@@ -2561,6 +3377,28 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_load(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_monotonic_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_monotonic_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
   %val = load atomic i32, i32 addrspace(1)* %in syncscope("wavefront-one-as") monotonic, align 4
@@ -2635,6 +3473,28 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_load(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acquire_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
   %val = load atomic i32, i32 addrspace(1)* %in syncscope("wavefront-one-as") acquire, align 4
@@ -2709,6 +3569,28 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_load(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
   %val = load atomic i32, i32 addrspace(1)* %in syncscope("wavefront-one-as") seq_cst, align 4
@@ -2771,6 +3653,28 @@ define amdgpu_kernel void @global_wavefront_one_as_unordered_store(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_unordered_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_unordered_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 %in, i32 addrspace(1)* %out) {
 entry:
   store atomic i32 %in, i32 addrspace(1)* %out syncscope("wavefront-one-as") unordered, align 4
@@ -2832,6 +3736,28 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_store(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_monotonic_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_monotonic_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 %in, i32 addrspace(1)* %out) {
 entry:
   store atomic i32 %in, i32 addrspace(1)* %out syncscope("wavefront-one-as") monotonic, align 4
@@ -2893,6 +3819,28 @@ define amdgpu_kernel void @global_wavefront_one_as_release_store(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_release_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_release_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 %in, i32 addrspace(1)* %out) {
 entry:
   store atomic i32 %in, i32 addrspace(1)* %out syncscope("wavefront-one-as") release, align 4
@@ -2954,6 +3902,28 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_store(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 %in, i32 addrspace(1)* %out) {
 entry:
   store atomic i32 %in, i32 addrspace(1)* %out syncscope("wavefront-one-as") seq_cst, align 4
@@ -3015,6 +3985,28 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_monotonic_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_monotonic_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(1)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("wavefront-one-as") monotonic
@@ -3076,6 +4068,28 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acquire_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(1)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("wavefront-one-as") acquire
@@ -3137,6 +4151,28 @@ define amdgpu_kernel void @global_wavefront_one_as_release_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_release_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_release_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(1)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("wavefront-one-as") release
@@ -3198,6 +4234,28 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acq_rel_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acq_rel_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(1)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("wavefront-one-as") acq_rel
@@ -3259,6 +4317,28 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(1)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("wavefront-one-as") seq_cst
@@ -3330,6 +4410,32 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_ret_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_ret_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acquire_ret_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(1)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("wavefront-one-as") acquire
@@ -3402,6 +4508,32 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_ret_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acq_rel_ret_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acq_rel_ret_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(1)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("wavefront-one-as") acq_rel
@@ -3474,6 +4606,32 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_ret_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_ret_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_ret_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(1)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("wavefront-one-as") seq_cst
@@ -3543,6 +4701,28 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -3612,6 +4792,28 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -3681,6 +4883,28 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -3750,6 +4974,28 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -3819,6 +5065,28 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -3888,6 +5156,28 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -3957,6 +5247,28 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_release_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_release_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -4026,6 +5338,28 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -4095,6 +5429,28 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -4164,6 +5520,28 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -4245,6 +5623,32 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_ret_cmpxchg
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -4328,6 +5732,32 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -4411,6 +5841,32 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -4494,6 +5950,32 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acquire_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -4577,6 +6059,32 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_release_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_release_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -4660,6 +6168,32 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -4743,6 +6277,32 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -4826,6 +6386,32 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4

diff  --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll
index dc36f51ced7b..728fcfdf49b0 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll
@@ -4,6 +4,8 @@
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-WGP %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-CU %s
 ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s
 
 define amdgpu_kernel void @global_workgroup_unordered_load(
 ; GFX6-LABEL: global_workgroup_unordered_load:
@@ -72,6 +74,26 @@ define amdgpu_kernel void @global_workgroup_unordered_load(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_unordered_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_workgroup_unordered_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
   %val = load atomic i32, i32 addrspace(1)* %in syncscope("workgroup") unordered, align 4
@@ -146,6 +168,26 @@ define amdgpu_kernel void @global_workgroup_monotonic_load(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_monotonic_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_workgroup_monotonic_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1] glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
   %val = load atomic i32, i32 addrspace(1)* %in syncscope("workgroup") monotonic, align 4
@@ -221,6 +263,27 @@ define amdgpu_kernel void @global_workgroup_acquire_load(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_acquire_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_workgroup_acquire_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1] glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
   %val = load atomic i32, i32 addrspace(1)* %in syncscope("workgroup") acquire, align 4
@@ -300,6 +363,27 @@ define amdgpu_kernel void @global_workgroup_seq_cst_load(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_seq_cst_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_workgroup_seq_cst_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1] glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
   %val = load atomic i32, i32 addrspace(1)* %in syncscope("workgroup") seq_cst, align 4
@@ -362,6 +446,26 @@ define amdgpu_kernel void @global_workgroup_unordered_store(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_unordered_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_workgroup_unordered_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 %in, i32 addrspace(1)* %out) {
 entry:
   store atomic i32 %in, i32 addrspace(1)* %out syncscope("workgroup") unordered, align 4
@@ -423,6 +527,26 @@ define amdgpu_kernel void @global_workgroup_monotonic_store(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_monotonic_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_workgroup_monotonic_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 %in, i32 addrspace(1)* %out) {
 entry:
   store atomic i32 %in, i32 addrspace(1)* %out syncscope("workgroup") monotonic, align 4
@@ -490,6 +614,28 @@ define amdgpu_kernel void @global_workgroup_release_store(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_release_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_workgroup_release_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 %in, i32 addrspace(1)* %out) {
 entry:
   store atomic i32 %in, i32 addrspace(1)* %out syncscope("workgroup") release, align 4
@@ -557,6 +703,28 @@ define amdgpu_kernel void @global_workgroup_seq_cst_store(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_seq_cst_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_workgroup_seq_cst_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 %in, i32 addrspace(1)* %out) {
 entry:
   store atomic i32 %in, i32 addrspace(1)* %out syncscope("workgroup") seq_cst, align 4
@@ -618,6 +786,26 @@ define amdgpu_kernel void @global_workgroup_monotonic_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_monotonic_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_workgroup_monotonic_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup") monotonic
@@ -681,6 +869,28 @@ define amdgpu_kernel void @global_workgroup_acquire_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_acquire_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_workgroup_acquire_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup") acquire
@@ -748,6 +958,28 @@ define amdgpu_kernel void @global_workgroup_release_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_release_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_workgroup_release_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup") release
@@ -817,6 +1049,30 @@ define amdgpu_kernel void @global_workgroup_acq_rel_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_acq_rel_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_workgroup_acq_rel_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup") acq_rel
@@ -886,6 +1142,30 @@ define amdgpu_kernel void @global_workgroup_seq_cst_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_seq_cst_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_workgroup_seq_cst_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup") seq_cst
@@ -958,6 +1238,31 @@ define amdgpu_kernel void @global_workgroup_acquire_ret_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_acquire_ret_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_workgroup_acquire_ret_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup") acquire
@@ -1037,6 +1342,33 @@ define amdgpu_kernel void @global_workgroup_acq_rel_ret_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_acq_rel_ret_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_workgroup_acq_rel_ret_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup") acq_rel
@@ -1116,6 +1448,33 @@ define amdgpu_kernel void @global_workgroup_seq_cst_ret_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_seq_cst_ret_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_workgroup_seq_cst_ret_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup") seq_cst
@@ -1185,6 +1544,26 @@ define amdgpu_kernel void @global_workgroup_monotonic_monotonic_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_monotonic_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_workgroup_monotonic_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -1256,6 +1635,28 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_acquire_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_workgroup_acquire_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -1331,6 +1732,28 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_release_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_workgroup_release_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -1408,6 +1831,30 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -1485,6 +1932,30 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -1556,6 +2027,28 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_acquire_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_workgroup_acquire_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -1633,6 +2126,30 @@ define amdgpu_kernel void @global_workgroup_release_acquire_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_release_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_workgroup_release_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -1710,6 +2227,30 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_acq_rel_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_workgroup_acq_rel_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -1787,6 +2328,30 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_seq_cst_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_workgroup_seq_cst_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -1864,6 +2429,30 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -1946,6 +2535,31 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_acquire_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_workgroup_acquire_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -2036,6 +2650,33 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_acq_rel_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_workgroup_acq_rel_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -2126,6 +2767,33 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_seq_cst_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_workgroup_seq_cst_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -2210,6 +2878,31 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_acquire_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_workgroup_acquire_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -2300,6 +2993,33 @@ define amdgpu_kernel void @global_workgroup_release_acquire_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_release_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_workgroup_release_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -2390,6 +3110,33 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_acq_rel_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_workgroup_acq_rel_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -2480,6 +3227,33 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_seq_cst_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_workgroup_seq_cst_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -2570,6 +3344,33 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_seq_cst_seq_cst_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_workgroup_seq_cst_seq_cst_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -2646,6 +3447,26 @@ define amdgpu_kernel void @global_workgroup_one_as_unordered_load(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_unordered_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_unordered_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
   %val = load atomic i32, i32 addrspace(1)* %in syncscope("workgroup-one-as") unordered, align 4
@@ -2720,6 +3541,26 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_load(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_monotonic_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_monotonic_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1] glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
   %val = load atomic i32, i32 addrspace(1)* %in syncscope("workgroup-one-as") monotonic, align 4
@@ -2795,6 +3636,27 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_load(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acquire_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acquire_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1] glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
   %val = load atomic i32, i32 addrspace(1)* %in syncscope("workgroup-one-as") acquire, align 4
@@ -2871,6 +3733,27 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_load(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1] glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
   %val = load atomic i32, i32 addrspace(1)* %in syncscope("workgroup-one-as") seq_cst, align 4
@@ -2933,6 +3816,26 @@ define amdgpu_kernel void @global_workgroup_one_as_unordered_store(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_unordered_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_unordered_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 %in, i32 addrspace(1)* %out) {
 entry:
   store atomic i32 %in, i32 addrspace(1)* %out syncscope("workgroup-one-as") unordered, align 4
@@ -2994,6 +3897,26 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_store(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_monotonic_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_monotonic_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 %in, i32 addrspace(1)* %out) {
 entry:
   store atomic i32 %in, i32 addrspace(1)* %out syncscope("workgroup-one-as") monotonic, align 4
@@ -3057,6 +3980,27 @@ define amdgpu_kernel void @global_workgroup_one_as_release_store(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_release_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_release_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 %in, i32 addrspace(1)* %out) {
 entry:
   store atomic i32 %in, i32 addrspace(1)* %out syncscope("workgroup-one-as") release, align 4
@@ -3120,6 +4064,27 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_store(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 %in, i32 addrspace(1)* %out) {
 entry:
   store atomic i32 %in, i32 addrspace(1)* %out syncscope("workgroup-one-as") seq_cst, align 4
@@ -3181,6 +4146,26 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_monotonic_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_monotonic_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup-one-as") monotonic
@@ -3244,6 +4229,28 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acquire_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acquire_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup-one-as") acquire
@@ -3307,6 +4314,27 @@ define amdgpu_kernel void @global_workgroup_one_as_release_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_release_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_release_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup-one-as") release
@@ -3372,6 +4400,29 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acq_rel_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acq_rel_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup-one-as") acq_rel
@@ -3437,6 +4488,29 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup-one-as") seq_cst
@@ -3509,6 +4583,31 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_ret_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acquire_ret_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acquire_ret_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup-one-as") acquire
@@ -3584,6 +4683,32 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_ret_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acq_rel_ret_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acq_rel_ret_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup-one-as") acq_rel
@@ -3659,6 +4784,32 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_ret_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_ret_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_ret_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup-one-as") seq_cst
@@ -3728,6 +4879,26 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -3799,6 +4970,28 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -3870,6 +5063,27 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -3943,6 +5157,29 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -4016,6 +5253,29 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -4087,6 +5347,28 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -4160,6 +5442,29 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_release_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_release_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -4233,6 +5538,29 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -4306,6 +5634,29 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -4379,6 +5730,29 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -4461,6 +5835,31 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_ret_cmpxchg
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -4547,6 +5946,32 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -4633,6 +6058,32 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -4717,6 +6168,31 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acquire_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acquire_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -4803,6 +6279,32 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_release_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_release_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -4889,6 +6391,32 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -4975,6 +6503,32 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -5061,6 +6615,32 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4

diff  --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-invalid-addrspace.mir b/llvm/test/CodeGen/AMDGPU/memory-legalizer-invalid-addrspace.mir
index 9fe27da1dfce..5804667ad8d3 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-invalid-addrspace.mir
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-invalid-addrspace.mir
@@ -11,10 +11,10 @@ body:             |
 
     $vgpr0 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr2_sgpr3
     $vgpr1 = V_MOV_B32_e32 killed $sgpr3, implicit $exec, implicit $sgpr2_sgpr3, implicit $exec
-    renamable $vgpr2 = FLAT_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load syncscope("one-as") seq_cst 4 from `i32 addrspace(42)* undef`)
+    renamable $vgpr2 = FLAT_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load syncscope("one-as") seq_cst 4 from `i32 addrspace(42)* undef`)
     $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
     $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
-    FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
+    FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
     S_ENDPGM 0
 
 ...
@@ -30,7 +30,7 @@ body:             |
     $vgpr2 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
     $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
     $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
-    FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (volatile store syncscope("agent-one-as") seq_cst 4 into `i32 addrspace(42)* undef`)
+    FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (volatile store syncscope("agent-one-as") seq_cst 4 into `i32 addrspace(42)* undef`)
     S_ENDPGM 0
 
 ...
@@ -47,7 +47,7 @@ body:             |
     $vgpr0 = V_MOV_B32_e32 killed $sgpr4, implicit $exec, implicit $exec
     $vgpr1 = V_MOV_B32_e32 killed $sgpr5, implicit $exec, implicit $exec
     $vgpr2 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
-    FLAT_ATOMIC_CMPSWAP killed renamable $vgpr2_vgpr3, killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load store syncscope("workgroup-one-as") seq_cst seq_cst 4 on `i32 addrspace(42)* undef`)
+    FLAT_ATOMIC_CMPSWAP killed renamable $vgpr2_vgpr3, killed renamable $vgpr0_vgpr1, 0, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load store syncscope("workgroup-one-as") seq_cst seq_cst 4 on `i32 addrspace(42)* undef`)
     S_ENDPGM 0
 
 ...
@@ -63,7 +63,7 @@ body:             |
     $vgpr0 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr2_sgpr3
     $vgpr1 = V_MOV_B32_e32 killed $sgpr3, implicit $exec, implicit $sgpr2_sgpr3, implicit $exec
     $vgpr2 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
-    FLAT_ATOMIC_SWAP killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load store syncscope("wavefront-one-as") seq_cst 4 on `i32 addrspace(42)* undef`)
+    FLAT_ATOMIC_SWAP killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load store syncscope("wavefront-one-as") seq_cst 4 on `i32 addrspace(42)* undef`)
     S_ENDPGM 0
 
 ...

diff  --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll
index db2705704c9d..bf0ce9d6b100 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll
@@ -4,6 +4,8 @@
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-WGP %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-CU %s
 ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s
 
 define amdgpu_kernel void @local_agent_unordered_load(
 ; GFX6-LABEL: local_agent_unordered_load:
@@ -64,6 +66,28 @@ define amdgpu_kernel void @local_agent_unordered_load(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v1, v0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_agent_unordered_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v0, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_agent_unordered_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v0, v0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
 entry:
   %val = load atomic i32, i32 addrspace(3)* %in syncscope("agent") unordered, align 4
@@ -130,6 +154,28 @@ define amdgpu_kernel void @local_agent_monotonic_load(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v1, v0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_agent_monotonic_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v0, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_agent_monotonic_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v0, v0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
 entry:
   %val = load atomic i32, i32 addrspace(3)* %in syncscope("agent") monotonic, align 4
@@ -197,6 +243,29 @@ define amdgpu_kernel void @local_agent_acquire_load(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v1, v0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_agent_acquire_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v0, v0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_agent_acquire_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v0, v0
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
 entry:
   %val = load atomic i32, i32 addrspace(3)* %in syncscope("agent") acquire, align 4
@@ -270,6 +339,31 @@ define amdgpu_kernel void @local_agent_seq_cst_load(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v1, v0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_agent_seq_cst_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v0, v0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_agent_seq_cst_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v0, v0
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
 entry:
   %val = load atomic i32, i32 addrspace(3)* %in syncscope("agent") seq_cst, align 4
@@ -326,6 +420,24 @@ define amdgpu_kernel void @local_agent_unordered_store(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_agent_unordered_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_agent_unordered_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 %in, i32 addrspace(3)* %out) {
 entry:
   store atomic i32 %in, i32 addrspace(3)* %out syncscope("agent") unordered, align 4
@@ -381,6 +493,24 @@ define amdgpu_kernel void @local_agent_monotonic_store(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_agent_monotonic_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_agent_monotonic_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 %in, i32 addrspace(3)* %out) {
 entry:
   store atomic i32 %in, i32 addrspace(3)* %out syncscope("agent") monotonic, align 4
@@ -442,6 +572,26 @@ define amdgpu_kernel void @local_agent_release_store(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_agent_release_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_agent_release_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 %in, i32 addrspace(3)* %out) {
 entry:
   store atomic i32 %in, i32 addrspace(3)* %out syncscope("agent") release, align 4
@@ -503,6 +653,26 @@ define amdgpu_kernel void @local_agent_seq_cst_store(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_agent_seq_cst_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_agent_seq_cst_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 %in, i32 addrspace(3)* %out) {
 entry:
   store atomic i32 %in, i32 addrspace(3)* %out syncscope("agent") seq_cst, align 4
@@ -558,6 +728,24 @@ define amdgpu_kernel void @local_agent_monotonic_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_agent_monotonic_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_agent_monotonic_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("agent") monotonic
@@ -619,6 +807,26 @@ define amdgpu_kernel void @local_agent_acquire_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_agent_acquire_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_agent_acquire_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("agent") acquire
@@ -680,6 +888,26 @@ define amdgpu_kernel void @local_agent_release_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_agent_release_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_agent_release_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("agent") release
@@ -747,6 +975,28 @@ define amdgpu_kernel void @local_agent_acq_rel_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_agent_acq_rel_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_agent_acq_rel_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("agent") acq_rel
@@ -814,6 +1064,28 @@ define amdgpu_kernel void @local_agent_seq_cst_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_agent_seq_cst_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_agent_seq_cst_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("agent") seq_cst
@@ -880,6 +1152,29 @@ define amdgpu_kernel void @local_agent_acquire_ret_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_agent_acquire_ret_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_agent_acquire_ret_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("agent") acquire
@@ -953,6 +1248,31 @@ define amdgpu_kernel void @local_agent_acq_rel_ret_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_agent_acq_rel_ret_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_agent_acq_rel_ret_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("agent") acq_rel
@@ -1026,6 +1346,31 @@ define amdgpu_kernel void @local_agent_seq_cst_ret_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_agent_seq_cst_ret_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_agent_seq_cst_ret_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("agent") seq_cst
@@ -1088,6 +1433,26 @@ define amdgpu_kernel void @local_agent_monotonic_monotonic_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_agent_monotonic_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_agent_monotonic_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -1156,6 +1521,28 @@ define amdgpu_kernel void @local_agent_acquire_monotonic_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_agent_acquire_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_agent_acquire_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -1224,6 +1611,28 @@ define amdgpu_kernel void @local_agent_release_monotonic_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_agent_release_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_agent_release_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -1298,6 +1707,30 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_agent_acq_rel_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_agent_acq_rel_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -1372,6 +1805,30 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_agent_seq_cst_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_agent_seq_cst_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -1440,6 +1897,28 @@ define amdgpu_kernel void @local_agent_acquire_acquire_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_agent_acquire_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_agent_acquire_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -1514,6 +1993,30 @@ define amdgpu_kernel void @local_agent_release_acquire_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_agent_release_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_agent_release_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -1588,6 +2091,30 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_agent_acq_rel_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_agent_acq_rel_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -1662,6 +2189,30 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_agent_seq_cst_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_agent_seq_cst_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -1736,6 +2287,30 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_agent_seq_cst_seq_cst_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_agent_seq_cst_seq_cst_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -1809,6 +2384,31 @@ define amdgpu_kernel void @local_agent_acquire_monotonic_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_agent_acquire_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_agent_acquire_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -1890,6 +2490,33 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_agent_acq_rel_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_agent_acq_rel_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -1971,6 +2598,33 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_agent_seq_cst_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_agent_seq_cst_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -2046,6 +2700,31 @@ define amdgpu_kernel void @local_agent_acquire_acquire_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_agent_acquire_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_agent_acquire_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -2127,6 +2806,33 @@ define amdgpu_kernel void @local_agent_release_acquire_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_agent_release_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_agent_release_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -2208,6 +2914,33 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_agent_acq_rel_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_agent_acq_rel_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -2289,6 +3022,33 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_agent_seq_cst_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_agent_seq_cst_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -2370,6 +3130,33 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_agent_seq_cst_seq_cst_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_agent_seq_cst_seq_cst_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -2438,6 +3225,28 @@ define amdgpu_kernel void @local_agent_one_as_unordered_load(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v1, v0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_unordered_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v0, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_agent_one_as_unordered_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v0, v0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
 entry:
   %val = load atomic i32, i32 addrspace(3)* %in syncscope("agent-one-as") unordered, align 4
@@ -2504,6 +3313,28 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_load(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v1, v0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_monotonic_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v0, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_agent_one_as_monotonic_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v0, v0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
 entry:
   %val = load atomic i32, i32 addrspace(3)* %in syncscope("agent-one-as") monotonic, align 4
@@ -2570,6 +3401,28 @@ define amdgpu_kernel void @local_agent_one_as_acquire_load(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v1, v0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_acquire_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v0, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_agent_one_as_acquire_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v0, v0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
 entry:
   %val = load atomic i32, i32 addrspace(3)* %in syncscope("agent-one-as") acquire, align 4
@@ -2636,6 +3489,28 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_load(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v1, v0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_seq_cst_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v0, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_agent_one_as_seq_cst_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v0, v0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
 entry:
   %val = load atomic i32, i32 addrspace(3)* %in syncscope("agent-one-as") seq_cst, align 4
@@ -2692,6 +3567,24 @@ define amdgpu_kernel void @local_agent_one_as_unordered_store(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_unordered_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_agent_one_as_unordered_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 %in, i32 addrspace(3)* %out) {
 entry:
   store atomic i32 %in, i32 addrspace(3)* %out syncscope("agent-one-as") unordered, align 4
@@ -2747,6 +3640,24 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_store(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_monotonic_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_agent_one_as_monotonic_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 %in, i32 addrspace(3)* %out) {
 entry:
   store atomic i32 %in, i32 addrspace(3)* %out syncscope("agent-one-as") monotonic, align 4
@@ -2802,6 +3713,24 @@ define amdgpu_kernel void @local_agent_one_as_release_store(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_release_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_agent_one_as_release_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 %in, i32 addrspace(3)* %out) {
 entry:
   store atomic i32 %in, i32 addrspace(3)* %out syncscope("agent-one-as") release, align 4
@@ -2857,6 +3786,24 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_store(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_seq_cst_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_agent_one_as_seq_cst_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 %in, i32 addrspace(3)* %out) {
 entry:
   store atomic i32 %in, i32 addrspace(3)* %out syncscope("agent-one-as") seq_cst, align 4
@@ -2912,6 +3859,24 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_monotonic_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_agent_one_as_monotonic_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("agent-one-as") monotonic
@@ -2967,6 +3932,24 @@ define amdgpu_kernel void @local_agent_one_as_acquire_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_acquire_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_agent_one_as_acquire_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("agent-one-as") acquire
@@ -3022,6 +4005,24 @@ define amdgpu_kernel void @local_agent_one_as_release_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_release_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_agent_one_as_release_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("agent-one-as") release
@@ -3077,6 +4078,24 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_acq_rel_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_agent_one_as_acq_rel_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("agent-one-as") acq_rel
@@ -3132,6 +4151,24 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_seq_cst_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_agent_one_as_seq_cst_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("agent-one-as") seq_cst
@@ -3197,6 +4234,28 @@ define amdgpu_kernel void @local_agent_one_as_acquire_ret_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_acquire_ret_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_agent_one_as_acquire_ret_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("agent-one-as") acquire
@@ -3263,6 +4322,28 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_ret_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_acq_rel_ret_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_agent_one_as_acq_rel_ret_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("agent-one-as") acq_rel
@@ -3329,6 +4410,28 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_ret_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_seq_cst_ret_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_agent_one_as_seq_cst_ret_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("agent-one-as") seq_cst
@@ -3391,6 +4494,26 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_monotonic_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_monotonic_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_agent_one_as_monotonic_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -3453,6 +4576,26 @@ define amdgpu_kernel void @local_agent_one_as_acquire_monotonic_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_acquire_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_agent_one_as_acquire_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -3515,6 +4658,26 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_release_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_agent_one_as_release_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -3577,6 +4740,26 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_acq_rel_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_agent_one_as_acq_rel_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -3639,6 +4822,26 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_seq_cst_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_agent_one_as_seq_cst_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -3701,6 +4904,26 @@ define amdgpu_kernel void @local_agent_one_as_acquire_acquire_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_acquire_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_agent_one_as_acquire_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -3763,6 +4986,26 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_release_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_agent_one_as_release_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -3825,6 +5068,26 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_acq_rel_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_agent_one_as_acq_rel_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -3887,6 +5150,26 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_seq_cst_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_agent_one_as_seq_cst_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -3949,6 +5232,26 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_agent_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -4021,6 +5324,30 @@ define amdgpu_kernel void @local_agent_one_as_acquire_monotonic_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_agent_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -4095,6 +5422,30 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -4169,6 +5520,30 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -4243,6 +5618,30 @@ define amdgpu_kernel void @local_agent_one_as_acquire_acquire_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_acquire_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_agent_one_as_acquire_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -4317,6 +5716,30 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_release_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_agent_one_as_release_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -4391,6 +5814,30 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_agent_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -4465,6 +5912,30 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_agent_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -4539,6 +6010,30 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4

diff  --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll
index 9885036a78c8..12eef07032da 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll
@@ -4,6 +4,8 @@
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-WGP %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-CU %s
 ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s
 
 define amdgpu_kernel void @local_nontemporal_load_0(
 ; GFX6-LABEL: local_nontemporal_load_0:
@@ -73,6 +75,32 @@ define amdgpu_kernel void @local_nontemporal_load_0(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_nontemporal_load_0:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v1, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_nontemporal_load_0:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v1, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(3)* %in, i32 addrspace(1)* %out) {
 entry:
   %val = load i32, i32 addrspace(3)* %in, align 4, !nontemporal !0
@@ -151,6 +179,32 @@ define amdgpu_kernel void @local_nontemporal_load_1(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_nontemporal_load_1:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_lshl_add_u32 v0, v0, 2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v0, v0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_nontemporal_load_1:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_lshl_add_u32 v0, v0, 2, s2
+; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v0, v0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(3)* %in, i32 addrspace(1)* %out) {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -225,6 +279,32 @@ define amdgpu_kernel void @local_nontemporal_store_0(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_nontemporal_store_0:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_nontemporal_store_0:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(1)* %in, i32 addrspace(3)* %out) {
 entry:
   %val = load i32, i32 addrspace(1)* %in, align 4
@@ -300,6 +380,32 @@ define amdgpu_kernel void @local_nontemporal_store_1(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_nontemporal_store_1:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    v_lshl_add_u32 v0, v0, 2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_nontemporal_store_1:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX90A-TGSPLIT-NEXT:    v_lshl_add_u32 v0, v0, 2, s2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(1)* %in, i32 addrspace(3)* %out) {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()

diff  --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-singlethread.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-singlethread.ll
index 4e521c3dce8b..482a31f08724 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-singlethread.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-singlethread.ll
@@ -4,6 +4,8 @@
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-WGP %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-CU %s
 ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s
 
 define amdgpu_kernel void @local_singlethread_unordered_load(
 ; GFX6-LABEL: local_singlethread_unordered_load:
@@ -64,6 +66,30 @@ define amdgpu_kernel void @local_singlethread_unordered_load(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v1, v0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_unordered_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v0, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_singlethread_unordered_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v0, v0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
 entry:
   %val = load atomic i32, i32 addrspace(3)* %in syncscope("singlethread") unordered, align 4
@@ -130,6 +156,30 @@ define amdgpu_kernel void @local_singlethread_monotonic_load(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v1, v0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_monotonic_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v0, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_singlethread_monotonic_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v0, v0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
 entry:
   %val = load atomic i32, i32 addrspace(3)* %in syncscope("singlethread") monotonic, align 4
@@ -196,6 +246,30 @@ define amdgpu_kernel void @local_singlethread_acquire_load(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v1, v0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_acquire_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v0, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_singlethread_acquire_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v0, v0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
 entry:
   %val = load atomic i32, i32 addrspace(3)* %in syncscope("singlethread") acquire, align 4
@@ -262,6 +336,30 @@ define amdgpu_kernel void @local_singlethread_seq_cst_load(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v1, v0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_seq_cst_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v0, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_singlethread_seq_cst_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v0, v0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
 entry:
   %val = load atomic i32, i32 addrspace(3)* %in syncscope("singlethread") seq_cst, align 4
@@ -318,6 +416,26 @@ define amdgpu_kernel void @local_singlethread_unordered_store(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_unordered_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_singlethread_unordered_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 %in, i32 addrspace(3)* %out) {
 entry:
   store atomic i32 %in, i32 addrspace(3)* %out syncscope("singlethread") unordered, align 4
@@ -373,6 +491,26 @@ define amdgpu_kernel void @local_singlethread_monotonic_store(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_monotonic_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_singlethread_monotonic_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 %in, i32 addrspace(3)* %out) {
 entry:
   store atomic i32 %in, i32 addrspace(3)* %out syncscope("singlethread") monotonic, align 4
@@ -428,6 +566,26 @@ define amdgpu_kernel void @local_singlethread_release_store(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_release_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_singlethread_release_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 %in, i32 addrspace(3)* %out) {
 entry:
   store atomic i32 %in, i32 addrspace(3)* %out syncscope("singlethread") release, align 4
@@ -483,6 +641,26 @@ define amdgpu_kernel void @local_singlethread_seq_cst_store(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_seq_cst_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_singlethread_seq_cst_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 %in, i32 addrspace(3)* %out) {
 entry:
   store atomic i32 %in, i32 addrspace(3)* %out syncscope("singlethread") seq_cst, align 4
@@ -538,6 +716,26 @@ define amdgpu_kernel void @local_singlethread_monotonic_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_monotonic_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_singlethread_monotonic_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(3)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread") monotonic
@@ -593,6 +791,26 @@ define amdgpu_kernel void @local_singlethread_acquire_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_acquire_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_singlethread_acquire_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(3)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread") acquire
@@ -648,6 +866,26 @@ define amdgpu_kernel void @local_singlethread_release_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_release_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_singlethread_release_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(3)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread") release
@@ -703,6 +941,26 @@ define amdgpu_kernel void @local_singlethread_acq_rel_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_acq_rel_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_singlethread_acq_rel_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(3)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread") acq_rel
@@ -758,6 +1016,26 @@ define amdgpu_kernel void @local_singlethread_seq_cst_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_seq_cst_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_singlethread_seq_cst_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(3)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread") seq_cst
@@ -823,6 +1101,30 @@ define amdgpu_kernel void @local_singlethread_acquire_ret_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_acquire_ret_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_singlethread_acquire_ret_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(3)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread") acquire
@@ -889,6 +1191,30 @@ define amdgpu_kernel void @local_singlethread_acq_rel_ret_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_acq_rel_ret_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_singlethread_acq_rel_ret_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(3)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread") acq_rel
@@ -955,6 +1281,30 @@ define amdgpu_kernel void @local_singlethread_seq_cst_ret_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_seq_cst_ret_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_singlethread_seq_cst_ret_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(3)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread") seq_cst
@@ -1017,6 +1367,28 @@ define amdgpu_kernel void @local_singlethread_monotonic_monotonic_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_monotonic_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_singlethread_monotonic_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -1079,6 +1451,28 @@ define amdgpu_kernel void @local_singlethread_acquire_monotonic_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_acquire_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_singlethread_acquire_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -1141,6 +1535,28 @@ define amdgpu_kernel void @local_singlethread_release_monotonic_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_release_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_singlethread_release_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -1203,6 +1619,28 @@ define amdgpu_kernel void @local_singlethread_acq_rel_monotonic_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_acq_rel_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_singlethread_acq_rel_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -1265,6 +1703,28 @@ define amdgpu_kernel void @local_singlethread_seq_cst_monotonic_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_seq_cst_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_singlethread_seq_cst_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -1327,6 +1787,28 @@ define amdgpu_kernel void @local_singlethread_acquire_acquire_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_acquire_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_singlethread_acquire_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -1389,6 +1871,28 @@ define amdgpu_kernel void @local_singlethread_release_acquire_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_release_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_singlethread_release_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -1451,6 +1955,28 @@ define amdgpu_kernel void @local_singlethread_acq_rel_acquire_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_acq_rel_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_singlethread_acq_rel_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -1513,6 +2039,28 @@ define amdgpu_kernel void @local_singlethread_seq_cst_acquire_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_seq_cst_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_singlethread_seq_cst_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -1575,6 +2123,28 @@ define amdgpu_kernel void @local_singlethread_seq_cst_seq_cst_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_seq_cst_seq_cst_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_singlethread_seq_cst_seq_cst_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -1647,6 +2217,32 @@ define amdgpu_kernel void @local_singlethread_acquire_monotonic_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_acquire_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_singlethread_acquire_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -1721,6 +2317,32 @@ define amdgpu_kernel void @local_singlethread_acq_rel_monotonic_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_acq_rel_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_singlethread_acq_rel_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -1795,6 +2417,32 @@ define amdgpu_kernel void @local_singlethread_seq_cst_monotonic_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_seq_cst_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_singlethread_seq_cst_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -1869,6 +2517,32 @@ define amdgpu_kernel void @local_singlethread_acquire_acquire_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_acquire_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_singlethread_acquire_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -1943,6 +2617,32 @@ define amdgpu_kernel void @local_singlethread_release_acquire_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_release_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_singlethread_release_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -2017,6 +2717,32 @@ define amdgpu_kernel void @local_singlethread_acq_rel_acquire_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_acq_rel_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_singlethread_acq_rel_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -2091,6 +2817,32 @@ define amdgpu_kernel void @local_singlethread_seq_cst_acquire_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_seq_cst_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_singlethread_seq_cst_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -2165,6 +2917,32 @@ define amdgpu_kernel void @local_singlethread_seq_cst_seq_cst_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_seq_cst_seq_cst_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_singlethread_seq_cst_seq_cst_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -2233,6 +3011,30 @@ define amdgpu_kernel void @local_singlethread_one_as_unordered_load(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v1, v0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_unordered_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v0, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_unordered_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v0, v0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
 entry:
   %val = load atomic i32, i32 addrspace(3)* %in syncscope("singlethread-one-as") unordered, align 4
@@ -2299,6 +3101,30 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_load(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v1, v0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_monotonic_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v0, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_monotonic_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v0, v0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
 entry:
   %val = load atomic i32, i32 addrspace(3)* %in syncscope("singlethread-one-as") monotonic, align 4
@@ -2365,6 +3191,30 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_load(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v1, v0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_acquire_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v0, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_acquire_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v0, v0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
 entry:
   %val = load atomic i32, i32 addrspace(3)* %in syncscope("singlethread-one-as") acquire, align 4
@@ -2431,6 +3281,30 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_load(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v1, v0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_seq_cst_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v0, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v0, v0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
 entry:
   %val = load atomic i32, i32 addrspace(3)* %in syncscope("singlethread-one-as") seq_cst, align 4
@@ -2487,6 +3361,26 @@ define amdgpu_kernel void @local_singlethread_one_as_unordered_store(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_unordered_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_unordered_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 %in, i32 addrspace(3)* %out) {
 entry:
   store atomic i32 %in, i32 addrspace(3)* %out syncscope("singlethread-one-as") unordered, align 4
@@ -2542,6 +3436,26 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_store(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_monotonic_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_monotonic_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 %in, i32 addrspace(3)* %out) {
 entry:
   store atomic i32 %in, i32 addrspace(3)* %out syncscope("singlethread-one-as") monotonic, align 4
@@ -2597,6 +3511,26 @@ define amdgpu_kernel void @local_singlethread_one_as_release_store(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_release_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_release_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 %in, i32 addrspace(3)* %out) {
 entry:
   store atomic i32 %in, i32 addrspace(3)* %out syncscope("singlethread-one-as") release, align 4
@@ -2652,6 +3586,26 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_store(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_seq_cst_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 %in, i32 addrspace(3)* %out) {
 entry:
   store atomic i32 %in, i32 addrspace(3)* %out syncscope("singlethread-one-as") seq_cst, align 4
@@ -2707,6 +3661,26 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_monotonic_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_monotonic_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(3)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread-one-as") monotonic
@@ -2762,6 +3736,26 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_acquire_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_acquire_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(3)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread-one-as") acquire
@@ -2817,6 +3811,26 @@ define amdgpu_kernel void @local_singlethread_one_as_release_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_release_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_release_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(3)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread-one-as") release
@@ -2872,6 +3886,26 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_acq_rel_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_acq_rel_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(3)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread-one-as") acq_rel
@@ -2927,6 +3961,26 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_seq_cst_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(3)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread-one-as") seq_cst
@@ -2992,6 +4046,30 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_ret_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_acquire_ret_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_acquire_ret_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(3)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread-one-as") acquire
@@ -3058,6 +4136,30 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_ret_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_acq_rel_ret_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_acq_rel_ret_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(3)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread-one-as") acq_rel
@@ -3124,6 +4226,30 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_ret_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_seq_cst_ret_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_ret_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(3)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread-one-as") seq_cst
@@ -3186,6 +4312,28 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_monotonic_cmpxchg
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_monotonic_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_monotonic_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -3248,6 +4396,28 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_monotonic_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_acquire_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_acquire_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -3310,6 +4480,28 @@ define amdgpu_kernel void @local_singlethread_one_as_release_monotonic_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_release_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_release_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -3372,6 +4564,28 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_monotonic_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_acq_rel_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_acq_rel_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -3434,6 +4648,28 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_monotonic_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_seq_cst_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -3496,6 +4732,28 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_acquire_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_acquire_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_acquire_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -3558,6 +4816,28 @@ define amdgpu_kernel void @local_singlethread_one_as_release_acquire_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_release_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_release_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -3620,6 +4900,28 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_acquire_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_acq_rel_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_acq_rel_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -3682,6 +4984,28 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_acquire_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_seq_cst_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -3744,6 +5068,28 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_seq_cst_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -3816,6 +5162,32 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_monotonic_ret_cmpxc
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -3890,6 +5262,32 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_monotonic_ret_cmpxc
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -3964,6 +5362,32 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_monotonic_ret_cmpxc
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -4038,6 +5462,32 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_acquire_ret_cmpxchg
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_acquire_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_acquire_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -4112,6 +5562,32 @@ define amdgpu_kernel void @local_singlethread_one_as_release_acquire_ret_cmpxchg
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_release_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_release_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -4186,6 +5662,32 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -4260,6 +5762,32 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -4334,6 +5862,32 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4

diff  --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll
index 9df4ca45c682..76cc1a5d5b9f 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll
@@ -4,6 +4,8 @@
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-WGP %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-CU %s
 ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s
 
 define amdgpu_kernel void @local_system_unordered_load(
 ; GFX6-LABEL: local_system_unordered_load:
@@ -64,6 +66,28 @@ define amdgpu_kernel void @local_system_unordered_load(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v1, v0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_system_unordered_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v0, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_system_unordered_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v0, v0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
 entry:
   %val = load atomic i32, i32 addrspace(3)* %in unordered, align 4
@@ -130,6 +154,28 @@ define amdgpu_kernel void @local_system_monotonic_load(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v1, v0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_system_monotonic_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v0, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_system_monotonic_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v0, v0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
 entry:
   %val = load atomic i32, i32 addrspace(3)* %in monotonic, align 4
@@ -197,6 +243,29 @@ define amdgpu_kernel void @local_system_acquire_load(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v1, v0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_system_acquire_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v0, v0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_system_acquire_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v0, v0
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
 entry:
   %val = load atomic i32, i32 addrspace(3)* %in acquire, align 4
@@ -270,6 +339,31 @@ define amdgpu_kernel void @local_system_seq_cst_load(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v1, v0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_system_seq_cst_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v0, v0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_system_seq_cst_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v0, v0
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
 entry:
   %val = load atomic i32, i32 addrspace(3)* %in seq_cst, align 4
@@ -326,6 +420,24 @@ define amdgpu_kernel void @local_system_unordered_store(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_system_unordered_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_system_unordered_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 %in, i32 addrspace(3)* %out) {
 entry:
   store atomic i32 %in, i32 addrspace(3)* %out unordered, align 4
@@ -381,6 +493,24 @@ define amdgpu_kernel void @local_system_monotonic_store(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_system_monotonic_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_system_monotonic_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 %in, i32 addrspace(3)* %out) {
 entry:
   store atomic i32 %in, i32 addrspace(3)* %out monotonic, align 4
@@ -442,6 +572,26 @@ define amdgpu_kernel void @local_system_release_store(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_system_release_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_system_release_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 %in, i32 addrspace(3)* %out) {
 entry:
   store atomic i32 %in, i32 addrspace(3)* %out release, align 4
@@ -503,6 +653,26 @@ define amdgpu_kernel void @local_system_seq_cst_store(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_system_seq_cst_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_system_seq_cst_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 %in, i32 addrspace(3)* %out) {
 entry:
   store atomic i32 %in, i32 addrspace(3)* %out seq_cst, align 4
@@ -558,6 +728,24 @@ define amdgpu_kernel void @local_system_monotonic_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_system_monotonic_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_system_monotonic_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in monotonic
@@ -619,6 +807,26 @@ define amdgpu_kernel void @local_system_acquire_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_system_acquire_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_system_acquire_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in acquire
@@ -680,6 +888,26 @@ define amdgpu_kernel void @local_system_release_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_system_release_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_system_release_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in release
@@ -747,6 +975,28 @@ define amdgpu_kernel void @local_system_acq_rel_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_system_acq_rel_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_system_acq_rel_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in acq_rel
@@ -814,6 +1064,28 @@ define amdgpu_kernel void @local_system_seq_cst_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_system_seq_cst_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_system_seq_cst_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in seq_cst
@@ -880,6 +1152,29 @@ define amdgpu_kernel void @local_system_acquire_ret_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_system_acquire_ret_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_system_acquire_ret_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in acquire
@@ -953,6 +1248,31 @@ define amdgpu_kernel void @local_system_acq_rel_ret_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_system_acq_rel_ret_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_system_acq_rel_ret_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in acq_rel
@@ -1026,6 +1346,31 @@ define amdgpu_kernel void @local_system_seq_cst_ret_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_system_seq_cst_ret_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_system_seq_cst_ret_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in seq_cst
@@ -1088,6 +1433,26 @@ define amdgpu_kernel void @local_system_monotonic_monotonic_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_system_monotonic_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_system_monotonic_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -1156,6 +1521,28 @@ define amdgpu_kernel void @local_system_acquire_monotonic_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_system_acquire_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_system_acquire_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -1224,6 +1611,28 @@ define amdgpu_kernel void @local_system_release_monotonic_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_system_release_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_system_release_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -1298,6 +1707,30 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_system_acq_rel_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_system_acq_rel_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -1372,6 +1805,30 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_system_seq_cst_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_system_seq_cst_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -1440,6 +1897,28 @@ define amdgpu_kernel void @local_system_acquire_acquire_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_system_acquire_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_system_acquire_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -1514,6 +1993,30 @@ define amdgpu_kernel void @local_system_release_acquire_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_system_release_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_system_release_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -1588,6 +2091,30 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_system_acq_rel_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_system_acq_rel_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -1662,6 +2189,30 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_system_seq_cst_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_system_seq_cst_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -1736,6 +2287,30 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_system_seq_cst_seq_cst_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_system_seq_cst_seq_cst_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -1809,6 +2384,31 @@ define amdgpu_kernel void @local_system_acquire_monotonic_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_system_acquire_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_system_acquire_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -1890,6 +2490,33 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_system_acq_rel_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_system_acq_rel_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -1971,6 +2598,33 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_system_seq_cst_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_system_seq_cst_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -2046,6 +2700,31 @@ define amdgpu_kernel void @local_system_acquire_acquire_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_system_acquire_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_system_acquire_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -2127,6 +2806,33 @@ define amdgpu_kernel void @local_system_release_acquire_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_system_release_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_system_release_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -2208,6 +2914,33 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_system_acq_rel_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_system_acq_rel_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -2289,6 +3022,33 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_system_seq_cst_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_system_seq_cst_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -2370,6 +3130,33 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_system_seq_cst_seq_cst_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_system_seq_cst_seq_cst_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -2438,6 +3225,28 @@ define amdgpu_kernel void @local_system_one_as_unordered_load(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v1, v0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_unordered_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v0, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_system_one_as_unordered_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v0, v0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
 entry:
   %val = load atomic i32, i32 addrspace(3)* %in syncscope("one-as") unordered, align 4
@@ -2504,6 +3313,28 @@ define amdgpu_kernel void @local_system_one_as_monotonic_load(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v1, v0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_monotonic_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v0, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_system_one_as_monotonic_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v0, v0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
 entry:
   %val = load atomic i32, i32 addrspace(3)* %in syncscope("one-as") monotonic, align 4
@@ -2570,6 +3401,28 @@ define amdgpu_kernel void @local_system_one_as_acquire_load(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v1, v0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_acquire_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v0, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_system_one_as_acquire_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v0, v0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
 entry:
   %val = load atomic i32, i32 addrspace(3)* %in syncscope("one-as") acquire, align 4
@@ -2636,6 +3489,28 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_load(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v1, v0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_seq_cst_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v0, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_system_one_as_seq_cst_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v0, v0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
 entry:
   %val = load atomic i32, i32 addrspace(3)* %in syncscope("one-as") seq_cst, align 4
@@ -2692,6 +3567,24 @@ define amdgpu_kernel void @local_system_one_as_unordered_store(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_unordered_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_system_one_as_unordered_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 %in, i32 addrspace(3)* %out) {
 entry:
   store atomic i32 %in, i32 addrspace(3)* %out syncscope("one-as") unordered, align 4
@@ -2747,6 +3640,24 @@ define amdgpu_kernel void @local_system_one_as_monotonic_store(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_monotonic_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_system_one_as_monotonic_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 %in, i32 addrspace(3)* %out) {
 entry:
   store atomic i32 %in, i32 addrspace(3)* %out syncscope("one-as") monotonic, align 4
@@ -2802,6 +3713,24 @@ define amdgpu_kernel void @local_system_one_as_release_store(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_release_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_system_one_as_release_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 %in, i32 addrspace(3)* %out) {
 entry:
   store atomic i32 %in, i32 addrspace(3)* %out syncscope("one-as") release, align 4
@@ -2857,6 +3786,24 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_store(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_seq_cst_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_system_one_as_seq_cst_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 %in, i32 addrspace(3)* %out) {
 entry:
   store atomic i32 %in, i32 addrspace(3)* %out syncscope("one-as") seq_cst, align 4
@@ -2912,6 +3859,24 @@ define amdgpu_kernel void @local_system_one_as_monotonic_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_monotonic_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_system_one_as_monotonic_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("one-as") monotonic
@@ -2967,6 +3932,24 @@ define amdgpu_kernel void @local_system_one_as_acquire_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_acquire_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_system_one_as_acquire_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("one-as") acquire
@@ -3022,6 +4005,24 @@ define amdgpu_kernel void @local_system_one_as_release_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_release_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_system_one_as_release_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("one-as") release
@@ -3077,6 +4078,24 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_acq_rel_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_system_one_as_acq_rel_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("one-as") acq_rel
@@ -3132,6 +4151,24 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_seq_cst_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_system_one_as_seq_cst_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("one-as") seq_cst
@@ -3197,6 +4234,28 @@ define amdgpu_kernel void @local_system_one_as_acquire_ret_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_acquire_ret_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_system_one_as_acquire_ret_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("one-as") acquire
@@ -3263,6 +4322,28 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_ret_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_acq_rel_ret_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_system_one_as_acq_rel_ret_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("one-as") acq_rel
@@ -3329,6 +4410,28 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_ret_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_seq_cst_ret_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_system_one_as_seq_cst_ret_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("one-as") seq_cst
@@ -3391,6 +4494,26 @@ define amdgpu_kernel void @local_system_one_as_monotonic_monotonic_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_monotonic_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_system_one_as_monotonic_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -3453,6 +4576,26 @@ define amdgpu_kernel void @local_system_one_as_acquire_monotonic_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_acquire_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_system_one_as_acquire_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -3515,6 +4658,26 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_release_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_system_one_as_release_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -3577,6 +4740,26 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_acq_rel_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_system_one_as_acq_rel_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -3639,6 +4822,26 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_seq_cst_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_system_one_as_seq_cst_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -3701,6 +4904,26 @@ define amdgpu_kernel void @local_system_one_as_acquire_acquire_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_acquire_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_system_one_as_acquire_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -3763,6 +4986,26 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_release_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_system_one_as_release_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -3825,6 +5068,26 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_acq_rel_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_system_one_as_acq_rel_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -3887,6 +5150,26 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_seq_cst_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_system_one_as_seq_cst_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -3949,6 +5232,26 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_system_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -4021,6 +5324,30 @@ define amdgpu_kernel void @local_system_one_as_acquire_monotonic_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_system_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -4095,6 +5422,30 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_system_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -4169,6 +5520,30 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_system_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -4243,6 +5618,30 @@ define amdgpu_kernel void @local_system_one_as_acquire_acquire_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_acquire_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_system_one_as_acquire_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -4317,6 +5716,30 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_release_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_system_one_as_release_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -4391,6 +5814,30 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_system_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -4465,6 +5912,30 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_system_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -4539,6 +6010,30 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4

diff  --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-wavefront.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-wavefront.ll
index cb5265b8f085..649d384d41a4 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-wavefront.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-wavefront.ll
@@ -4,6 +4,8 @@
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-WGP %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-CU %s
 ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s
 
 define amdgpu_kernel void @local_wavefront_unordered_load(
 ; GFX6-LABEL: local_wavefront_unordered_load:
@@ -64,6 +66,30 @@ define amdgpu_kernel void @local_wavefront_unordered_load(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v1, v0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_unordered_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v0, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_wavefront_unordered_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v0, v0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
 entry:
   %val = load atomic i32, i32 addrspace(3)* %in syncscope("wavefront") unordered, align 4
@@ -130,6 +156,30 @@ define amdgpu_kernel void @local_wavefront_monotonic_load(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v1, v0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_monotonic_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v0, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_wavefront_monotonic_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v0, v0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
 entry:
   %val = load atomic i32, i32 addrspace(3)* %in syncscope("wavefront") monotonic, align 4
@@ -196,6 +246,30 @@ define amdgpu_kernel void @local_wavefront_acquire_load(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v1, v0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_acquire_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v0, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_wavefront_acquire_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v0, v0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
 entry:
   %val = load atomic i32, i32 addrspace(3)* %in syncscope("wavefront") acquire, align 4
@@ -262,6 +336,30 @@ define amdgpu_kernel void @local_wavefront_seq_cst_load(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v1, v0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_seq_cst_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v0, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_wavefront_seq_cst_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v0, v0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
 entry:
   %val = load atomic i32, i32 addrspace(3)* %in syncscope("wavefront") seq_cst, align 4
@@ -318,6 +416,26 @@ define amdgpu_kernel void @local_wavefront_unordered_store(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_unordered_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_wavefront_unordered_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 %in, i32 addrspace(3)* %out) {
 entry:
   store atomic i32 %in, i32 addrspace(3)* %out syncscope("wavefront") unordered, align 4
@@ -373,6 +491,26 @@ define amdgpu_kernel void @local_wavefront_monotonic_store(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_monotonic_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_wavefront_monotonic_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 %in, i32 addrspace(3)* %out) {
 entry:
   store atomic i32 %in, i32 addrspace(3)* %out syncscope("wavefront") monotonic, align 4
@@ -428,6 +566,26 @@ define amdgpu_kernel void @local_wavefront_release_store(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_release_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_wavefront_release_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 %in, i32 addrspace(3)* %out) {
 entry:
   store atomic i32 %in, i32 addrspace(3)* %out syncscope("wavefront") release, align 4
@@ -483,6 +641,26 @@ define amdgpu_kernel void @local_wavefront_seq_cst_store(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_seq_cst_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_wavefront_seq_cst_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 %in, i32 addrspace(3)* %out) {
 entry:
   store atomic i32 %in, i32 addrspace(3)* %out syncscope("wavefront") seq_cst, align 4
@@ -538,6 +716,26 @@ define amdgpu_kernel void @local_wavefront_monotonic_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_monotonic_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_wavefront_monotonic_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(3)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("wavefront") monotonic
@@ -593,6 +791,26 @@ define amdgpu_kernel void @local_wavefront_acquire_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_acquire_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_wavefront_acquire_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(3)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("wavefront") acquire
@@ -648,6 +866,26 @@ define amdgpu_kernel void @local_wavefront_release_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_release_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_wavefront_release_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(3)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("wavefront") release
@@ -703,6 +941,26 @@ define amdgpu_kernel void @local_wavefront_acq_rel_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_acq_rel_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_wavefront_acq_rel_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(3)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("wavefront") acq_rel
@@ -758,6 +1016,26 @@ define amdgpu_kernel void @local_wavefront_seq_cst_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_seq_cst_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_wavefront_seq_cst_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(3)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("wavefront") seq_cst
@@ -823,6 +1101,30 @@ define amdgpu_kernel void @local_wavefront_acquire_ret_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_acquire_ret_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_wavefront_acquire_ret_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(3)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("wavefront") acquire
@@ -889,6 +1191,30 @@ define amdgpu_kernel void @local_wavefront_acq_rel_ret_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_acq_rel_ret_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_wavefront_acq_rel_ret_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(3)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("wavefront") acq_rel
@@ -955,6 +1281,30 @@ define amdgpu_kernel void @local_wavefront_seq_cst_ret_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_seq_cst_ret_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_wavefront_seq_cst_ret_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(3)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("wavefront") seq_cst
@@ -1017,6 +1367,28 @@ define amdgpu_kernel void @local_wavefront_monotonic_monotonic_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_monotonic_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_wavefront_monotonic_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -1079,6 +1451,28 @@ define amdgpu_kernel void @local_wavefront_acquire_monotonic_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_acquire_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_wavefront_acquire_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -1141,6 +1535,28 @@ define amdgpu_kernel void @local_wavefront_release_monotonic_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_release_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_wavefront_release_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -1203,6 +1619,28 @@ define amdgpu_kernel void @local_wavefront_acq_rel_monotonic_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_acq_rel_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_wavefront_acq_rel_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -1265,6 +1703,28 @@ define amdgpu_kernel void @local_wavefront_seq_cst_monotonic_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_seq_cst_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_wavefront_seq_cst_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -1327,6 +1787,28 @@ define amdgpu_kernel void @local_wavefront_acquire_acquire_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_acquire_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_wavefront_acquire_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -1389,6 +1871,28 @@ define amdgpu_kernel void @local_wavefront_release_acquire_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_release_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_wavefront_release_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -1451,6 +1955,28 @@ define amdgpu_kernel void @local_wavefront_acq_rel_acquire_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_acq_rel_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_wavefront_acq_rel_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -1513,6 +2039,28 @@ define amdgpu_kernel void @local_wavefront_seq_cst_acquire_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_seq_cst_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_wavefront_seq_cst_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -1575,6 +2123,28 @@ define amdgpu_kernel void @local_wavefront_seq_cst_seq_cst_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_seq_cst_seq_cst_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_wavefront_seq_cst_seq_cst_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -1647,6 +2217,32 @@ define amdgpu_kernel void @local_wavefront_acquire_monotonic_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_acquire_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_wavefront_acquire_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -1721,6 +2317,32 @@ define amdgpu_kernel void @local_wavefront_acq_rel_monotonic_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_acq_rel_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_wavefront_acq_rel_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -1795,6 +2417,32 @@ define amdgpu_kernel void @local_wavefront_seq_cst_monotonic_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_seq_cst_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_wavefront_seq_cst_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -1869,6 +2517,32 @@ define amdgpu_kernel void @local_wavefront_acquire_acquire_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_acquire_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_wavefront_acquire_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -1943,6 +2617,32 @@ define amdgpu_kernel void @local_wavefront_release_acquire_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_release_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_wavefront_release_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -2017,6 +2717,32 @@ define amdgpu_kernel void @local_wavefront_acq_rel_acquire_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_acq_rel_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_wavefront_acq_rel_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -2091,6 +2817,32 @@ define amdgpu_kernel void @local_wavefront_seq_cst_acquire_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_seq_cst_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_wavefront_seq_cst_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -2165,6 +2917,32 @@ define amdgpu_kernel void @local_wavefront_seq_cst_seq_cst_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_seq_cst_seq_cst_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_wavefront_seq_cst_seq_cst_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -2233,6 +3011,30 @@ define amdgpu_kernel void @local_wavefront_one_as_unordered_load(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v1, v0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_unordered_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v0, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_unordered_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v0, v0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
 entry:
   %val = load atomic i32, i32 addrspace(3)* %in syncscope("wavefront-one-as") unordered, align 4
@@ -2299,6 +3101,30 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_load(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v1, v0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_monotonic_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v0, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_monotonic_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v0, v0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
 entry:
   %val = load atomic i32, i32 addrspace(3)* %in syncscope("wavefront-one-as") monotonic, align 4
@@ -2365,6 +3191,30 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_load(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v1, v0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_acquire_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v0, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_acquire_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v0, v0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
 entry:
   %val = load atomic i32, i32 addrspace(3)* %in syncscope("wavefront-one-as") acquire, align 4
@@ -2431,6 +3281,30 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_load(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v1, v0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_seq_cst_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v0, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_seq_cst_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v0, v0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
 entry:
   %val = load atomic i32, i32 addrspace(3)* %in syncscope("wavefront-one-as") seq_cst, align 4
@@ -2487,6 +3361,26 @@ define amdgpu_kernel void @local_wavefront_one_as_unordered_store(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_unordered_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_unordered_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 %in, i32 addrspace(3)* %out) {
 entry:
   store atomic i32 %in, i32 addrspace(3)* %out syncscope("wavefront-one-as") unordered, align 4
@@ -2542,6 +3436,26 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_store(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_monotonic_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_monotonic_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 %in, i32 addrspace(3)* %out) {
 entry:
   store atomic i32 %in, i32 addrspace(3)* %out syncscope("wavefront-one-as") monotonic, align 4
@@ -2597,6 +3511,26 @@ define amdgpu_kernel void @local_wavefront_one_as_release_store(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_release_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_release_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 %in, i32 addrspace(3)* %out) {
 entry:
   store atomic i32 %in, i32 addrspace(3)* %out syncscope("wavefront-one-as") release, align 4
@@ -2652,6 +3586,26 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_store(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_seq_cst_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_seq_cst_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 %in, i32 addrspace(3)* %out) {
 entry:
   store atomic i32 %in, i32 addrspace(3)* %out syncscope("wavefront-one-as") seq_cst, align 4
@@ -2707,6 +3661,26 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_monotonic_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_monotonic_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(3)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("wavefront-one-as") monotonic
@@ -2762,6 +3736,26 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_acquire_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_acquire_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(3)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("wavefront-one-as") acquire
@@ -2817,6 +3811,26 @@ define amdgpu_kernel void @local_wavefront_one_as_release_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_release_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_release_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(3)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("wavefront-one-as") release
@@ -2872,6 +3886,26 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_acq_rel_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_acq_rel_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(3)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("wavefront-one-as") acq_rel
@@ -2927,6 +3961,26 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_seq_cst_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_seq_cst_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(3)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("wavefront-one-as") seq_cst
@@ -2992,6 +4046,30 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_ret_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_acquire_ret_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_acquire_ret_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(3)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("wavefront-one-as") acquire
@@ -3058,6 +4136,30 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_ret_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_acq_rel_ret_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_acq_rel_ret_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(3)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("wavefront-one-as") acq_rel
@@ -3124,6 +4226,30 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_ret_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_seq_cst_ret_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_seq_cst_ret_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(3)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("wavefront-one-as") seq_cst
@@ -3186,6 +4312,28 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_monotonic_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_monotonic_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_monotonic_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -3248,6 +4396,28 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_monotonic_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_acquire_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_acquire_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -3310,6 +4480,28 @@ define amdgpu_kernel void @local_wavefront_one_as_release_monotonic_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_release_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_release_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -3372,6 +4564,28 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_monotonic_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_acq_rel_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_acq_rel_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -3434,6 +4648,28 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_monotonic_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_seq_cst_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_seq_cst_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -3496,6 +4732,28 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_acquire_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_acquire_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_acquire_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -3558,6 +4816,28 @@ define amdgpu_kernel void @local_wavefront_one_as_release_acquire_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_release_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_release_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -3620,6 +4900,28 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_acquire_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_acq_rel_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_acq_rel_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -3682,6 +4984,28 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_acquire_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_seq_cst_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_seq_cst_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -3744,6 +5068,28 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_seq_cst_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -3816,6 +5162,32 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_monotonic_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -3890,6 +5262,32 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -3964,6 +5362,32 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -4038,6 +5462,32 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_acquire_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_acquire_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_acquire_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -4112,6 +5562,32 @@ define amdgpu_kernel void @local_wavefront_one_as_release_acquire_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_release_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_release_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -4186,6 +5662,32 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -4260,6 +5762,32 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -4334,6 +5862,32 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4

diff  --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll
index 6cd5fd5d9444..3af0c1e9ac52 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll
@@ -4,6 +4,8 @@
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-WGP %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-CU %s
 ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s
 
 define amdgpu_kernel void @local_workgroup_unordered_load(
 ; GFX6-LABEL: local_workgroup_unordered_load:
@@ -64,6 +66,28 @@ define amdgpu_kernel void @local_workgroup_unordered_load(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v1, v0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_unordered_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v0, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_workgroup_unordered_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v0, v0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
 entry:
   %val = load atomic i32, i32 addrspace(3)* %in syncscope("workgroup") unordered, align 4
@@ -130,6 +154,28 @@ define amdgpu_kernel void @local_workgroup_monotonic_load(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v1, v0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_monotonic_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v0, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_workgroup_monotonic_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v0, v0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
 entry:
   %val = load atomic i32, i32 addrspace(3)* %in syncscope("workgroup") monotonic, align 4
@@ -197,6 +243,29 @@ define amdgpu_kernel void @local_workgroup_acquire_load(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v1, v0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_acquire_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v0, v0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_workgroup_acquire_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v0, v0
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
 entry:
   %val = load atomic i32, i32 addrspace(3)* %in syncscope("workgroup") acquire, align 4
@@ -270,6 +339,31 @@ define amdgpu_kernel void @local_workgroup_seq_cst_load(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v1, v0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_seq_cst_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v0, v0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_workgroup_seq_cst_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v0, v0
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
 entry:
   %val = load atomic i32, i32 addrspace(3)* %in syncscope("workgroup") seq_cst, align 4
@@ -326,6 +420,24 @@ define amdgpu_kernel void @local_workgroup_unordered_store(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_unordered_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_workgroup_unordered_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 %in, i32 addrspace(3)* %out) {
 entry:
   store atomic i32 %in, i32 addrspace(3)* %out syncscope("workgroup") unordered, align 4
@@ -381,6 +493,24 @@ define amdgpu_kernel void @local_workgroup_monotonic_store(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_monotonic_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_workgroup_monotonic_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 %in, i32 addrspace(3)* %out) {
 entry:
   store atomic i32 %in, i32 addrspace(3)* %out syncscope("workgroup") monotonic, align 4
@@ -442,6 +572,26 @@ define amdgpu_kernel void @local_workgroup_release_store(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_release_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_workgroup_release_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 %in, i32 addrspace(3)* %out) {
 entry:
   store atomic i32 %in, i32 addrspace(3)* %out syncscope("workgroup") release, align 4
@@ -503,6 +653,26 @@ define amdgpu_kernel void @local_workgroup_seq_cst_store(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_seq_cst_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_workgroup_seq_cst_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 %in, i32 addrspace(3)* %out) {
 entry:
   store atomic i32 %in, i32 addrspace(3)* %out syncscope("workgroup") seq_cst, align 4
@@ -558,6 +728,24 @@ define amdgpu_kernel void @local_workgroup_monotonic_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_monotonic_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_workgroup_monotonic_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup") monotonic
@@ -619,6 +807,26 @@ define amdgpu_kernel void @local_workgroup_acquire_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_acquire_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_workgroup_acquire_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup") acquire
@@ -680,6 +888,26 @@ define amdgpu_kernel void @local_workgroup_release_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_release_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_workgroup_release_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup") release
@@ -747,6 +975,28 @@ define amdgpu_kernel void @local_workgroup_acq_rel_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_acq_rel_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_workgroup_acq_rel_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup") acq_rel
@@ -814,6 +1064,28 @@ define amdgpu_kernel void @local_workgroup_seq_cst_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_seq_cst_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_workgroup_seq_cst_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup") seq_cst
@@ -880,6 +1152,29 @@ define amdgpu_kernel void @local_workgroup_acquire_ret_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_acquire_ret_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_workgroup_acquire_ret_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup") acquire
@@ -953,6 +1248,31 @@ define amdgpu_kernel void @local_workgroup_acq_rel_ret_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_acq_rel_ret_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_workgroup_acq_rel_ret_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup") acq_rel
@@ -1026,6 +1346,31 @@ define amdgpu_kernel void @local_workgroup_seq_cst_ret_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_seq_cst_ret_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_workgroup_seq_cst_ret_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup") seq_cst
@@ -1088,6 +1433,26 @@ define amdgpu_kernel void @local_workgroup_monotonic_monotonic_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_monotonic_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_workgroup_monotonic_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -1156,6 +1521,28 @@ define amdgpu_kernel void @local_workgroup_acquire_monotonic_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_acquire_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_workgroup_acquire_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -1224,6 +1611,28 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_release_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_workgroup_release_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -1298,6 +1707,30 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_acq_rel_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_workgroup_acq_rel_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -1372,6 +1805,30 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_seq_cst_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_workgroup_seq_cst_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -1440,6 +1897,28 @@ define amdgpu_kernel void @local_workgroup_acquire_acquire_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_acquire_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_workgroup_acquire_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -1514,6 +1993,30 @@ define amdgpu_kernel void @local_workgroup_release_acquire_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_release_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_workgroup_release_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -1588,6 +2091,30 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_acq_rel_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_workgroup_acq_rel_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -1662,6 +2189,30 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_seq_cst_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_workgroup_seq_cst_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -1736,6 +2287,30 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_seq_cst_seq_cst_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_workgroup_seq_cst_seq_cst_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -1809,6 +2384,31 @@ define amdgpu_kernel void @local_workgroup_acquire_monotonic_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_acquire_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_workgroup_acquire_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -1890,6 +2490,33 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_acq_rel_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_workgroup_acq_rel_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -1971,6 +2598,33 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_seq_cst_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_workgroup_seq_cst_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -2046,6 +2700,31 @@ define amdgpu_kernel void @local_workgroup_acquire_acquire_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_acquire_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_workgroup_acquire_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -2127,6 +2806,33 @@ define amdgpu_kernel void @local_workgroup_release_acquire_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_release_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_workgroup_release_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -2208,6 +2914,33 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_acq_rel_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_workgroup_acq_rel_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -2289,6 +3022,33 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_seq_cst_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_workgroup_seq_cst_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -2370,6 +3130,33 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_seq_cst_seq_cst_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_workgroup_seq_cst_seq_cst_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -2438,6 +3225,28 @@ define amdgpu_kernel void @local_workgroup_one_as_unordered_load(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v1, v0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_unordered_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v0, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_unordered_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v0, v0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
 entry:
   %val = load atomic i32, i32 addrspace(3)* %in syncscope("workgroup-one-as") unordered, align 4
@@ -2504,6 +3313,28 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_load(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v1, v0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_monotonic_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v0, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_monotonic_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v0, v0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
 entry:
   %val = load atomic i32, i32 addrspace(3)* %in syncscope("workgroup-one-as") monotonic, align 4
@@ -2570,6 +3401,28 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_load(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v1, v0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acquire_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v0, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acquire_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v0, v0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
 entry:
   %val = load atomic i32, i32 addrspace(3)* %in syncscope("workgroup-one-as") acquire, align 4
@@ -2636,6 +3489,28 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_load(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v1, v0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v0, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v0, v0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
 entry:
   %val = load atomic i32, i32 addrspace(3)* %in syncscope("workgroup-one-as") seq_cst, align 4
@@ -2692,6 +3567,24 @@ define amdgpu_kernel void @local_workgroup_one_as_unordered_store(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_unordered_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_unordered_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 %in, i32 addrspace(3)* %out) {
 entry:
   store atomic i32 %in, i32 addrspace(3)* %out syncscope("workgroup-one-as") unordered, align 4
@@ -2747,6 +3640,24 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_store(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_monotonic_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_monotonic_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 %in, i32 addrspace(3)* %out) {
 entry:
   store atomic i32 %in, i32 addrspace(3)* %out syncscope("workgroup-one-as") monotonic, align 4
@@ -2802,6 +3713,24 @@ define amdgpu_kernel void @local_workgroup_one_as_release_store(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_release_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_release_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 %in, i32 addrspace(3)* %out) {
 entry:
   store atomic i32 %in, i32 addrspace(3)* %out syncscope("workgroup-one-as") release, align 4
@@ -2857,6 +3786,24 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_store(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 %in, i32 addrspace(3)* %out) {
 entry:
   store atomic i32 %in, i32 addrspace(3)* %out syncscope("workgroup-one-as") seq_cst, align 4
@@ -2912,6 +3859,24 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_monotonic_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_monotonic_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup-one-as") monotonic
@@ -2967,6 +3932,24 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acquire_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acquire_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup-one-as") acquire
@@ -3022,6 +4005,24 @@ define amdgpu_kernel void @local_workgroup_one_as_release_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_release_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_release_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup-one-as") release
@@ -3077,6 +4078,24 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acq_rel_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acq_rel_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup-one-as") acq_rel
@@ -3132,6 +4151,24 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup-one-as") seq_cst
@@ -3197,6 +4234,28 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_ret_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acquire_ret_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acquire_ret_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup-one-as") acquire
@@ -3263,6 +4322,28 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_ret_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acq_rel_ret_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acq_rel_ret_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup-one-as") acq_rel
@@ -3329,6 +4410,28 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_ret_atomicrmw(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_ret_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_ret_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup-one-as") seq_cst
@@ -3391,6 +4494,26 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_monotonic_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_monotonic_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_monotonic_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -3453,6 +4576,26 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_monotonic_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acquire_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acquire_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -3515,6 +4658,26 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_release_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_release_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -3577,6 +4740,26 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acq_rel_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acq_rel_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -3639,6 +4822,26 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -3701,6 +4904,26 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_acquire_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acquire_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acquire_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -3763,6 +4986,26 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_release_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_release_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -3825,6 +5068,26 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acq_rel_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acq_rel_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -3887,6 +5150,26 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -3949,6 +5232,26 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -4021,6 +5324,30 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_monotonic_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -4095,6 +5422,30 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -4169,6 +5520,30 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -4243,6 +5618,30 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_acquire_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acquire_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acquire_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -4317,6 +5716,30 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_release_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_release_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -4391,6 +5814,30 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -4465,6 +5912,30 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -4539,6 +6010,30 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
     i32 addrspace(3)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4

diff  --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local.mir b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local.mir
index 973a15e039af..34f5306c0838 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local.mir
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local.mir
@@ -20,7 +20,7 @@ body:             |
     renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("singlethread-one-as") unordered 4 from `i32 addrspace(3)* undef`)
     $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
     $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
-    FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
+    FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
     S_ENDPGM 0
 
 ...
@@ -44,7 +44,7 @@ body:             |
     renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("singlethread-one-as") monotonic 4 from `i32 addrspace(3)* undef`)
     $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
     $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
-    FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
+    FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
     S_ENDPGM 0
 
 ...
@@ -68,7 +68,7 @@ body:             |
     renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("singlethread-one-as") acquire 4 from `i32 addrspace(3)* undef`)
     $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
     $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
-    FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
+    FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
     S_ENDPGM 0
 
 ...
@@ -92,7 +92,7 @@ body:             |
     renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("singlethread-one-as") seq_cst 4 from `i32 addrspace(3)* undef`)
     $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
     $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
-    FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
+    FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
     S_ENDPGM 0
 
 ...
@@ -116,7 +116,7 @@ body:             |
     renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("wavefront-one-as") unordered 4 from `i32 addrspace(3)* undef`)
     $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
     $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
-    FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
+    FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
     S_ENDPGM 0
 
 ...
@@ -140,7 +140,7 @@ body:             |
     renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("wavefront-one-as") monotonic 4 from `i32 addrspace(3)* undef`)
     $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
     $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
-    FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
+    FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
     S_ENDPGM 0
 
 ...
@@ -164,7 +164,7 @@ body:             |
     renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("wavefront-one-as") acquire 4 from `i32 addrspace(3)* undef`)
     $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
     $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
-    FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
+    FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
     S_ENDPGM 0
 
 ...
@@ -188,7 +188,7 @@ body:             |
     renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("wavefront-one-as") seq_cst 4 from `i32 addrspace(3)* undef`)
     $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
     $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
-    FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
+    FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
     S_ENDPGM 0
 
 ...
@@ -212,7 +212,7 @@ body:             |
     renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("workgroup-one-as") unordered 4 from `i32 addrspace(3)* undef`)
     $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
     $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
-    FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
+    FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
     S_ENDPGM 0
 
 ...
@@ -236,7 +236,7 @@ body:             |
     renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("workgroup-one-as") monotonic 4 from `i32 addrspace(3)* undef`)
     $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
     $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
-    FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
+    FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
     S_ENDPGM 0
 
 ...
@@ -260,7 +260,7 @@ body:             |
     renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("workgroup-one-as") acquire 4 from `i32 addrspace(3)* undef`)
     $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
     $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
-    FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
+    FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
     S_ENDPGM 0
 
 ...
@@ -284,7 +284,7 @@ body:             |
     renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("workgroup-one-as") seq_cst 4 from `i32 addrspace(3)* undef`)
     $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
     $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
-    FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
+    FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
     S_ENDPGM 0
 
 ...
@@ -308,7 +308,7 @@ body:             |
     renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("agent-one-as") unordered 4 from `i32 addrspace(3)* undef`)
     $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
     $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
-    FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
+    FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
     S_ENDPGM 0
 
 ...
@@ -332,7 +332,7 @@ body:             |
     renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("agent-one-as") monotonic 4 from `i32 addrspace(3)* undef`)
     $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
     $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
-    FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
+    FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
     S_ENDPGM 0
 
 ...
@@ -356,7 +356,7 @@ body:             |
     renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("agent-one-as") acquire 4 from `i32 addrspace(3)* undef`)
     $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
     $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
-    FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
+    FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
     S_ENDPGM 0
 
 ...
@@ -380,7 +380,7 @@ body:             |
     renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("agent-one-as") seq_cst 4 from `i32 addrspace(3)* undef`)
     $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
     $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
-    FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
+    FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
     S_ENDPGM 0
 
 ...
@@ -404,7 +404,7 @@ body:             |
     renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("one-as") unordered 4 from `i32 addrspace(3)* undef`)
     $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
     $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
-    FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
+    FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
     S_ENDPGM 0
 
 ...
@@ -428,7 +428,7 @@ body:             |
     renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("one-as") monotonic 4 from `i32 addrspace(3)* undef`)
     $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
     $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
-    FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
+    FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
     S_ENDPGM 0
 
 ...
@@ -452,7 +452,7 @@ body:             |
     renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("one-as") acquire 4 from `i32 addrspace(3)* undef`)
     $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
     $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
-    FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
+    FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
     S_ENDPGM 0
 
 ...
@@ -476,7 +476,7 @@ body:             |
     renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("one-as") seq_cst 4 from `i32 addrspace(3)* undef`)
     $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
     $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
-    FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
+    FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
     S_ENDPGM 0
 
 ...

diff  --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-atomics.mir b/llvm/test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-atomics.mir
index a9545e664158..0aec90f86b9d 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-atomics.mir
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-atomics.mir
@@ -23,13 +23,13 @@ body:             |
     $sgpr10 = S_MOV_B32 4294967295, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11
     $sgpr11 = S_MOV_B32 15204352, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11
     $vgpr0 = V_MOV_B32_e32 1, implicit $exec
-    BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into `i32 addrspace(5)* undef`)
+    BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 4, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into `i32 addrspace(5)* undef`)
     S_WAITCNT 127
     S_CMP_LG_U32 killed $sgpr2, 0, implicit-def $scc
     S_WAITCNT 3855
     $vgpr0 = V_MOV_B32_e32 2, implicit $exec
     $vgpr1 = V_MOV_B32_e32 32772, implicit $exec
-    BUFFER_STORE_DWORD_OFFEN killed $vgpr0, killed $vgpr1, $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into `i32 addrspace(5)* undef`)
+    BUFFER_STORE_DWORD_OFFEN killed $vgpr0, killed $vgpr1, $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into `i32 addrspace(5)* undef`)
     S_CBRANCH_SCC0 %bb.1, implicit killed $scc
 
   bb.2:
@@ -55,11 +55,11 @@ body:             |
     S_WAITCNT 127
     $sgpr0 = S_LSHL_B32 killed $sgpr0, 2, implicit-def dead $scc
     $vgpr0 = V_ADD_CO_U32_e32 killed $sgpr0, killed $vgpr0, implicit-def dead $vcc, implicit $exec
-    $vgpr0 = BUFFER_LOAD_DWORD_OFFEN killed $vgpr0, killed $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 0, 0, 0, 0, 0, 0, implicit $exec :: (load syncscope("agent-one-as") unordered 4 from `i32 addrspace(1)* undef`), (load syncscope("workgroup-one-as") seq_cst 4 from `[8192 x i32] addrspace(5)* undef`)
+    $vgpr0 = BUFFER_LOAD_DWORD_OFFEN killed $vgpr0, killed $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load syncscope("agent-one-as") unordered 4 from `i32 addrspace(1)* undef`), (load syncscope("workgroup-one-as") seq_cst 4 from `[8192 x i32] addrspace(5)* undef`)
     $vgpr1 = V_MOV_B32_e32 $sgpr4, implicit $exec, implicit-def $vgpr1_vgpr2, implicit $sgpr4_sgpr5
     $vgpr2 = V_MOV_B32_e32 killed $sgpr5, implicit $exec, implicit $sgpr4_sgpr5, implicit $exec
     S_WAITCNT 3952
-    FLAT_STORE_DWORD killed $vgpr1_vgpr2, killed $vgpr0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32 addrspace(1)* undef`)
+    FLAT_STORE_DWORD killed $vgpr1_vgpr2, killed $vgpr0, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32 addrspace(1)* undef`)
     S_ENDPGM 0
 
 ...

diff  --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-nontemporal-1.mir b/llvm/test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-nontemporal-1.mir
index 6a037a77784e..f228fed9b2b2 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-nontemporal-1.mir
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-nontemporal-1.mir
@@ -117,13 +117,13 @@ body:             |
     $sgpr10 = S_MOV_B32 4294967295, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11
     $sgpr11 = S_MOV_B32 15204352, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11
     $vgpr0 = V_MOV_B32_e32 1, implicit $exec
-    BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.scratchptr01)
+    BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 4, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.scratchptr01)
     S_WAITCNT 127
     S_CMP_LG_U32 killed $sgpr2, 0, implicit-def $scc
     S_WAITCNT 3855
     $vgpr0 = V_MOV_B32_e32 2, implicit $exec
     $vgpr1 = V_MOV_B32_e32 32772, implicit $exec
-    BUFFER_STORE_DWORD_OFFEN killed $vgpr0, killed $vgpr1, $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.scratchptr12)
+    BUFFER_STORE_DWORD_OFFEN killed $vgpr0, killed $vgpr1, $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.scratchptr12)
     S_CBRANCH_SCC0 %bb.1.if, implicit killed $scc
 
   bb.2.else:
@@ -149,11 +149,11 @@ body:             |
     S_WAITCNT 127
     $sgpr0 = S_LSHL_B32 killed $sgpr0, 2, implicit-def dead $scc
     $vgpr0 = V_ADD_CO_U32_e32 killed $sgpr0, killed $vgpr0, implicit-def dead $vcc, implicit $exec
-    $vgpr0 = BUFFER_LOAD_DWORD_OFFEN killed $vgpr0, killed $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 0, 0, 0, 0, 0, 0, implicit $exec :: (non-temporal load 4 from %ir.else_ptr), (non-temporal load 4 from %ir.if_ptr)
+    $vgpr0 = BUFFER_LOAD_DWORD_OFFEN killed $vgpr0, killed $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (non-temporal load 4 from %ir.else_ptr), (non-temporal load 4 from %ir.if_ptr)
     $vgpr1 = V_MOV_B32_e32 $sgpr4, implicit $exec, implicit-def $vgpr1_vgpr2, implicit $sgpr4_sgpr5
     $vgpr2 = V_MOV_B32_e32 killed $sgpr5, implicit $exec, implicit $sgpr4_sgpr5, implicit $exec
     S_WAITCNT 3952
-    FLAT_STORE_DWORD killed $vgpr1_vgpr2, killed $vgpr0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into %ir.out)
+    FLAT_STORE_DWORD killed $vgpr1_vgpr2, killed $vgpr0, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into %ir.out)
     S_ENDPGM 0
 
 ...

diff  --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-nontemporal-2.mir b/llvm/test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-nontemporal-2.mir
index 0dfa137999f3..2cd298a86914 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-nontemporal-2.mir
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-nontemporal-2.mir
@@ -42,7 +42,7 @@
 # CHECK-LABEL: name: multiple_mem_operands
 
 # CHECK-LABEL: bb.3.done:
-# CHECK: BUFFER_LOAD_DWORD_OFFEN killed $vgpr0, killed $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 0, 0, 0, 0, 0
+# CHECK: BUFFER_LOAD_DWORD_OFFEN killed $vgpr0, killed $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 0, 0, 0, 0, 0, 0
 
 name:            multiple_mem_operands
 alignment:       1
@@ -97,13 +97,13 @@ body:             |
     $sgpr10 = S_MOV_B32 4294967295, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11
     $sgpr11 = S_MOV_B32 15204352, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11
     $vgpr0 = V_MOV_B32_e32 1, implicit $exec
-    BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.scratchptr01)
+    BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 4, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.scratchptr01)
     S_WAITCNT 127
     S_CMP_LG_U32 killed $sgpr2, 0, implicit-def $scc
     S_WAITCNT 3855
     $vgpr0 = V_MOV_B32_e32 2, implicit $exec
     $vgpr1 = V_MOV_B32_e32 32772, implicit $exec
-    BUFFER_STORE_DWORD_OFFEN killed $vgpr0, killed $vgpr1, $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.scratchptr12)
+    BUFFER_STORE_DWORD_OFFEN killed $vgpr0, killed $vgpr1, $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.scratchptr12)
     S_CBRANCH_SCC0 %bb.1.if, implicit killed $scc
 
   bb.2.else:
@@ -129,11 +129,11 @@ body:             |
     S_WAITCNT 127
     $sgpr0 = S_LSHL_B32 killed $sgpr0, 2, implicit-def dead $scc
     $vgpr0 = V_ADD_CO_U32_e32 killed $sgpr0, killed $vgpr0, implicit-def dead $vcc, implicit $exec
-    $vgpr0 = BUFFER_LOAD_DWORD_OFFEN killed $vgpr0, killed $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %ir.else_ptr), (non-temporal load 4 from %ir.if_ptr)
+    $vgpr0 = BUFFER_LOAD_DWORD_OFFEN killed $vgpr0, killed $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %ir.else_ptr), (non-temporal load 4 from %ir.if_ptr)
     $vgpr1 = V_MOV_B32_e32 $sgpr4, implicit $exec, implicit-def $vgpr1_vgpr2, implicit $sgpr4_sgpr5
     $vgpr2 = V_MOV_B32_e32 killed $sgpr5, implicit $exec, implicit $sgpr4_sgpr5, implicit $exec
     S_WAITCNT 3952
-    FLAT_STORE_DWORD killed $vgpr1_vgpr2, killed $vgpr0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into %ir.out)
+    FLAT_STORE_DWORD killed $vgpr1_vgpr2, killed $vgpr0, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into %ir.out)
     S_ENDPGM 0
 
 ...

diff  --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll
index 03fb7d3eba4b..a1a6b64391b0 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll
@@ -4,6 +4,8 @@
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-WGP %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-CU %s
 ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s
 
 define amdgpu_kernel void @private_nontemporal_load_0(
 ; GFX6-LABEL: private_nontemporal_load_0:
@@ -91,6 +93,40 @@ define amdgpu_kernel void @private_nontemporal_load_0(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_nontemporal_load_0:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[10:11], s[2:3]
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s8, s8, s7
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 s9, s9, 0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_load_dword v1, v1, s[8:11], 0 offen glc slc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_nontemporal_load_0:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[10:11], s[2:3]
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 s8, s8, s7
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 s9, s9, 0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    buffer_load_dword v1, v1, s[8:11], 0 offen glc slc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(5)* %in, i32 addrspace(1)* %out) {
 entry:
   %val = load i32, i32 addrspace(5)* %in, align 4, !nontemporal !0
@@ -187,6 +223,40 @@ define amdgpu_kernel void @private_nontemporal_load_1(
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_nontemporal_load_1:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[10:11], s[2:3]
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s8, s8, s7
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 s9, s9, 0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_lshl_add_u32 v0, v0, 2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_load_dword v0, v0, s[8:11], 0 offen glc slc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_nontemporal_load_1:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[10:11], s[2:3]
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 s8, s8, s7
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 s9, s9, 0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_lshl_add_u32 v0, v0, 2, s2
+; GFX90A-TGSPLIT-NEXT:    buffer_load_dword v0, v0, s[8:11], 0 offen glc slc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(5)* %in, i32 addrspace(1)* %out) {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -280,6 +350,40 @@ define amdgpu_kernel void @private_nontemporal_store_0(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, v1, s[4:7], 0 offen glc slc
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_nontemporal_store_0:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[10:11], s[2:3]
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s8, s8, s7
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 s9, s9, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_store_dword v0, v1, s[8:11], 0 offen glc slc
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_nontemporal_store_0:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[10:11], s[2:3]
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 s8, s8, s7
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 s9, s9, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT:    buffer_store_dword v0, v1, s[8:11], 0 offen glc slc
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(1)* %in, i32 addrspace(5)* %out) {
 entry:
   %val = load i32, i32 addrspace(1)* %in, align 4
@@ -374,6 +478,40 @@ define amdgpu_kernel void @private_nontemporal_store_1(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v1, v0, s[4:7], 0 offen glc slc
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_nontemporal_store_1:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[10:11], s[2:3]
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s8, s8, s7
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 s9, s9, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    v_lshl_add_u32 v0, v0, 2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_store_dword v1, v0, s[8:11], 0 offen glc slc
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_nontemporal_store_1:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[10:11], s[2:3]
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 s8, s8, s7
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 s9, s9, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX90A-TGSPLIT-NEXT:    v_lshl_add_u32 v0, v0, 2, s2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-TGSPLIT-NEXT:    buffer_store_dword v1, v0, s[8:11], 0 offen glc slc
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+;
     i32 addrspace(1)* %in, i32 addrspace(5)* %out) {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()

diff  --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-region.mir b/llvm/test/CodeGen/AMDGPU/memory-legalizer-region.mir
index 3452d4148cb9..a2d530b2fbed 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-region.mir
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-region.mir
@@ -20,7 +20,7 @@ body:             |
     renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 1, 0, implicit $m0, implicit $exec :: (volatile load syncscope("singlethread-one-as") unordered 4 from `i32 addrspace(2)* undef`)
     $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
     $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
-    FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
+    FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
     S_ENDPGM 0
 
 ...
@@ -44,7 +44,7 @@ body:             |
     renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("singlethread-one-as") monotonic 4 from `i32 addrspace(2)* undef`)
     $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
     $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
-    FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
+    FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
     S_ENDPGM 0
 
 ...
@@ -68,7 +68,7 @@ body:             |
     renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("singlethread-one-as") acquire 4 from `i32 addrspace(2)* undef`)
     $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
     $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
-    FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
+    FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
     S_ENDPGM 0
 
 ...
@@ -92,7 +92,7 @@ body:             |
     renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("singlethread-one-as") seq_cst 4 from `i32 addrspace(2)* undef`)
     $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
     $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
-    FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
+    FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
     S_ENDPGM 0
 
 ...
@@ -116,7 +116,7 @@ body:             |
     renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("wavefront-one-as") unordered 4 from `i32 addrspace(2)* undef`)
     $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
     $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
-    FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
+    FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
     S_ENDPGM 0
 
 ...
@@ -140,7 +140,7 @@ body:             |
     renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("wavefront-one-as") monotonic 4 from `i32 addrspace(2)* undef`)
     $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
     $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
-    FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
+    FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
     S_ENDPGM 0
 
 ...
@@ -164,7 +164,7 @@ body:             |
     renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("wavefront-one-as") acquire 4 from `i32 addrspace(2)* undef`)
     $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
     $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
-    FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
+    FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
     S_ENDPGM 0
 
 ...
@@ -188,7 +188,7 @@ body:             |
     renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("wavefront-one-as") seq_cst 4 from `i32 addrspace(2)* undef`)
     $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
     $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
-    FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
+    FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
     S_ENDPGM 0
 
 ...
@@ -212,7 +212,7 @@ body:             |
     renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("workgroup-one-as") unordered 4 from `i32 addrspace(2)* undef`)
     $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
     $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
-    FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
+    FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
     S_ENDPGM 0
 
 ...
@@ -236,7 +236,7 @@ body:             |
     renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("workgroup-one-as") monotonic 4 from `i32 addrspace(2)* undef`)
     $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
     $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
-    FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
+    FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
     S_ENDPGM 0
 
 ...
@@ -260,7 +260,7 @@ body:             |
     renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("workgroup-one-as") acquire 4 from `i32 addrspace(2)* undef`)
     $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
     $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
-    FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
+    FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
     S_ENDPGM 0
 
 ...
@@ -284,7 +284,7 @@ body:             |
     renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("workgroup-one-as") seq_cst 4 from `i32 addrspace(2)* undef`)
     $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
     $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
-    FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
+    FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
     S_ENDPGM 0
 
 ...
@@ -308,7 +308,7 @@ body:             |
     renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("agent-one-as") unordered 4 from `i32 addrspace(2)* undef`)
     $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
     $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
-    FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
+    FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
     S_ENDPGM 0
 
 ...
@@ -332,7 +332,7 @@ body:             |
     renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("agent-one-as") monotonic 4 from `i32 addrspace(2)* undef`)
     $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
     $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
-    FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
+    FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
     S_ENDPGM 0
 
 ...
@@ -356,7 +356,7 @@ body:             |
     renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("agent-one-as") acquire 4 from `i32 addrspace(2)* undef`)
     $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
     $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
-    FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
+    FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
     S_ENDPGM 0
 
 ...
@@ -380,7 +380,7 @@ body:             |
     renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("agent-one-as") seq_cst 4 from `i32 addrspace(2)* undef`)
     $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
     $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
-    FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
+    FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
     S_ENDPGM 0
 
 ...
@@ -404,7 +404,7 @@ body:             |
     renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("one-as") unordered 4 from `i32 addrspace(2)* undef`)
     $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
     $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
-    FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
+    FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
     S_ENDPGM 0
 
 ...
@@ -428,7 +428,7 @@ body:             |
     renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("one-as") monotonic 4 from `i32 addrspace(2)* undef`)
     $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
     $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
-    FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
+    FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
     S_ENDPGM 0
 
 ...
@@ -452,7 +452,7 @@ body:             |
     renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("one-as") acquire 4 from `i32 addrspace(2)* undef`)
     $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
     $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
-    FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
+    FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
     S_ENDPGM 0
 
 ...
@@ -476,7 +476,7 @@ body:             |
     renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("one-as") seq_cst 4 from `i32 addrspace(2)* undef`)
     $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
     $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
-    FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
+    FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
     S_ENDPGM 0
 
 ...

diff  --git a/llvm/test/CodeGen/AMDGPU/memory_clause.mir b/llvm/test/CodeGen/AMDGPU/memory_clause.mir
index 683b55ed84a2..5597a166c143 100644
--- a/llvm/test/CodeGen/AMDGPU/memory_clause.mir
+++ b/llvm/test/CodeGen/AMDGPU/memory_clause.mir
@@ -2,12 +2,12 @@
 
 # GCN-LABEL: {{^}}name: vector_clause{{$}}
 # GCN:         early-clobber %2:vreg_128, early-clobber %4:vreg_128, early-clobber %1:vreg_128, early-clobber %3:vreg_128 = BUNDLE %0, implicit $exec {
-# GCN-NEXT:    %1:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 0, 0, 0, 0, implicit $exec
-# GCN-NEXT:    %2:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 16, 0, 0, 0, implicit $exec
-# GCN-NEXT:    %3:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 32, 0, 0, 0, implicit $exec
-# GCN-NEXT:    %4:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 48, 0, 0, 0, implicit $exec
+# GCN-NEXT:    %1:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 0, 0, 0, 0, 0, implicit $exec
+# GCN-NEXT:    %2:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 16, 0, 0, 0, 0, implicit $exec
+# GCN-NEXT:    %3:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 32, 0, 0, 0, 0, implicit $exec
+# GCN-NEXT:    %4:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 48, 0, 0, 0, 0, implicit $exec
 # GCN-NEXT:  }
-# GCN-NEXT:  GLOBAL_STORE_DWORDX4 %0, %1, 0, 0, 0, 0, implicit $exec
+# GCN-NEXT:  GLOBAL_STORE_DWORDX4 %0, %1, 0, 0, 0, 0, 0, implicit $exec
 
 ---
 name:            vector_clause
@@ -21,24 +21,24 @@ registers:
 body:             |
   bb.0:
     %0 = IMPLICIT_DEF
-    %1:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 0, 0, 0, 0, implicit $exec
-    %2:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 16, 0, 0, 0, implicit $exec
-    %3:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 32, 0, 0, 0, implicit $exec
-    %4:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 48, 0, 0, 0, implicit $exec
-    GLOBAL_STORE_DWORDX4 %0, %1, 0, 0, 0, 0, implicit $exec
-    GLOBAL_STORE_DWORDX4 %0, %2, 16, 0, 0, 0, implicit $exec
-    GLOBAL_STORE_DWORDX4 %0, %3, 32, 0, 0, 0, implicit $exec
-    GLOBAL_STORE_DWORDX4 %0, %4, 48, 0, 0, 0, implicit $exec
+    %1:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 0, 0, 0, 0, 0, implicit $exec
+    %2:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 16, 0, 0, 0, 0, implicit $exec
+    %3:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 32, 0, 0, 0, 0, implicit $exec
+    %4:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 48, 0, 0, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORDX4 %0, %1, 0, 0, 0, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORDX4 %0, %2, 16, 0, 0, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORDX4 %0, %3, 32, 0, 0, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORDX4 %0, %4, 48, 0, 0, 0, 0, implicit $exec
 ...
 
 # GCN-LABEL: {{^}}name: subreg_full{{$}}
 # GCN:      early-clobber %1:vreg_128 = BUNDLE %0, implicit $exec {
-# GCN-NEXT:   undef %1.sub0:vreg_128 = GLOBAL_LOAD_DWORD %0.sub0_sub1, 0, 0, 0, 0, implicit $exec
-# GCN-NEXT:   internal %1.sub1:vreg_128 = GLOBAL_LOAD_DWORD %0.sub1_sub2, 16, 0, 0, 0, implicit $exec
-# GCN-NEXT:   internal %1.sub2:vreg_128 = GLOBAL_LOAD_DWORD %0.sub2_sub3, 32, 0, 0, 0, implicit $exec
-# GCN-NEXT:   internal %1.sub3:vreg_128 = GLOBAL_LOAD_DWORD %0.sub2_sub3, 32, 0, 0, 0, implicit $exec
+# GCN-NEXT:   undef %1.sub0:vreg_128 = GLOBAL_LOAD_DWORD %0.sub0_sub1, 0, 0, 0, 0, 0, implicit $exec
+# GCN-NEXT:   internal %1.sub1:vreg_128 = GLOBAL_LOAD_DWORD %0.sub1_sub2, 16, 0, 0, 0, 0, implicit $exec
+# GCN-NEXT:   internal %1.sub2:vreg_128 = GLOBAL_LOAD_DWORD %0.sub2_sub3, 32, 0, 0, 0, 0, implicit $exec
+# GCN-NEXT:   internal %1.sub3:vreg_128 = GLOBAL_LOAD_DWORD %0.sub2_sub3, 32, 0, 0, 0, 0, implicit $exec
 # GCN-NEXT: }
-# GCN-NEXT: GLOBAL_STORE_DWORDX4 %0.sub0_sub1, %1, 0, 0, 0, 0, implicit $exec
+# GCN-NEXT: GLOBAL_STORE_DWORDX4 %0.sub0_sub1, %1, 0, 0, 0, 0, 0, implicit $exec
 
 ---
 name:            subreg_full
@@ -49,20 +49,20 @@ registers:
 body:             |
   bb.0:
     %0 = IMPLICIT_DEF
-    undef %1.sub0:vreg_128 = GLOBAL_LOAD_DWORD %0.sub0_sub1, 0, 0, 0, 0, implicit $exec
-    %1.sub1:vreg_128 = GLOBAL_LOAD_DWORD %0.sub1_sub2, 16, 0, 0, 0, implicit $exec
-    %1.sub2:vreg_128 = GLOBAL_LOAD_DWORD %0.sub2_sub3, 32, 0, 0, 0, implicit $exec
-    %1.sub3:vreg_128 = GLOBAL_LOAD_DWORD %0.sub2_sub3, 32, 0, 0, 0, implicit $exec
-    GLOBAL_STORE_DWORDX4 %0.sub0_sub1, %1, 0, 0, 0, 0, implicit $exec
+    undef %1.sub0:vreg_128 = GLOBAL_LOAD_DWORD %0.sub0_sub1, 0, 0, 0, 0, 0, implicit $exec
+    %1.sub1:vreg_128 = GLOBAL_LOAD_DWORD %0.sub1_sub2, 16, 0, 0, 0, 0, implicit $exec
+    %1.sub2:vreg_128 = GLOBAL_LOAD_DWORD %0.sub2_sub3, 32, 0, 0, 0, 0, implicit $exec
+    %1.sub3:vreg_128 = GLOBAL_LOAD_DWORD %0.sub2_sub3, 32, 0, 0, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORDX4 %0.sub0_sub1, %1, 0, 0, 0, 0, 0, implicit $exec
 ...
 
 # GCN-LABEL: {{^}}name: subreg_part{{$}}
 # GCN:      undef early-clobber %1.sub0_sub1:vreg_128, undef early-clobber %1.sub3:vreg_128 = BUNDLE %0, implicit $exec {
-# GCN-NEXT:   undef %1.sub0:vreg_128 = GLOBAL_LOAD_DWORD %0.sub0_sub1, 0, 0, 0, 0, implicit $exec
-# GCN-NEXT:   internal %1.sub1:vreg_128 = GLOBAL_LOAD_DWORD %0.sub1_sub2, 16, 0, 0, 0, implicit $exec
-# GCN-NEXT:   internal %1.sub3:vreg_128 = GLOBAL_LOAD_DWORD %0.sub2_sub3, 32, 0, 0, 0, implicit $exec
+# GCN-NEXT:   undef %1.sub0:vreg_128 = GLOBAL_LOAD_DWORD %0.sub0_sub1, 0, 0, 0, 0, 0, implicit $exec
+# GCN-NEXT:   internal %1.sub1:vreg_128 = GLOBAL_LOAD_DWORD %0.sub1_sub2, 16, 0, 0, 0, 0, implicit $exec
+# GCN-NEXT:   internal %1.sub3:vreg_128 = GLOBAL_LOAD_DWORD %0.sub2_sub3, 32, 0, 0, 0, 0, implicit $exec
 # GCN-NEXT: }
-# GCN-NEXT: GLOBAL_STORE_DWORDX4 %0.sub0_sub1, %1, 0, 0, 0, 0, implicit $exec
+# GCN-NEXT: GLOBAL_STORE_DWORDX4 %0.sub0_sub1, %1, 0, 0, 0, 0, 0, implicit $exec
 
 ---
 name:            subreg_part
@@ -73,18 +73,18 @@ registers:
 body:             |
   bb.0:
     %0 = IMPLICIT_DEF
-    undef %1.sub0:vreg_128 = GLOBAL_LOAD_DWORD %0.sub0_sub1, 0, 0, 0, 0, implicit $exec
-    %1.sub1:vreg_128 = GLOBAL_LOAD_DWORD %0.sub1_sub2, 16, 0, 0, 0, implicit $exec
-    %1.sub3:vreg_128 = GLOBAL_LOAD_DWORD %0.sub2_sub3, 32, 0, 0, 0, implicit $exec
-    GLOBAL_STORE_DWORDX4 %0.sub0_sub1, %1, 0, 0, 0, 0, implicit $exec
+    undef %1.sub0:vreg_128 = GLOBAL_LOAD_DWORD %0.sub0_sub1, 0, 0, 0, 0, 0, implicit $exec
+    %1.sub1:vreg_128 = GLOBAL_LOAD_DWORD %0.sub1_sub2, 16, 0, 0, 0, 0, implicit $exec
+    %1.sub3:vreg_128 = GLOBAL_LOAD_DWORD %0.sub2_sub3, 32, 0, 0, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORDX4 %0.sub0_sub1, %1, 0, 0, 0, 0, 0, implicit $exec
 ...
 
 # GCN-LABEL: {{^}}name: dead{{$}}
 # GCN:      dead early-clobber %2:vreg_128, dead early-clobber %4:vreg_128, dead early-clobber %1:vreg_128, dead early-clobber %3:vreg_128 = BUNDLE %0, implicit $exec {
-# GCN-NEXT:   dead %1:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 0, 0, 0, 0, implicit $exec
-# GCN-NEXT:   %2:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 16, 0, 0, 0, implicit $exec
-# GCN-NEXT:   dead %3:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 32, 0, 0, 0, implicit $exec
-# GCN-NEXT:   dead %4:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 48, 0, 0, 0, implicit $exec
+# GCN-NEXT:   dead %1:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 0, 0, 0, 0, 0, implicit $exec
+# GCN-NEXT:   %2:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 16, 0, 0, 0, 0, implicit $exec
+# GCN-NEXT:   dead %3:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 32, 0, 0, 0, 0, implicit $exec
+# GCN-NEXT:   dead %4:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 48, 0, 0, 0, 0, implicit $exec
 # GCN-NEXT: }
 
 ---
@@ -99,18 +99,18 @@ registers:
 body:             |
   bb.0:
     %0 = IMPLICIT_DEF
-    dead %1:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 0, 0, 0, 0, implicit $exec
-    dead %2:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 16, 0, 0, 0, implicit $exec
-    dead %3:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 32, 0, 0, 0, implicit $exec
-    dead %4:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 48, 0, 0, 0, implicit $exec
+    dead %1:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 0, 0, 0, 0, 0, implicit $exec
+    dead %2:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 16, 0, 0, 0, 0, implicit $exec
+    dead %3:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 32, 0, 0, 0, 0, implicit $exec
+    dead %4:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 48, 0, 0, 0, 0, implicit $exec
 ...
 
 # GCN-LABEL: {{^}}name: subreg_dead{{$}}
 # GCN:      early-clobber %1:vreg_64 = BUNDLE %0, implicit $exec {
-# GCN-NEXT:    %1.sub0:vreg_64 = GLOBAL_LOAD_DWORD %0, 16, 0, 0, 0, implicit $exec
-# GCN-NEXT:    dead %1.sub1:vreg_64 = GLOBAL_LOAD_DWORD %0, 32, 0, 0, 0, implicit $exec
+# GCN-NEXT:    %1.sub0:vreg_64 = GLOBAL_LOAD_DWORD %0, 16, 0, 0, 0, 0, implicit $exec
+# GCN-NEXT:    dead %1.sub1:vreg_64 = GLOBAL_LOAD_DWORD %0, 32, 0, 0, 0, 0, implicit $exec
 # GCN-NEXT: }
-# GCN-NEXT: GLOBAL_STORE_DWORD %0, %1.sub0, 0, 0, 0, 0, implicit $exec
+# GCN-NEXT: GLOBAL_STORE_DWORD %0, %1.sub0, 0, 0, 0, 0, 0, implicit $exec
 
 ---
 name:            subreg_dead
@@ -121,15 +121,15 @@ registers:
 body:             |
   bb.0:
     %0 = IMPLICIT_DEF
-    undef %1.sub0:vreg_64 = GLOBAL_LOAD_DWORD %0, 16, 0, 0, 0, implicit $exec
-    dead %1.sub1:vreg_64 = GLOBAL_LOAD_DWORD %0, 32, 0, 0, 0, implicit $exec
-    GLOBAL_STORE_DWORD %0, %1.sub0, 0, 0, 0, 0, implicit $exec
+    undef %1.sub0:vreg_64 = GLOBAL_LOAD_DWORD %0, 16, 0, 0, 0, 0, implicit $exec
+    dead %1.sub1:vreg_64 = GLOBAL_LOAD_DWORD %0, 32, 0, 0, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD %0, %1.sub0, 0, 0, 0, 0, 0, implicit $exec
 ...
 
 # GCN-LABEL: {{^}}name: kill{{$}}
 # GCN:      early-clobber %2:vreg_128, early-clobber %3:vreg_128 = BUNDLE %0, %1, implicit $exec {
-# GCN-NEXT:   %2:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 0, 0, 0, 0, implicit $exec
-# GCN-NEXT:   %3:vreg_128 = GLOBAL_LOAD_DWORDX4 %1, 16, 0, 0, 0, implicit $exec
+# GCN-NEXT:   %2:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 0, 0, 0, 0, 0, implicit $exec
+# GCN-NEXT:   %3:vreg_128 = GLOBAL_LOAD_DWORDX4 %1, 16, 0, 0, 0, 0, implicit $exec
 # GCN-NEXT: }
 
 ---
@@ -144,17 +144,17 @@ body:             |
   bb.0:
     %0 = IMPLICIT_DEF
     %1 = IMPLICIT_DEF
-    %2:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 0, 0, 0, 0, implicit $exec
-    %3:vreg_128 = GLOBAL_LOAD_DWORDX4 killed %1, 16, 0, 0, 0, implicit $exec
-    GLOBAL_STORE_DWORDX4 %0, %2, 0, 0, 0, 0, implicit $exec
-    GLOBAL_STORE_DWORDX4 %0, %3, 16, 0, 0, 0, implicit $exec
+    %2:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 0, 0, 0, 0, 0, implicit $exec
+    %3:vreg_128 = GLOBAL_LOAD_DWORDX4 killed %1, 16, 0, 0, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORDX4 %0, %2, 0, 0, 0, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORDX4 %0, %3, 16, 0, 0, 0, 0, implicit $exec
 ...
 
 # GCN-LABEL: {{^}}name: indirect{{$}}
-# GCN:      %1:vreg_64 = GLOBAL_LOAD_DWORDX2 %0, 0, 0, 0, 0, implicit $exec
+# GCN:      %1:vreg_64 = GLOBAL_LOAD_DWORDX2 %0, 0, 0, 0, 0, 0, implicit $exec
 # GCN-NEXT:   early-clobber %2:vreg_128, early-clobber %3:vreg_128 = BUNDLE %1, implicit $exec {
-# GCN-NEXT:   %2:vreg_128 = GLOBAL_LOAD_DWORDX4 %1, 0, 0, 0, 0, implicit $exec
-# GCN-NEXT:   %3:vreg_128 = GLOBAL_LOAD_DWORDX4 %1, 16, 0, 0, 0, implicit $exec
+# GCN-NEXT:   %2:vreg_128 = GLOBAL_LOAD_DWORDX4 %1, 0, 0, 0, 0, 0, implicit $exec
+# GCN-NEXT:   %3:vreg_128 = GLOBAL_LOAD_DWORDX4 %1, 16, 0, 0, 0, 0, implicit $exec
 # GCN-NEXT: }
 
 ---
@@ -168,18 +168,18 @@ registers:
 body:             |
   bb.0:
     %0 = IMPLICIT_DEF
-    %1:vreg_64 = GLOBAL_LOAD_DWORDX2 %0, 0, 0, 0, 0, implicit $exec
-    %2:vreg_128 = GLOBAL_LOAD_DWORDX4 %1, 0, 0, 0, 0, implicit $exec
-    %3:vreg_128 = GLOBAL_LOAD_DWORDX4 %1, 16, 0, 0, 0, implicit $exec
-    GLOBAL_STORE_DWORDX4 %0, %2, 0, 0, 0, 0, implicit $exec
-    GLOBAL_STORE_DWORDX4 %0, %3, 16, 0, 0, 0, implicit $exec
+    %1:vreg_64 = GLOBAL_LOAD_DWORDX2 %0, 0, 0, 0, 0, 0, implicit $exec
+    %2:vreg_128 = GLOBAL_LOAD_DWORDX4 %1, 0, 0, 0, 0, 0, implicit $exec
+    %3:vreg_128 = GLOBAL_LOAD_DWORDX4 %1, 16, 0, 0, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORDX4 %0, %2, 0, 0, 0, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORDX4 %0, %3, 16, 0, 0, 0, 0, implicit $exec
 ...
 
 # GCN-LABEL: {{^}}name: stack{{$}}
 # GCN:      %0:vreg_64 = IMPLICIT_DEF
-# GCN-NEXT: %1:vreg_128 = GLOBAL_LOAD_DWORDX4 %stack.0, 0, 0, 0, 0, implicit $exec
-# GCN-NEXT: %2:vreg_128 = GLOBAL_LOAD_DWORDX4 %stack.0, 16, 0, 0, 0, implicit $exec
-# GCN-NEXT: GLOBAL_STORE_DWORDX4 %0, %1, 0, 0, 0, 0, implicit $exec
+# GCN-NEXT: %1:vreg_128 = GLOBAL_LOAD_DWORDX4 %stack.0, 0, 0, 0, 0, 0, implicit $exec
+# GCN-NEXT: %2:vreg_128 = GLOBAL_LOAD_DWORDX4 %stack.0, 16, 0, 0, 0, 0, implicit $exec
+# GCN-NEXT: GLOBAL_STORE_DWORDX4 %0, %1, 0, 0, 0, 0, 0, implicit $exec
 
 ---
 name:            stack
@@ -193,33 +193,33 @@ stack:
 body:             |
   bb.0:
     %0 = IMPLICIT_DEF
-    %1:vreg_128 = GLOBAL_LOAD_DWORDX4 %stack.0, 0, 0, 0, 0, implicit $exec
-    %2:vreg_128 = GLOBAL_LOAD_DWORDX4 %stack.0, 16, 0, 0, 0, implicit $exec
-    GLOBAL_STORE_DWORDX4 %0, %1, 0, 0, 0, 0, implicit $exec
-    GLOBAL_STORE_DWORDX4 %0, %2, 16, 0, 0, 0, implicit $exec
+    %1:vreg_128 = GLOBAL_LOAD_DWORDX4 %stack.0, 0, 0, 0, 0, 0, implicit $exec
+    %2:vreg_128 = GLOBAL_LOAD_DWORDX4 %stack.0, 16, 0, 0, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORDX4 %0, %1, 0, 0, 0, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORDX4 %0, %2, 16, 0, 0, 0, 0, implicit $exec
 ...
 
 # GCN-LABEL: {{^}}name: overflow_counter{{$}}
 # GCN:      dead early-clobber %7:vgpr_32, dead early-clobber %14:vgpr_32, dead early-clobber %2:vgpr_32, dead early-clobber %9:vgpr_32, dead early-clobber %4:vgpr_32, dead early-clobber %11:vgpr_32, dead early-clobber %6:vgpr_32, dead early-clobber %13:vgpr_32, dead early-clobber %1:vgpr_32, dead early-clobber %8:vgpr_32, dead early-clobber %15:vgpr_32, dead early-clobber %3:vgpr_32, dead early-clobber %10:vgpr_32, dead early-clobber %5:vgpr_32, dead early-clobber %12:vgpr_32 = BUNDLE %0, implicit $exec {
-# GCN-NEXT:   dead %1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 0, 0, 0, implicit $exec
-# GCN-NEXT:   dead %2:vgpr_32 = GLOBAL_LOAD_DWORD %0, 4, 0, 0, 0, implicit $exec
-# GCN-NEXT:   dead %3:vgpr_32 = GLOBAL_LOAD_DWORD %0, 8, 0, 0, 0, implicit $exec
-# GCN-NEXT:   dead %4:vgpr_32 = GLOBAL_LOAD_DWORD %0, 12, 0, 0, 0, implicit $exec
-# GCN-NEXT:   dead %5:vgpr_32 = GLOBAL_LOAD_DWORD %0, 16, 0, 0, 0, implicit $exec
-# GCN-NEXT:   dead %6:vgpr_32 = GLOBAL_LOAD_DWORD %0, 20, 0, 0, 0, implicit $exec
-# GCN-NEXT:   dead %7:vgpr_32 = GLOBAL_LOAD_DWORD %0, 24, 0, 0, 0, implicit $exec
-# GCN-NEXT:   dead %8:vgpr_32 = GLOBAL_LOAD_DWORD %0, 28, 0, 0, 0, implicit $exec
-# GCN-NEXT:   dead %9:vgpr_32 = GLOBAL_LOAD_DWORD %0, 32, 0, 0, 0, implicit $exec
-# GCN-NEXT:   dead %10:vgpr_32 = GLOBAL_LOAD_DWORD %0, 36, 0, 0, 0, implicit $exec
-# GCN-NEXT:   dead %11:vgpr_32 = GLOBAL_LOAD_DWORD %0, 40, 0, 0, 0, implicit $exec
-# GCN-NEXT:   dead %12:vgpr_32 = GLOBAL_LOAD_DWORD %0, 44, 0, 0, 0, implicit $exec
-# GCN-NEXT:   dead %13:vgpr_32 = GLOBAL_LOAD_DWORD %0, 48, 0, 0, 0, implicit $exec
-# GCN-NEXT:   dead %14:vgpr_32 = GLOBAL_LOAD_DWORD %0, 52, 0, 0, 0, implicit $exec
-# GCN-NEXT:   dead %15:vgpr_32 = GLOBAL_LOAD_DWORD %0, 56, 0, 0, 0, implicit $exec
+# GCN-NEXT:   dead %1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 0, 0, 0, 0, implicit $exec
+# GCN-NEXT:   dead %2:vgpr_32 = GLOBAL_LOAD_DWORD %0, 4, 0, 0, 0, 0, implicit $exec
+# GCN-NEXT:   dead %3:vgpr_32 = GLOBAL_LOAD_DWORD %0, 8, 0, 0, 0, 0, implicit $exec
+# GCN-NEXT:   dead %4:vgpr_32 = GLOBAL_LOAD_DWORD %0, 12, 0, 0, 0, 0, implicit $exec
+# GCN-NEXT:   dead %5:vgpr_32 = GLOBAL_LOAD_DWORD %0, 16, 0, 0, 0, 0, implicit $exec
+# GCN-NEXT:   dead %6:vgpr_32 = GLOBAL_LOAD_DWORD %0, 20, 0, 0, 0, 0, implicit $exec
+# GCN-NEXT:   dead %7:vgpr_32 = GLOBAL_LOAD_DWORD %0, 24, 0, 0, 0, 0, implicit $exec
+# GCN-NEXT:   dead %8:vgpr_32 = GLOBAL_LOAD_DWORD %0, 28, 0, 0, 0, 0, implicit $exec
+# GCN-NEXT:   dead %9:vgpr_32 = GLOBAL_LOAD_DWORD %0, 32, 0, 0, 0, 0, implicit $exec
+# GCN-NEXT:   dead %10:vgpr_32 = GLOBAL_LOAD_DWORD %0, 36, 0, 0, 0, 0, implicit $exec
+# GCN-NEXT:   dead %11:vgpr_32 = GLOBAL_LOAD_DWORD %0, 40, 0, 0, 0, 0, implicit $exec
+# GCN-NEXT:   dead %12:vgpr_32 = GLOBAL_LOAD_DWORD %0, 44, 0, 0, 0, 0, implicit $exec
+# GCN-NEXT:   dead %13:vgpr_32 = GLOBAL_LOAD_DWORD %0, 48, 0, 0, 0, 0, implicit $exec
+# GCN-NEXT:   dead %14:vgpr_32 = GLOBAL_LOAD_DWORD %0, 52, 0, 0, 0, 0, implicit $exec
+# GCN-NEXT:   dead %15:vgpr_32 = GLOBAL_LOAD_DWORD %0, 56, 0, 0, 0, 0, implicit $exec
 # GCN-NEXT: }
 # GCN-NEXT: dead early-clobber %16:vgpr_32, dead early-clobber %17:vgpr_32 = BUNDLE %0, implicit $exec {
-# GCN-NEXT:   dead %16:vgpr_32 = GLOBAL_LOAD_DWORD %0, 60, 0, 0, 0, implicit $exec
-# GCN-NEXT:   dead %17:vgpr_32 = GLOBAL_LOAD_DWORD %0, 64, 0, 0, 0, implicit $exec
+# GCN-NEXT:   dead %16:vgpr_32 = GLOBAL_LOAD_DWORD %0, 60, 0, 0, 0, 0, implicit $exec
+# GCN-NEXT:   dead %17:vgpr_32 = GLOBAL_LOAD_DWORD %0, 64, 0, 0, 0, 0, implicit $exec
 # GCN-NEXT: }
 
 ---
@@ -247,36 +247,36 @@ registers:
 body:             |
   bb.0:
     %0 = IMPLICIT_DEF
-    %1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 0, 0, 0, implicit $exec
-    %2:vgpr_32 = GLOBAL_LOAD_DWORD %0, 4, 0, 0, 0, implicit $exec
-    %3:vgpr_32 = GLOBAL_LOAD_DWORD %0, 8, 0, 0, 0, implicit $exec
-    %4:vgpr_32 = GLOBAL_LOAD_DWORD %0, 12, 0, 0, 0, implicit $exec
-    %5:vgpr_32 = GLOBAL_LOAD_DWORD %0, 16, 0, 0, 0, implicit $exec
-    %6:vgpr_32 = GLOBAL_LOAD_DWORD %0, 20, 0, 0, 0, implicit $exec
-    %7:vgpr_32 = GLOBAL_LOAD_DWORD %0, 24, 0, 0, 0, implicit $exec
-    %8:vgpr_32 = GLOBAL_LOAD_DWORD %0, 28, 0, 0, 0, implicit $exec
-    %9:vgpr_32 = GLOBAL_LOAD_DWORD %0, 32, 0, 0, 0, implicit $exec
-    %10:vgpr_32 = GLOBAL_LOAD_DWORD %0, 36, 0, 0, 0, implicit $exec
-    %11:vgpr_32 = GLOBAL_LOAD_DWORD %0, 40, 0, 0, 0, implicit $exec
-    %12:vgpr_32 = GLOBAL_LOAD_DWORD %0, 44, 0, 0, 0, implicit $exec
-    %13:vgpr_32 = GLOBAL_LOAD_DWORD %0, 48, 0, 0, 0, implicit $exec
-    %14:vgpr_32 = GLOBAL_LOAD_DWORD %0, 52, 0, 0, 0, implicit $exec
-    %15:vgpr_32 = GLOBAL_LOAD_DWORD %0, 56, 0, 0, 0, implicit $exec
-    %16:vgpr_32 = GLOBAL_LOAD_DWORD %0, 60, 0, 0, 0, implicit $exec
-    %17:vgpr_32 = GLOBAL_LOAD_DWORD %0, 64, 0, 0, 0, implicit $exec
+    %1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 0, 0, 0, 0, implicit $exec
+    %2:vgpr_32 = GLOBAL_LOAD_DWORD %0, 4, 0, 0, 0, 0, implicit $exec
+    %3:vgpr_32 = GLOBAL_LOAD_DWORD %0, 8, 0, 0, 0, 0, implicit $exec
+    %4:vgpr_32 = GLOBAL_LOAD_DWORD %0, 12, 0, 0, 0, 0, implicit $exec
+    %5:vgpr_32 = GLOBAL_LOAD_DWORD %0, 16, 0, 0, 0, 0, implicit $exec
+    %6:vgpr_32 = GLOBAL_LOAD_DWORD %0, 20, 0, 0, 0, 0, implicit $exec
+    %7:vgpr_32 = GLOBAL_LOAD_DWORD %0, 24, 0, 0, 0, 0, implicit $exec
+    %8:vgpr_32 = GLOBAL_LOAD_DWORD %0, 28, 0, 0, 0, 0, implicit $exec
+    %9:vgpr_32 = GLOBAL_LOAD_DWORD %0, 32, 0, 0, 0, 0, implicit $exec
+    %10:vgpr_32 = GLOBAL_LOAD_DWORD %0, 36, 0, 0, 0, 0, implicit $exec
+    %11:vgpr_32 = GLOBAL_LOAD_DWORD %0, 40, 0, 0, 0, 0, implicit $exec
+    %12:vgpr_32 = GLOBAL_LOAD_DWORD %0, 44, 0, 0, 0, 0, implicit $exec
+    %13:vgpr_32 = GLOBAL_LOAD_DWORD %0, 48, 0, 0, 0, 0, implicit $exec
+    %14:vgpr_32 = GLOBAL_LOAD_DWORD %0, 52, 0, 0, 0, 0, implicit $exec
+    %15:vgpr_32 = GLOBAL_LOAD_DWORD %0, 56, 0, 0, 0, 0, implicit $exec
+    %16:vgpr_32 = GLOBAL_LOAD_DWORD %0, 60, 0, 0, 0, 0, implicit $exec
+    %17:vgpr_32 = GLOBAL_LOAD_DWORD %0, 64, 0, 0, 0, 0, implicit $exec
 ...
 
 # GCN-LABEL: {{^}}name: reg_pressure{{$}}
 # GCN:      dead early-clobber %2:vreg_128, dead early-clobber %4:vreg_128, dead early-clobber %1:vreg_128, dead early-clobber %3:vreg_128, dead early-clobber %5:vreg_128 = BUNDLE %0, implicit $exec {
-# GCN-NEXT:   dead %1:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 0, 0, 0, 0, implicit $exec
-# GCN-NEXT:   dead %2:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 16, 0, 0, 0, implicit $exec
-# GCN-NEXT:   dead %3:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 32, 0, 0, 0, implicit $exec
-# GCN-NEXT:   dead %4:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 48, 0, 0, 0, implicit $exec
-# GCN-NEXT:   dead %5:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 64, 0, 0, 0, implicit $exec
+# GCN-NEXT:   dead %1:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 0, 0, 0, 0, 0, implicit $exec
+# GCN-NEXT:   dead %2:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 16, 0, 0, 0, 0, implicit $exec
+# GCN-NEXT:   dead %3:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 32, 0, 0, 0, 0, implicit $exec
+# GCN-NEXT:   dead %4:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 48, 0, 0, 0, 0, implicit $exec
+# GCN-NEXT:   dead %5:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 64, 0, 0, 0, 0, implicit $exec
 # GCN-NEXT: }
 # GCN-NEXT: dead early-clobber %7:vreg_128, dead early-clobber %6:vreg_128 = BUNDLE %0, implicit $exec {
-# GCN-NEXT:   dead %6:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 80, 0, 0, 0, implicit $exec
-# GCN-NEXT:   dead %7:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 96, 0, 0, 0, implicit $exec
+# GCN-NEXT:   dead %6:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 80, 0, 0, 0, 0, implicit $exec
+# GCN-NEXT:   dead %7:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 96, 0, 0, 0, 0, implicit $exec
 # GCN-NEXT: }
 
 ---
@@ -294,22 +294,22 @@ registers:
 body:             |
   bb.0:
     %0 = IMPLICIT_DEF
-    %1:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 0, 0, 0, 0, implicit $exec
-    %2:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 16, 0, 0, 0, implicit $exec
-    %3:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 32, 0, 0, 0, implicit $exec
-    %4:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 48, 0, 0, 0, implicit $exec
-    %5:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 64, 0, 0, 0, implicit $exec
-    %6:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 80, 0, 0, 0, implicit $exec
-    %7:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 96, 0, 0, 0, implicit $exec
+    %1:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 0, 0, 0, 0, 0, implicit $exec
+    %2:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 16, 0, 0, 0, 0, implicit $exec
+    %3:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 32, 0, 0, 0, 0, implicit $exec
+    %4:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 48, 0, 0, 0, 0, implicit $exec
+    %5:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 64, 0, 0, 0, 0, implicit $exec
+    %6:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 80, 0, 0, 0, 0, implicit $exec
+    %7:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 96, 0, 0, 0, 0, implicit $exec
 ...
 
 # GCN-LABEL: {{^}}name: image_clause{{$}}
 # GCN:      early-clobber %4:vreg_128, early-clobber %3:vreg_128, early-clobber %5:vreg_128 = BUNDLE %0, undef %2:sgpr_128, %1, implicit $exec {
-# GCN-NEXT:   %3:vreg_128 = IMAGE_SAMPLE_LZ_V4_V2 %0, %1, undef %2:sgpr_128, 15, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
-# GCN-NEXT:   %4:vreg_128 = IMAGE_SAMPLE_LZ_V4_V2 %0, %1, undef %2:sgpr_128, 15, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
-# GCN-NEXT:   %5:vreg_128 = IMAGE_SAMPLE_LZ_V4_V2 %0, %1, undef %2:sgpr_128, 15, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+# GCN-NEXT:   %3:vreg_128 = IMAGE_SAMPLE_LZ_V4_V2 %0, %1, undef %2:sgpr_128, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+# GCN-NEXT:   %4:vreg_128 = IMAGE_SAMPLE_LZ_V4_V2 %0, %1, undef %2:sgpr_128, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+# GCN-NEXT:   %5:vreg_128 = IMAGE_SAMPLE_LZ_V4_V2 %0, %1, undef %2:sgpr_128, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
 # GCN-NEXT: }
-# GCN-NEXT: IMAGE_STORE_V4_V2 %3, %0, %1, 15, -1, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+# GCN-NEXT: IMAGE_STORE_V4_V2 %3, %0, %1, 15, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
 
 ---
 name:            image_clause
@@ -325,19 +325,19 @@ body:             |
   bb.0:
     %0 = IMPLICIT_DEF
     %1 = IMPLICIT_DEF
-    %3:vreg_128 = IMAGE_SAMPLE_LZ_V4_V2 %0, %1, undef %2:sgpr_128, 15, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 16)
-    %4:vreg_128 = IMAGE_SAMPLE_LZ_V4_V2 %0, %1, undef %2:sgpr_128, 15, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 16)
-    %5:vreg_128 = IMAGE_SAMPLE_LZ_V4_V2 %0, %1, undef %2:sgpr_128, 15, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 16)
-    IMAGE_STORE_V4_V2 %3, %0, %1, 15, -1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16)
-    IMAGE_STORE_V4_V2 %4, %0, %1, 15, -1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16)
-    IMAGE_STORE_V4_V2 %5, %0, %1, 15, -1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16)
+    %3:vreg_128 = IMAGE_SAMPLE_LZ_V4_V2 %0, %1, undef %2:sgpr_128, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 16)
+    %4:vreg_128 = IMAGE_SAMPLE_LZ_V4_V2 %0, %1, undef %2:sgpr_128, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 16)
+    %5:vreg_128 = IMAGE_SAMPLE_LZ_V4_V2 %0, %1, undef %2:sgpr_128, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 16)
+    IMAGE_STORE_V4_V2 %3, %0, %1, 15, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16)
+    IMAGE_STORE_V4_V2 %4, %0, %1, 15, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16)
+    IMAGE_STORE_V4_V2 %5, %0, %1, 15, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16)
 ...
 
 # GCN-LABEL: {{^}}name: mixed_clause{{$}}
 # GCN:      dead early-clobber %4:vreg_128, dead early-clobber %3:vreg_128, dead early-clobber %5:vgpr_32 = BUNDLE %0, %2, %1, implicit $exec {
-# GCN-NEXT:   dead %3:vreg_128 = IMAGE_SAMPLE_LZ_V4_V2 %0, %1, %2, 15, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
-# GCN-NEXT:   dead %4:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 0, 0, 0, 0, implicit $exec
-# GCN-NEXT:   dead %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+# GCN-NEXT:   dead %3:vreg_128 = IMAGE_SAMPLE_LZ_V4_V2 %0, %1, %2, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+# GCN-NEXT:   dead %4:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 0, 0, 0, 0, 0, implicit $exec
+# GCN-NEXT:   dead %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
 # GCN-NEXT: }
 
 ---
@@ -355,17 +355,17 @@ body:             |
     %0 = IMPLICIT_DEF
     %1 = IMPLICIT_DEF
     %2 = IMPLICIT_DEF
-    %3:vreg_128 = IMAGE_SAMPLE_LZ_V4_V2 %0, %1, %2, 15, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 16)
-    %4:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 0, 0, 0, 0, implicit $exec
-    %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    %3:vreg_128 = IMAGE_SAMPLE_LZ_V4_V2 %0, %1, %2, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 16)
+    %4:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 0, 0, 0, 0, 0, implicit $exec
+    %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
 ...
 
 # GCN-LABEL: {{^}}name: atomic{{$}}
 # GCN:      %1:vgpr_32 = IMPLICIT_DEF
-# GCN-NEXT: dead %2:vgpr_32 = FLAT_ATOMIC_ADD_RTN %0, %1, 0, -1, 0, implicit $exec, implicit $flat_scr
-# GCN-NEXT: dead %3:vgpr_32 = FLAT_ATOMIC_ADD_RTN %0, %1, 0, -1, 0, implicit $exec, implicit $flat_scr
-# GCN-NEXT: FLAT_ATOMIC_ADD %0, %1, 0, 0, implicit $exec, implicit $flat_scr
-# GCN-NEXT: FLAT_ATOMIC_ADD %0, %1, 0, 0, implicit $exec, implicit $flat_scr
+# GCN-NEXT: dead %2:vgpr_32 = FLAT_ATOMIC_ADD_RTN %0, %1, 0, -1, 0, 0, implicit $exec, implicit $flat_scr
+# GCN-NEXT: dead %3:vgpr_32 = FLAT_ATOMIC_ADD_RTN %0, %1, 0, -1, 0, 0, implicit $exec, implicit $flat_scr
+# GCN-NEXT: FLAT_ATOMIC_ADD %0, %1, 0, 0, 0, implicit $exec, implicit $flat_scr
+# GCN-NEXT: FLAT_ATOMIC_ADD %0, %1, 0, 0, 0, implicit $exec, implicit $flat_scr
 # GCN-NEXT: S_ENDPGM 0
 
 ---
@@ -380,9 +380,31 @@ body:             |
   bb.0:
     %0 = IMPLICIT_DEF
     %1 = IMPLICIT_DEF
-    %2:vgpr_32 = FLAT_ATOMIC_ADD_RTN %0, %1, 0, -1, 0, implicit $exec, implicit $flat_scr
-    %3:vgpr_32 = FLAT_ATOMIC_ADD_RTN %0, %1, 0, -1, 0, implicit $exec, implicit $flat_scr
-    FLAT_ATOMIC_ADD %0, %1, 0, 0, implicit $exec, implicit $flat_scr
-    FLAT_ATOMIC_ADD %0, %1, 0, 0, implicit $exec, implicit $flat_scr
+    %2:vgpr_32 = FLAT_ATOMIC_ADD_RTN %0, %1, 0, -1, 0, 0, implicit $exec, implicit $flat_scr
+    %3:vgpr_32 = FLAT_ATOMIC_ADD_RTN %0, %1, 0, -1, 0, 0, implicit $exec, implicit $flat_scr
+    FLAT_ATOMIC_ADD %0, %1, 0, 0, 0, implicit $exec, implicit $flat_scr
+    FLAT_ATOMIC_ADD %0, %1, 0, 0, 0, implicit $exec, implicit $flat_scr
     S_ENDPGM 0
 ...
+
+# GCN-LABEL: {{^}}name: mem_clause_sreg256_used_stack{{$}}
+# GCN: dead undef early-clobber %0.sub7:sgpr_256, dead undef early-clobber %0.sub3:sgpr_256 = BUNDLE undef %1:sgpr_64(p4) {
+# GCN-NEXT:   undef %0.sub7:sgpr_256 = S_LOAD_DWORD_IMM undef %1:sgpr_64(p4), 8, 0, 0
+# GCN-NEXT:   internal dead %0.sub3:sgpr_256 = S_LOAD_DWORD_IMM undef %1:sgpr_64(p4), 24, 0, 0
+# GCN-NEXT: }
+---
+name:            mem_clause_sreg256_used_stack
+stack:
+  - { id: 0, type: default, offset: 0, size: 40, alignment: 8 }
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg:  '$sgpr0_sgpr1_sgpr2_sgpr3'
+  stackPtrOffsetReg: '$sgpr32'
+body:             |
+  bb.0:
+
+    undef %0.sub7:sgpr_256 = S_LOAD_DWORD_IMM undef %1:sgpr_64(p4), 8, 0, 0
+    %0.sub3:sgpr_256 = S_LOAD_DWORD_IMM undef %1:sgpr_64(p4), 24, 0, 0
+    S_ENDPGM 0
+
+...

diff  --git a/llvm/test/CodeGen/AMDGPU/merge-image-load-gfx10.mir b/llvm/test/CodeGen/AMDGPU/merge-image-load-gfx10.mir
index c7d297de04a2..4059e33fbd1c 100644
--- a/llvm/test/CodeGen/AMDGPU/merge-image-load-gfx10.mir
+++ b/llvm/test/CodeGen/AMDGPU/merge-image-load-gfx10.mir
@@ -13,7 +13,7 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %5:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
     %6:vgpr_32 = IMAGE_LOAD_V1_V2_gfx10 %5:vreg_64, %3:sgpr_256, 1, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
     %7:vreg_96 = IMAGE_LOAD_V3_V2_gfx10 %5:vreg_64, %3:sgpr_256, 14, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
@@ -31,7 +31,7 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %5:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
     %6:vgpr_32 = IMAGE_LOAD_V1_V2_gfx10 %5:vreg_64, %3:sgpr_256, 8, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
     %7:vreg_96 = IMAGE_LOAD_V3_V2_gfx10 %5:vreg_64, %3:sgpr_256, 7, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
@@ -50,7 +50,7 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %5:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
     %6:vreg_64 = IMAGE_LOAD_V2_V2_gfx10 %5:vreg_64, %3:sgpr_256, 3, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8, align 16, addrspace 4)
     %7:vreg_64 = IMAGE_LOAD_V2_V2_gfx10 %5:vreg_64, %3:sgpr_256, 12, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8, align 16, addrspace 4)
 ...
@@ -69,7 +69,7 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %5:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
     %6:vreg_64 = IMAGE_LOAD_V2_V2_gfx10 %5:vreg_64, %3:sgpr_256, 12, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8, align 16, addrspace 4)
     %7:vreg_64 = IMAGE_LOAD_V2_V2_gfx10 %5:vreg_64, %3:sgpr_256, 3, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8, align 16, addrspace 4)
 ...
@@ -88,7 +88,7 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %5:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
     %6:vreg_96 = IMAGE_LOAD_V3_V2_gfx10 %5:vreg_64, %3:sgpr_256, 7, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
     %7:vgpr_32 = IMAGE_LOAD_V1_V2_gfx10 %5:vreg_64, %3:sgpr_256, 8, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
 ...
@@ -107,7 +107,7 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %5:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
     %6:vreg_96 = IMAGE_LOAD_V3_V2_gfx10 %5:vreg_64, %3:sgpr_256, 14, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
     %7:vgpr_32 = IMAGE_LOAD_V1_V2_gfx10 %5:vreg_64, %3:sgpr_256, 1, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
 ...
@@ -124,12 +124,12 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %5:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
     %6:vgpr_32 = IMAGE_LOAD_V1_V2_gfx10 %5:vreg_64, %3:sgpr_256, 8, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
-    %7:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
-    %8:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %7:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %8:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
     %9:vreg_96 = IMAGE_LOAD_V3_V2_gfx10 %7:vreg_64, %3:sgpr_256, 7, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
-    %10:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %10:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
     %11:vreg_96 = IMAGE_LOAD_V3_V2_gfx10 %5:vreg_64, %3:sgpr_256, 7, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
 ---
@@ -146,9 +146,9 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vreg_128 = COPY %2
-    %5:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %5:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
     %6:vgpr_32 = IMAGE_LOAD_V1_V2_gfx10 %5:vreg_64, %3:sgpr_256, 8, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
-    IMAGE_STORE_V4_V2 %4:vreg_128, %5:vreg_64, %3:sgpr_256, 15, -1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16)
+    IMAGE_STORE_V4_V2 %4:vreg_128, %5:vreg_64, %3:sgpr_256, 15, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16)
     %7:vreg_96 = IMAGE_LOAD_V3_V2_gfx10 %5:vreg_64, %3:sgpr_256, 7, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
 ---
@@ -165,7 +165,7 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %5:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
     %6:vgpr_32 = IMAGE_LOAD_V1_V2_gfx10 %5:vreg_64, %3:sgpr_256, 4, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
     %7:vreg_96 = IMAGE_LOAD_V3_V2_gfx10 %5:vreg_64, %3:sgpr_256, 7, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
@@ -183,7 +183,7 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %5:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
     %6:vgpr_32 = IMAGE_LOAD_V1_V2_gfx10 %5:vreg_64, %3:sgpr_256, 4, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
     %7:vreg_96 = IMAGE_LOAD_V3_V2_gfx10 %5:vreg_64, %3:sgpr_256, 11, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
@@ -201,8 +201,8 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
-    %6:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %2, 1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %5:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %6:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %2, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
     %7:vgpr_32 = IMAGE_LOAD_V1_V2_gfx10 %5, %3, 8, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
     %8:vreg_96 = IMAGE_LOAD_V3_V2_gfx10 %6, %3, 7, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
@@ -221,7 +221,7 @@ body:             |
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %5:vgpr_32 = COPY %2.sub3
-    %6:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %6:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
     %7:vgpr_32 = IMAGE_LOAD_V1_V2_gfx10 %6, %3, 8, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
     %8:vreg_96 = IMAGE_LOAD_V3_V2_gfx10 %6, %4, 7, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
@@ -239,7 +239,7 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %5:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
     %6:vgpr_32 = IMAGE_LOAD_V1_V2_gfx10 %5, %3, 8, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
     %7:vreg_96 = IMAGE_LOAD_V3_V2_gfx10 %5, %3, 7, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
@@ -257,7 +257,7 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %5:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
     %6:vgpr_32 = IMAGE_LOAD_V1_V2_gfx10 %5, %3, 8, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
     %7:vreg_96 = IMAGE_LOAD_V3_V2_gfx10 %5, %3, 7, 1, -1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
@@ -275,7 +275,7 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %5:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
     %6:vgpr_32 = IMAGE_LOAD_V1_V2_gfx10 %5, %3, 8, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
     %7:vreg_96 = IMAGE_LOAD_V3_V2_gfx10 %5, %3, 7, 1, -1, 0, 1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
@@ -293,7 +293,7 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %5:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
     %6:vgpr_32 = IMAGE_LOAD_V1_V2_gfx10 %5, %3, 8, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
     %7:vreg_96 = IMAGE_LOAD_V3_V2_gfx10 %5, %3, 7, 1, -1, 0, 0, 1, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
@@ -311,7 +311,7 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %5:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
     %6:vgpr_32 = IMAGE_LOAD_V1_V2_gfx10 %5, %3, 8, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
     %7:vreg_96 = IMAGE_LOAD_V3_V2_gfx10 %5, %3, 7, 1, -1, 0, 0, 0, 1, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
@@ -329,7 +329,7 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %5:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
     %6:vgpr_32 = COPY %5.sub0
     %7:vgpr_32 = IMAGE_LOAD_V1_V1_gfx10 %6, %3, 8, 1, -1, 0, 0, 0, 0, 1, 0, 0, 0, implicit $exec :: (dereferenceable load 8, align 16, addrspace 4)
     %8:vreg_96 = IMAGE_LOAD_V3_V2_gfx10 %5, %3, 7, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
@@ -348,7 +348,7 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %5:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
     %6:vreg_64 = IMAGE_LOAD_V2_V2_gfx10 %5, %3, 8, 1, -1, 0, 0, 0, 0, 0, 1, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
     %7:vreg_96 = IMAGE_LOAD_V3_V2_gfx10 %5, %3, 7, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
@@ -366,7 +366,7 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %5:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
     %6:vreg_64 = IMAGE_LOAD_V2_V2_gfx10 %5, %3, 8, 1, -1, 0, 0, 0, 0, 0, 0, 1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
     %7:vreg_96 = IMAGE_LOAD_V3_V2_gfx10 %5, %3, 7, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
@@ -384,7 +384,7 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %5:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
     %6:vgpr_32 = IMAGE_LOAD_V1_V2_gfx10 %5, %3, 8, 1, -1, 0, 0, 0, 0, 0, 0, 0, 1, implicit $exec :: (dereferenceable load 4, addrspace 4)
     %7:vreg_96 = IMAGE_LOAD_V3_V2_gfx10 %5, %3, 7, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
@@ -403,7 +403,7 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vreg_96 = BUFFER_LOAD_DWORDX3_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %5:vreg_96 = BUFFER_LOAD_DWORDX3_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
     %6:vgpr_32 = IMAGE_LOAD_MIP_V1_V3_gfx10 %5:vreg_96, %3:sgpr_256, 1, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
     %7:vreg_96 = IMAGE_LOAD_MIP_V3_V3_gfx10 %5:vreg_96, %3:sgpr_256, 14, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
@@ -424,7 +424,7 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vreg_96 = BUFFER_LOAD_DWORDX3_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %5:vreg_96 = BUFFER_LOAD_DWORDX3_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
     %6:vgpr_32 = IMAGE_LOAD_MIP_PCK_V1_V3_gfx10 %5:vreg_96, %3:sgpr_256, 1, 1, -1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
     %7:vreg_96 = IMAGE_LOAD_MIP_PCK_V3_V3_gfx10 %5:vreg_96, %3:sgpr_256, 14, 1, -1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
@@ -445,7 +445,7 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vreg_96 = BUFFER_LOAD_DWORDX3_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %5:vreg_96 = BUFFER_LOAD_DWORDX3_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
     %6:vgpr_32 = IMAGE_LOAD_MIP_PCK_SGN_V1_V3_gfx10 %5:vreg_96, %3:sgpr_256, 1, 1, -1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
     %7:vreg_96 = IMAGE_LOAD_MIP_PCK_SGN_V3_V3_gfx10 %5:vreg_96, %3:sgpr_256, 14, 1, -1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
@@ -464,7 +464,7 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %5:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
     %6:vgpr_32 = IMAGE_LOAD_PCK_V1_V2_gfx10 %5:vreg_64, %3:sgpr_256, 1, 1, -1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
     %7:vreg_96 = IMAGE_LOAD_PCK_V3_V2_gfx10 %5:vreg_64, %3:sgpr_256, 14, 1, -1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
@@ -483,7 +483,7 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %5:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
     %6:vgpr_32 = IMAGE_LOAD_PCK_SGN_V1_V2_gfx10 %5:vreg_64, %3:sgpr_256, 1, 1, -1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
     %7:vreg_96 = IMAGE_LOAD_PCK_SGN_V3_V2_gfx10 %5:vreg_64, %3:sgpr_256, 14, 1, -1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...

diff  --git a/llvm/test/CodeGen/AMDGPU/merge-image-load.mir b/llvm/test/CodeGen/AMDGPU/merge-image-load.mir
index 020ccd497a11..02dc0d8d4b38 100644
--- a/llvm/test/CodeGen/AMDGPU/merge-image-load.mir
+++ b/llvm/test/CodeGen/AMDGPU/merge-image-load.mir
@@ -1,7 +1,7 @@
 # RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass si-load-store-opt -o - %s | FileCheck -check-prefix=GFX9 %s
 
 # GFX9-LABEL: name: image_load_merged_v1v3
-# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_LOAD_V4_V4 %5, %3, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4)
+# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_LOAD_V4_V4 %5, %3, 15, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4)
 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0
 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3
 
@@ -13,13 +13,14 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
-    %6:vgpr_32 = IMAGE_LOAD_V1_V4 %5:vreg_128, %3:sgpr_256, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
-    %7:vreg_96 = IMAGE_LOAD_V3_V4 %5:vreg_128, %3:sgpr_256, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
+    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %6:vgpr_32 = IMAGE_LOAD_V1_V4 %5:vreg_128, %3:sgpr_256, 1, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
+    %7:vreg_96 = IMAGE_LOAD_V3_V4 %5:vreg_128, %3:sgpr_256, 14, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
 ---
+
 # GFX9-LABEL: name: image_load_merged_v1v3_reversed
-# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_LOAD_V4_V4 %5, %3, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4)
+# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_LOAD_V4_V4 %5, %3, 15, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4)
 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %8.sub3
 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub0_sub1_sub2
 
@@ -31,14 +32,14 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
-    %6:vgpr_32 = IMAGE_LOAD_V1_V4 %5:vreg_128, %3:sgpr_256, 8, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
-    %7:vreg_96 = IMAGE_LOAD_V3_V4 %5:vreg_128, %3:sgpr_256, 7, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
+    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %6:vgpr_32 = IMAGE_LOAD_V1_V4 %5:vreg_128, %3:sgpr_256, 8, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
+    %7:vreg_96 = IMAGE_LOAD_V3_V4 %5:vreg_128, %3:sgpr_256, 7, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
 ---
 
 # GFX9-LABEL: name: image_load_merged_v2v2
-# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_LOAD_V4_V4 %5, %3, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, addrspace 4)
+# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_LOAD_V4_V4 %5, %3, 15, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, addrspace 4)
 # GFX9: %{{[0-9]+}}:vreg_64 = COPY %8.sub0_sub1
 # GFX9: %{{[0-9]+}}:vreg_64 = COPY killed %8.sub2_sub3
 
@@ -50,14 +51,14 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
-    %6:vreg_64 = IMAGE_LOAD_V2_V4 %5:vreg_128, %3:sgpr_256, 3, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 8, align 16, addrspace 4)
-    %7:vreg_64 = IMAGE_LOAD_V2_V4 %5:vreg_128, %3:sgpr_256, 12, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 8, align 16, addrspace 4)
+    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %6:vreg_64 = IMAGE_LOAD_V2_V4 %5:vreg_128, %3:sgpr_256, 3, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 8, align 16, addrspace 4)
+    %7:vreg_64 = IMAGE_LOAD_V2_V4 %5:vreg_128, %3:sgpr_256, 12, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 8, align 16, addrspace 4)
 ...
 ---
 
 # GFX9-LABEL: name: image_load_merged_v2v2_reversed
-# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_LOAD_V4_V4 %5, %3, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, addrspace 4)
+# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_LOAD_V4_V4 %5, %3, 15, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, addrspace 4)
 # GFX9: %{{[0-9]+}}:vreg_64 = COPY %8.sub2_sub3
 # GFX9: %{{[0-9]+}}:vreg_64 = COPY killed %8.sub0_sub1
 
@@ -69,14 +70,14 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
-    %6:vreg_64 = IMAGE_LOAD_V2_V4 %5:vreg_128, %3:sgpr_256, 12, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 8, align 16, addrspace 4)
-    %7:vreg_64 = IMAGE_LOAD_V2_V4 %5:vreg_128, %3:sgpr_256, 3, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 8, align 16, addrspace 4)
+    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %6:vreg_64 = IMAGE_LOAD_V2_V4 %5:vreg_128, %3:sgpr_256, 12, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 8, align 16, addrspace 4)
+    %7:vreg_64 = IMAGE_LOAD_V2_V4 %5:vreg_128, %3:sgpr_256, 3, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 8, align 16, addrspace 4)
 ...
 ---
 
 # GFX9-LABEL: name: image_load_merged_v3v1
-# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_LOAD_V4_V4 %5, %3, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, addrspace 4)
+# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_LOAD_V4_V4 %5, %3, 15, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, addrspace 4)
 # GFX9: %{{[0-9]+}}:vreg_96 = COPY %8.sub0_sub1_sub2
 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY killed %8.sub3
 
@@ -88,14 +89,14 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
-    %6:vreg_96 = IMAGE_LOAD_V3_V4 %5:vreg_128, %3:sgpr_256, 7, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
-    %7:vgpr_32 = IMAGE_LOAD_V1_V4 %5:vreg_128, %3:sgpr_256, 8, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
+    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %6:vreg_96 = IMAGE_LOAD_V3_V4 %5:vreg_128, %3:sgpr_256, 7, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
+    %7:vgpr_32 = IMAGE_LOAD_V1_V4 %5:vreg_128, %3:sgpr_256, 8, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
 ...
 ---
 
 # GFX9-LABEL: name: image_load_merged_v3v1_reversed
-# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_LOAD_V4_V4 %5, %3, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, addrspace 4)
+# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_LOAD_V4_V4 %5, %3, 15, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, addrspace 4)
 # GFX9: %{{[0-9]+}}:vreg_96 = COPY %8.sub1_sub2_sub3
 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY killed %8.sub0
 
@@ -107,14 +108,14 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
-    %6:vreg_96 = IMAGE_LOAD_V3_V4 %5:vreg_128, %3:sgpr_256, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
-    %7:vgpr_32 = IMAGE_LOAD_V1_V4 %5:vreg_128, %3:sgpr_256, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
+    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %6:vreg_96 = IMAGE_LOAD_V3_V4 %5:vreg_128, %3:sgpr_256, 14, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
+    %7:vgpr_32 = IMAGE_LOAD_V1_V4 %5:vreg_128, %3:sgpr_256, 1, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
 ...
 ---
 
 # GFX9-LABEL: name: image_load_divided_merged
-# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_LOAD_V4_V4 %5, %3, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4)
+# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_LOAD_V4_V4 %5, %3, 15, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4)
 
 name: image_load_divided_merged
 body:             |
@@ -124,19 +125,19 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
-    %6:vgpr_32 = IMAGE_LOAD_V1_V4 %5:vreg_128, %3:sgpr_256, 8, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
-    %7:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
-    %8:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
-    %9:vreg_96 = IMAGE_LOAD_V3_V4 %7:vreg_128, %3:sgpr_256, 7, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
-    %10:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
-    %11:vreg_96 = IMAGE_LOAD_V3_V4 %5:vreg_128, %3:sgpr_256, 7, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
+    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %6:vgpr_32 = IMAGE_LOAD_V1_V4 %5:vreg_128, %3:sgpr_256, 8, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
+    %7:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %8:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %9:vreg_96 = IMAGE_LOAD_V3_V4 %7:vreg_128, %3:sgpr_256, 7, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
+    %10:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %11:vreg_96 = IMAGE_LOAD_V3_V4 %5:vreg_128, %3:sgpr_256, 7, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
 ---
 
 # GFX9-LABEL: name: image_load_divided_not_merged
-# GFX9: %{{[0-9]+}}:vgpr_32 = IMAGE_LOAD_V1_V4 %5, %3, 8, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
-# GFX9: %{{[0-9]+}}:vreg_96 = IMAGE_LOAD_V3_V4 %5, %3, 7, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
+# GFX9: %{{[0-9]+}}:vgpr_32 = IMAGE_LOAD_V1_V4 %5, %3, 8, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
+# GFX9: %{{[0-9]+}}:vreg_96 = IMAGE_LOAD_V3_V4 %5, %3, 7, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 
 name: image_load_divided_not_merged
 body:             |
@@ -146,16 +147,16 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vreg_128 = COPY %2
-    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
-    %6:vgpr_32 = IMAGE_LOAD_V1_V4 %5:vreg_128, %3:sgpr_256, 8, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
-    IMAGE_STORE_V4_V4 %4:vreg_128, %5:vreg_128, %3:sgpr_256, 15, -1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16)
-    %7:vreg_96 = IMAGE_LOAD_V3_V4 %5:vreg_128, %3:sgpr_256, 7, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
+    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %6:vgpr_32 = IMAGE_LOAD_V1_V4 %5:vreg_128, %3:sgpr_256, 8, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
+    IMAGE_STORE_V4_V4 %4:vreg_128, %5:vreg_128, %3:sgpr_256, 15, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16)
+    %7:vreg_96 = IMAGE_LOAD_V3_V4 %5:vreg_128, %3:sgpr_256, 7, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
 ---
 
 # GFX9-LABEL: name: image_load_dmask_overlapped_not_merged
-# GFX9: %{{[0-9]+}}:vgpr_32 = IMAGE_LOAD_V1_V4 %5, %3, 4, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
-# GFX9: %{{[0-9]+}}:vreg_96 = IMAGE_LOAD_V3_V4 %5, %3, 7, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
+# GFX9: %{{[0-9]+}}:vgpr_32 = IMAGE_LOAD_V1_V4 %5, %3, 4, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
+# GFX9: %{{[0-9]+}}:vreg_96 = IMAGE_LOAD_V3_V4 %5, %3, 7, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 
 name: image_load_dmask_overlapped_not_merged
 body:             |
@@ -165,15 +166,15 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
-    %6:vgpr_32 = IMAGE_LOAD_V1_V4 %5:vreg_128, %3:sgpr_256, 4, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
-    %7:vreg_96 = IMAGE_LOAD_V3_V4 %5:vreg_128, %3:sgpr_256, 7, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
+    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %6:vgpr_32 = IMAGE_LOAD_V1_V4 %5:vreg_128, %3:sgpr_256, 4, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
+    %7:vreg_96 = IMAGE_LOAD_V3_V4 %5:vreg_128, %3:sgpr_256, 7, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
 ---
 
 # GFX9-LABEL: name: image_load_dmask_not_disjoint_not_merged
-# GFX9: %{{[0-9]+}}:vgpr_32 = IMAGE_LOAD_V1_V4 %5, %3, 4, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
-# GFX9: %{{[0-9]+}}:vreg_96 = IMAGE_LOAD_V3_V4 %5, %3, 11, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
+# GFX9: %{{[0-9]+}}:vgpr_32 = IMAGE_LOAD_V1_V4 %5, %3, 4, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
+# GFX9: %{{[0-9]+}}:vreg_96 = IMAGE_LOAD_V3_V4 %5, %3, 11, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 
 name: image_load_dmask_not_disjoint_not_merged
 body:             |
@@ -183,15 +184,15 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
-    %6:vgpr_32 = IMAGE_LOAD_V1_V4 %5:vreg_128, %3:sgpr_256, 4, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
-    %7:vreg_96 = IMAGE_LOAD_V3_V4 %5:vreg_128, %3:sgpr_256, 11, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
+    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %6:vgpr_32 = IMAGE_LOAD_V1_V4 %5:vreg_128, %3:sgpr_256, 4, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
+    %7:vreg_96 = IMAGE_LOAD_V3_V4 %5:vreg_128, %3:sgpr_256, 11, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
 ---
 
 # GFX9-LABEL: name: image_load_not_merged_0
-# GFX9: %{{[0-9]+}}:vgpr_32 = IMAGE_LOAD_V1_V4 %5, %3, 8, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
-# GFX9: %{{[0-9]+}}:vreg_96 = IMAGE_LOAD_V3_V4 %6, %3, 7, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
+# GFX9: %{{[0-9]+}}:vgpr_32 = IMAGE_LOAD_V1_V4 %5, %3, 8, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
+# GFX9: %{{[0-9]+}}:vreg_96 = IMAGE_LOAD_V3_V4 %6, %3, 7, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 
 name: image_load_not_merged_0
 body:             |
@@ -201,16 +202,16 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
-    %6:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2, 1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
-    %7:vgpr_32 = IMAGE_LOAD_V1_V4 %5, %3, 8, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
-    %8:vreg_96 = IMAGE_LOAD_V3_V4 %6, %3, 7, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
+    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %6:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %7:vgpr_32 = IMAGE_LOAD_V1_V4 %5, %3, 8, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
+    %8:vreg_96 = IMAGE_LOAD_V3_V4 %6, %3, 7, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
 ---
 
 # GFX9-LABEL: name: image_load_not_merged_1
-# GFX9: %{{[0-9]+}}:vgpr_32 = IMAGE_LOAD_V1_V4 %6, %3, 8, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
-# GFX9: %{{[0-9]+}}:vreg_96 = IMAGE_LOAD_V3_V4 %6, %4, 7, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
+# GFX9: %{{[0-9]+}}:vgpr_32 = IMAGE_LOAD_V1_V4 %6, %3, 8, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
+# GFX9: %{{[0-9]+}}:vreg_96 = IMAGE_LOAD_V3_V4 %6, %4, 7, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 
 name: image_load_not_merged_1
 body:             |
@@ -221,15 +222,15 @@ body:             |
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %5:vgpr_32 = COPY %2.sub3
-    %6:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
-    %7:vgpr_32 = IMAGE_LOAD_V1_V4 %6, %3, 8, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
-    %8:vreg_96 = IMAGE_LOAD_V3_V4 %6, %4, 7, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
+    %6:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %7:vgpr_32 = IMAGE_LOAD_V1_V4 %6, %3, 8, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
+    %8:vreg_96 = IMAGE_LOAD_V3_V4 %6, %4, 7, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
 ---
 
 # GFX9-LABEL: name: image_load_not_merged_10
-# GFX9: %{{[0-9]+}}:vgpr_32 = IMAGE_LOAD_V1_V4 %5, %3, 8, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
-# GFX9: %{{[0-9]+}}:vreg_96 = IMAGE_LOAD_V3_V4 %5, %3, 7, 0, 0, 0, 0, 0, 0, -1, 1, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
+# GFX9: %{{[0-9]+}}:vgpr_32 = IMAGE_LOAD_V1_V4 %5, %3, 8, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
+# GFX9: %{{[0-9]+}}:vreg_96 = IMAGE_LOAD_V3_V4 %5, %3, 7, 0, 0, 0, 0, 0, 0, 0, -1, 1, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 
 name: image_load_not_merged_10
 body:             |
@@ -239,15 +240,15 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
-    %6:vgpr_32 = IMAGE_LOAD_V1_V4 %5, %3, 8, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
-    %7:vreg_96 = IMAGE_LOAD_V3_V4 %5, %3, 7, 0, 0, 0, 0, 0, 0, -1, 1, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
+    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %6:vgpr_32 = IMAGE_LOAD_V1_V4 %5, %3, 8, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
+    %7:vreg_96 = IMAGE_LOAD_V3_V4 %5, %3, 7, 0, 0, 0, 0, 0, 0, 0, -1, 1, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
 ---
 
 # GFX9-LABEL: name: image_load_not_merged_3
-# GFX9: %{{[0-9]+}}:vgpr_32 = IMAGE_LOAD_V1_V4 %5, %3, 8, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
-# GFX9: %{{[0-9]+}}:vreg_96 = IMAGE_LOAD_V3_V4 %5, %3, 7, 1, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
+# GFX9: %{{[0-9]+}}:vgpr_32 = IMAGE_LOAD_V1_V4 %5, %3, 8, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
+# GFX9: %{{[0-9]+}}:vreg_96 = IMAGE_LOAD_V3_V4 %5, %3, 7, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 
 name: image_load_not_merged_3
 body:             |
@@ -257,15 +258,15 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
-    %6:vgpr_32 = IMAGE_LOAD_V1_V4 %5, %3, 8, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
-    %7:vreg_96 = IMAGE_LOAD_V3_V4 %5, %3, 7, 1, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
+    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %6:vgpr_32 = IMAGE_LOAD_V1_V4 %5, %3, 8, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
+    %7:vreg_96 = IMAGE_LOAD_V3_V4 %5, %3, 7, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
 ---
 
 # GFX9-LABEL: name: image_load_not_merged_4
-# GFX9: %{{[0-9]+}}:vgpr_32 = IMAGE_LOAD_V1_V4 %5, %3, 8, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
-# GFX9: %{{[0-9]+}}:vreg_96 = IMAGE_LOAD_V3_V4 %5, %3, 7, 0, 1, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
+# GFX9: %{{[0-9]+}}:vgpr_32 = IMAGE_LOAD_V1_V4 %5, %3, 8, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
+# GFX9: %{{[0-9]+}}:vreg_96 = IMAGE_LOAD_V3_V4 %5, %3, 7, 0, 0, 1, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 
 name: image_load_not_merged_4
 body:             |
@@ -275,15 +276,15 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
-    %6:vgpr_32 = IMAGE_LOAD_V1_V4 %5, %3, 8, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
-    %7:vreg_96 = IMAGE_LOAD_V3_V4 %5, %3, 7, 0, 1, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
+    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %6:vgpr_32 = IMAGE_LOAD_V1_V4 %5, %3, 8, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
+    %7:vreg_96 = IMAGE_LOAD_V3_V4 %5, %3, 7, 0, 0, 1, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
 ---
 
 # GFX9-LABEL: name: image_load_not_merged_5
-# GFX9: %{{[0-9]+}}:vgpr_32 = IMAGE_LOAD_V1_V4 %5, %3, 8, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
-# GFX9: %{{[0-9]+}}:vreg_96 = IMAGE_LOAD_V3_V4 %5, %3, 7, 0, 0, 1, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
+# GFX9: %{{[0-9]+}}:vgpr_32 = IMAGE_LOAD_V1_V4 %5, %3, 8, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
+# GFX9: %{{[0-9]+}}:vreg_96 = IMAGE_LOAD_V3_V4 %5, %3, 7, 0, 0, 0, 1, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 
 name: image_load_not_merged_5
 body:             |
@@ -293,15 +294,15 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
-    %6:vgpr_32 = IMAGE_LOAD_V1_V4 %5, %3, 8, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
-    %7:vreg_96 = IMAGE_LOAD_V3_V4 %5, %3, 7, 0, 0, 1, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
+    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %6:vgpr_32 = IMAGE_LOAD_V1_V4 %5, %3, 8, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
+    %7:vreg_96 = IMAGE_LOAD_V3_V4 %5, %3, 7, 0, 0, 0, 1, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
 ---
 
 # GFX9-LABEL: name: image_load_not_merged_6
-# GFX9: %{{[0-9]+}}:vgpr_32 = IMAGE_LOAD_V1_V4 %5, %3, 8, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
-# GFX9: %{{[0-9]+}}:vreg_96 = IMAGE_LOAD_V3_V4 %5, %3, 7, 0, 0, 0, 1, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
+# GFX9: %{{[0-9]+}}:vgpr_32 = IMAGE_LOAD_V1_V4 %5, %3, 8, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
+# GFX9: %{{[0-9]+}}:vreg_96 = IMAGE_LOAD_V3_V4 %5, %3, 7, 0, 0, 0, 0, 1, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 
 name: image_load_not_merged_6
 body:             |
@@ -311,15 +312,15 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
-    %6:vgpr_32 = IMAGE_LOAD_V1_V4 %5, %3, 8, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
-    %7:vreg_96 = IMAGE_LOAD_V3_V4 %5, %3, 7, 0, 0, 0, 1, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
+    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %6:vgpr_32 = IMAGE_LOAD_V1_V4 %5, %3, 8, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
+    %7:vreg_96 = IMAGE_LOAD_V3_V4 %5, %3, 7, 0, 0, 0, 0, 1, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
 ---
 
 # GFX9-LABEL: name: image_load_not_merged_7
-# GFX9: %{{[0-9]+}}:vreg_64 = IMAGE_LOAD_V2_V4 %5, %3, 8, 0, 0, 0, 0, 1, 0, -1, 0, implicit $exec :: (dereferenceable load 8, align 16, addrspace 4)
-# GFX9: %{{[0-9]+}}:vreg_96 = IMAGE_LOAD_V3_V4 %5, %3, 7, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
+# GFX9: %{{[0-9]+}}:vreg_64 = IMAGE_LOAD_V2_V4 %5, %3, 8, 0, 0, 0, 0, 0, 1, 0, -1, 0, implicit $exec :: (dereferenceable load 8, align 16, addrspace 4)
+# GFX9: %{{[0-9]+}}:vreg_96 = IMAGE_LOAD_V3_V4 %5, %3, 7, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 
 name: image_load_not_merged_7
 body:             |
@@ -329,15 +330,15 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
-    %6:vreg_64 = IMAGE_LOAD_V2_V4 %5, %3, 8, 0, 0, 0, 0, 1, 0, -1, 0, implicit $exec :: (dereferenceable load 8, align 16, addrspace 4)
-    %7:vreg_96 = IMAGE_LOAD_V3_V4 %5, %3, 7, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
+    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %6:vreg_64 = IMAGE_LOAD_V2_V4 %5, %3, 8, 0, 0, 0, 0, 0, 1, 0, -1, 0, implicit $exec :: (dereferenceable load 8, align 16, addrspace 4)
+    %7:vreg_96 = IMAGE_LOAD_V3_V4 %5, %3, 7, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
 ---
 
 # GFX9-LABEL: name: image_load_not_merged_8
-# GFX9: %{{[0-9]+}}:vreg_64 = IMAGE_LOAD_V2_V4 %5, %3, 8, 0, 0, 0, 0, 0, 1, -1, 0, implicit $exec :: (dereferenceable load 8, align 16, addrspace 4)
-# GFX9: %{{[0-9]+}}:vreg_96 = IMAGE_LOAD_V3_V4 %5, %3, 7, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
+# GFX9: %{{[0-9]+}}:vreg_64 = IMAGE_LOAD_V2_V4 %5, %3, 8, 0, 0, 0, 0, 0, 0, 1, -1, 0, implicit $exec :: (dereferenceable load 8, align 16, addrspace 4)
+# GFX9: %{{[0-9]+}}:vreg_96 = IMAGE_LOAD_V3_V4 %5, %3, 7, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 
 name: image_load_not_merged_8
 body:             |
@@ -347,15 +348,15 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
-    %6:vreg_64 = IMAGE_LOAD_V2_V4 %5, %3, 8, 0, 0, 0, 0, 0, 1, -1, 0, implicit $exec :: (dereferenceable load 8, align 16, addrspace 4)
-    %7:vreg_96 = IMAGE_LOAD_V3_V4 %5, %3, 7, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
+    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %6:vreg_64 = IMAGE_LOAD_V2_V4 %5, %3, 8, 0, 0, 0, 0, 0, 0, 1, -1, 0, implicit $exec :: (dereferenceable load 8, align 16, addrspace 4)
+    %7:vreg_96 = IMAGE_LOAD_V3_V4 %5, %3, 7, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
 ---
 
 # GFX9-LABEL: name: image_load_not_merged_9
-# GFX9: %{{[0-9]+}}:vgpr_32 = IMAGE_LOAD_V1_V4 %5, %3, 8, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
-# GFX9: %{{[0-9]+}}:vreg_96 = IMAGE_LOAD_V3_V4 %5, %3, 7, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
+# GFX9: %{{[0-9]+}}:vgpr_32 = IMAGE_LOAD_V1_V4 %5, %3, 8, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
+# GFX9: %{{[0-9]+}}:vreg_96 = IMAGE_LOAD_V3_V4 %5, %3, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 
 name: image_load_not_merged_9
 body:             |
@@ -365,14 +366,14 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
-    %6:vgpr_32 = IMAGE_LOAD_V1_V4 %5, %3, 8, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
-    %7:vreg_96 = IMAGE_LOAD_V3_V4 %5, %3, 7, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
+    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %6:vgpr_32 = IMAGE_LOAD_V1_V4 %5, %3, 8, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
+    %7:vreg_96 = IMAGE_LOAD_V3_V4 %5, %3, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
 ---
 
 # GFX9-LABEL: name: image_load_mip_merged_v1v3
-# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_LOAD_MIP_V4_V4 %5, %3, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4)
+# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_LOAD_MIP_V4_V4 %5, %3, 15, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4)
 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0
 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3
 
@@ -384,16 +385,14 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
-    %6:vgpr_32 = IMAGE_LOAD_MIP_V1_V4 %5:vreg_128, %3:sgpr_256, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
-    %7:vreg_96 = IMAGE_LOAD_MIP_V3_V4 %5:vreg_128, %3:sgpr_256, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
+    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %6:vgpr_32 = IMAGE_LOAD_MIP_V1_V4 %5:vreg_128, %3:sgpr_256, 1, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
+    %7:vreg_96 = IMAGE_LOAD_MIP_V3_V4 %5:vreg_128, %3:sgpr_256, 14, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
 ---
 
-
-
 # GFX9-LABEL: name: image_load_mip_pck_merged_v1v3
-# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_LOAD_MIP_PCK_V4_V4 %5, %3, 15, 0, 0, 0, 0, 0, 0, -1, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4)
+# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_LOAD_MIP_PCK_V4_V4 %5, %3, 15, 0, 0, 0, 0, 0, 0, 0, -1, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4)
 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0
 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3
 
@@ -405,16 +404,14 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
-    %6:vgpr_32 = IMAGE_LOAD_MIP_PCK_V1_V4 %5:vreg_128, %3:sgpr_256, 1, 0, 0, 0, 0, 0, 0, -1, implicit $exec :: (dereferenceable load 4, addrspace 4)
-    %7:vreg_96 = IMAGE_LOAD_MIP_PCK_V3_V4 %5:vreg_128, %3:sgpr_256, 14, 0, 0, 0, 0, 0, 0, -1, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
+    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %6:vgpr_32 = IMAGE_LOAD_MIP_PCK_V1_V4 %5:vreg_128, %3:sgpr_256, 1, 0, 0, 0, 0, 0, 0, 0, -1, implicit $exec :: (dereferenceable load 4, addrspace 4)
+    %7:vreg_96 = IMAGE_LOAD_MIP_PCK_V3_V4 %5:vreg_128, %3:sgpr_256, 14, 0, 0, 0, 0, 0, 0, 0, -1, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
 ---
 
-
-
 # GFX9-LABEL: name: image_load_mip_pck_sgn_merged_v1v3
-# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_LOAD_MIP_PCK_SGN_V4_V4 %5, %3, 15, 0, 0, 0, 0, 0, 0, -1, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4)
+# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_LOAD_MIP_PCK_SGN_V4_V4 %5, %3, 15, 0, 0, 0, 0, 0, 0, 0, -1, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4)
 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0
 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3
 
@@ -426,14 +423,14 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
-    %6:vgpr_32 = IMAGE_LOAD_MIP_PCK_SGN_V1_V4 %5:vreg_128, %3:sgpr_256, 1, 0, 0, 0, 0, 0, 0, -1, implicit $exec :: (dereferenceable load 4, addrspace 4)
-    %7:vreg_96 = IMAGE_LOAD_MIP_PCK_SGN_V3_V4 %5:vreg_128, %3:sgpr_256, 14, 0, 0, 0, 0, 0, 0, -1, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
+    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %6:vgpr_32 = IMAGE_LOAD_MIP_PCK_SGN_V1_V4 %5:vreg_128, %3:sgpr_256, 1, 0, 0, 0, 0, 0, 0, 0, -1, implicit $exec :: (dereferenceable load 4, addrspace 4)
+    %7:vreg_96 = IMAGE_LOAD_MIP_PCK_SGN_V3_V4 %5:vreg_128, %3:sgpr_256, 14, 0, 0, 0, 0, 0, 0, 0, -1, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
 ---
 
 # GFX9-LABEL: name: image_load_pck_merged_v1v3
-# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_LOAD_PCK_V4_V4 %5, %3, 15, 0, 0, 0, 0, 0, 0, -1, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4)
+# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_LOAD_PCK_V4_V4 %5, %3, 15, 0, 0, 0, 0, 0, 0, 0, -1, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4)
 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0
 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3
 
@@ -445,14 +442,14 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
-    %6:vgpr_32 = IMAGE_LOAD_PCK_V1_V4 %5:vreg_128, %3:sgpr_256, 1, 0, 0, 0, 0, 0, 0, -1, implicit $exec :: (dereferenceable load 4, addrspace 4)
-    %7:vreg_96 = IMAGE_LOAD_PCK_V3_V4 %5:vreg_128, %3:sgpr_256, 14, 0, 0, 0, 0, 0, 0, -1, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
+    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %6:vgpr_32 = IMAGE_LOAD_PCK_V1_V4 %5:vreg_128, %3:sgpr_256, 1, 0, 0, 0, 0, 0, 0, 0, -1, implicit $exec :: (dereferenceable load 4, addrspace 4)
+    %7:vreg_96 = IMAGE_LOAD_PCK_V3_V4 %5:vreg_128, %3:sgpr_256, 14, 0, 0, 0, 0, 0, 0, 0, -1, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
 ---
 
 # GFX9-LABEL: name: image_load_pck_sgn_merged_v1v3
-# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_LOAD_PCK_SGN_V4_V4 %5, %3, 15, 0, 0, 0, 0, 0, 0, -1, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4)
+# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_LOAD_PCK_SGN_V4_V4 %5, %3, 15, 0, 0, 0, 0, 0, 0, 0, -1, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4)
 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0
 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3
 
@@ -464,8 +461,9 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
-    %6:vgpr_32 = IMAGE_LOAD_PCK_SGN_V1_V4 %5:vreg_128, %3:sgpr_256, 1, 0, 0, 0, 0, 0, 0, -1, implicit $exec :: (dereferenceable load 4, addrspace 4)
-    %7:vreg_96 = IMAGE_LOAD_PCK_SGN_V3_V4 %5:vreg_128, %3:sgpr_256, 14, 0, 0, 0, 0, 0, 0, -1, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
+    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %6:vgpr_32 = IMAGE_LOAD_PCK_SGN_V1_V4 %5:vreg_128, %3:sgpr_256, 1, 0, 0, 0, 0, 0, 0, 0, -1, implicit $exec :: (dereferenceable load 4, addrspace 4)
+    %7:vreg_96 = IMAGE_LOAD_PCK_SGN_V3_V4 %5:vreg_128, %3:sgpr_256, 14, 0, 0, 0, 0, 0, 0, 0, -1, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
 ---
+

diff  --git a/llvm/test/CodeGen/AMDGPU/merge-image-sample-gfx10.mir b/llvm/test/CodeGen/AMDGPU/merge-image-sample-gfx10.mir
index 70923145dd2d..46de4e507049 100644
--- a/llvm/test/CodeGen/AMDGPU/merge-image-sample-gfx10.mir
+++ b/llvm/test/CodeGen/AMDGPU/merge-image-sample-gfx10.mir
@@ -13,7 +13,7 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
     %6:vgpr_32 = IMAGE_SAMPLE_L_V1_V3_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 1, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
     %7:vreg_96 = IMAGE_SAMPLE_L_V3_V3_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 14, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
@@ -31,7 +31,7 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
     %6:vgpr_32 = IMAGE_SAMPLE_L_V1_V3_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 8, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
     %7:vreg_96 = IMAGE_SAMPLE_L_V3_V3_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 7, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
@@ -50,7 +50,7 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
     %6:vreg_64 = IMAGE_SAMPLE_L_V2_V3_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 3, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8, align 16, addrspace 4)
     %7:vreg_64 = IMAGE_SAMPLE_L_V2_V3_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 12, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8, align 16, addrspace 4)
 ...
@@ -69,7 +69,7 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
     %6:vreg_64 = IMAGE_SAMPLE_L_V2_V3_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 12, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8, align 16, addrspace 4)
     %7:vreg_64 = IMAGE_SAMPLE_L_V2_V3_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 3, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8, align 16, addrspace 4)
 ...
@@ -88,7 +88,7 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
     %6:vreg_96 = IMAGE_SAMPLE_L_V3_V3_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 7, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
     %7:vgpr_32 = IMAGE_SAMPLE_L_V1_V3_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 8, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
 ...
@@ -107,7 +107,7 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
     %6:vreg_96 = IMAGE_SAMPLE_L_V3_V3_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 14, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
     %7:vgpr_32 = IMAGE_SAMPLE_L_V1_V3_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 1, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
 ...
@@ -124,12 +124,12 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
     %6:vgpr_32 = IMAGE_SAMPLE_L_V1_V3_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 8, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
-    %7:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
-    %8:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %7:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %8:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
     %9:vreg_96 = IMAGE_SAMPLE_L_V3_V3_nsa_gfx10 %7:vgpr_32, %7:vgpr_32, %7:vgpr_32, %3:sgpr_256, %2:sgpr_128, 7, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
-    %10:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %10:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
     %11:vreg_96 = IMAGE_SAMPLE_L_V3_V3_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 7, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
 ---
@@ -146,7 +146,7 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vreg_128 = COPY %2
-    %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
     %6:vgpr_32 = IMAGE_SAMPLE_L_V1_V3_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 8, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
     IMAGE_STORE_V4_V2_nsa_gfx10 %4:vreg_128, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, 15, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16)
     %7:vreg_96 = IMAGE_SAMPLE_L_V3_V3_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 7, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
@@ -165,7 +165,7 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
     %6:vgpr_32 = IMAGE_SAMPLE_L_V1_V3_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 4, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
     %7:vreg_96 = IMAGE_SAMPLE_L_V3_V3_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 7, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
@@ -183,7 +183,7 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
     %6:vgpr_32 = IMAGE_SAMPLE_L_V1_V3_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 4, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
     %7:vreg_96 = IMAGE_SAMPLE_L_V3_V3_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 11, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
@@ -201,8 +201,8 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
-    %6:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2, 1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %6:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
     %7:vgpr_32 = IMAGE_SAMPLE_L_V1_V3_nsa_gfx10 %5, %5, %5, %3, %2, 8, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
     %8:vreg_96 = IMAGE_SAMPLE_L_V3_V3_nsa_gfx10 %6, %6, %6, %3, %2, 7, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
@@ -221,7 +221,7 @@ body:             |
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %5:vgpr_32 = COPY %2.sub3
-    %6:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %6:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
     %7:vgpr_32 = IMAGE_SAMPLE_L_V1_V3_nsa_gfx10 %6, %6, %6, %3, %2, 8, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
     %8:vreg_96 = IMAGE_SAMPLE_L_V3_V3_nsa_gfx10 %6, %6, %6, %4, %2, 7, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
@@ -240,7 +240,7 @@ body:             |
     %3:sgpr_128 = COPY $sgpr92_sgpr93_sgpr94_sgpr95
     %4:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %5:vgpr_32 = COPY %2.sub3
-    %6:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %6:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
     %7:vgpr_32 = IMAGE_SAMPLE_L_V1_V3_nsa_gfx10 %6, %6, %6, %4, %2, 8, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
     %8:vreg_96 = IMAGE_SAMPLE_L_V3_V3_nsa_gfx10 %6, %6, %6, %4, %3, 7, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
@@ -258,7 +258,7 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
     %6:vgpr_32 = IMAGE_SAMPLE_L_V1_V3_nsa_gfx10 %5, %5, %5, %3, %2, 8, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
     %7:vreg_96 = IMAGE_SAMPLE_L_V3_V3_nsa_gfx10 %5, %5, %5, %3, %2, 7, 1, -1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
@@ -276,7 +276,7 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
     %6:vgpr_32 = IMAGE_SAMPLE_L_V1_V3_nsa_gfx10 %5, %5, %5, %3, %2, 8, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
     %7:vreg_96 = IMAGE_SAMPLE_L_V3_V3_nsa_gfx10 %5, %5, %5, %3, %2, 7, 1, -1, 0, 1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
@@ -294,7 +294,7 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
     %6:vgpr_32 = IMAGE_SAMPLE_L_V1_V3_nsa_gfx10 %5, %5, %5, %3, %2, 8, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
     %7:vreg_96 = IMAGE_SAMPLE_L_V3_V3_nsa_gfx10 %5, %5, %5, %3, %2, 7, 1, -1, 0, 0, 1, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
@@ -312,7 +312,7 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
     %6:vgpr_32 = IMAGE_SAMPLE_L_V1_V3_nsa_gfx10 %5, %5, %5, %3, %2, 8, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
     %7:vreg_96 = IMAGE_SAMPLE_L_V3_V3_nsa_gfx10 %5, %5, %5, %3, %2, 7, 1, -1, 0, 0, 0, 1, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
@@ -330,7 +330,7 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
     %6:vgpr_32 = IMAGE_SAMPLE_L_V1_V2_nsa_gfx10 %5, %5, %3, %2, 8, 1, -1, 0, 0, 0, 0, 1, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
     %7:vreg_96 = IMAGE_SAMPLE_L_V3_V3_nsa_gfx10 %5, %5, %5, %3, %2, 7, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
@@ -348,7 +348,7 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
     %6:vreg_64 = IMAGE_SAMPLE_L_V2_V3_nsa_gfx10 %5, %5, %5, %3, %2, 8, 1, -1, 0, 0, 0, 0, 0, 1, 0, 0, implicit $exec :: (dereferenceable load 8, addrspace 4)
     %7:vreg_96 = IMAGE_SAMPLE_L_V3_V3_nsa_gfx10 %5, %5, %5, %3, %2, 7, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
@@ -366,7 +366,7 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
     %6:vreg_64 = IMAGE_SAMPLE_L_V2_V3_nsa_gfx10 %5, %5, %5, %3, %2, 8, 1, -1, 0, 0, 0, 0, 0, 0, 1, 0, implicit $exec :: (dereferenceable load 8, addrspace 4)
     %7:vreg_96 = IMAGE_SAMPLE_L_V3_V3_nsa_gfx10 %5, %5, %5, %3, %2, 7, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
@@ -384,7 +384,7 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
     %6:vgpr_32 = IMAGE_SAMPLE_L_V1_V3_nsa_gfx10 %5, %5, %5, %3, %2, 8, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
     %7:vreg_96 = IMAGE_SAMPLE_L_V3_V3_nsa_gfx10 %5, %5, %5, %3, %2, 7, 1, -1, 0, 0, 0, 0, 0, 0, 0, 1, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
@@ -406,7 +406,7 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
     %6:vgpr_32 = IMAGE_SAMPLE_V1_V2_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 1, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
     %7:vreg_96 = IMAGE_SAMPLE_V3_V2_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 14, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
@@ -426,7 +426,7 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
     %6:vgpr_32 = IMAGE_SAMPLE_B_V1_V3_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 1, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
     %7:vreg_96 = IMAGE_SAMPLE_B_V3_V3_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 14, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
@@ -446,7 +446,7 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
     %6:vgpr_32 = IMAGE_SAMPLE_B_CL_V1_V4_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 1, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
     %7:vreg_96 = IMAGE_SAMPLE_B_CL_V3_V4_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 14, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
@@ -466,7 +466,7 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
     %6:vgpr_32 = IMAGE_SAMPLE_B_CL_O_V1_V5_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 1, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
     %7:vreg_96 = IMAGE_SAMPLE_B_CL_O_V3_V5_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 14, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
@@ -486,7 +486,7 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
     %6:vgpr_32 = IMAGE_SAMPLE_B_O_V1_V4_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 1, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
     %7:vreg_96 = IMAGE_SAMPLE_B_O_V3_V4_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 14, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
@@ -506,7 +506,7 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
     %6:vgpr_32 = IMAGE_SAMPLE_C_V1_V3_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 1, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
     %7:vreg_96 = IMAGE_SAMPLE_C_V3_V3_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 14, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
@@ -526,7 +526,7 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
     %6:vgpr_32 = IMAGE_SAMPLE_CD_V1_V6_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 1, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
     %7:vreg_96 = IMAGE_SAMPLE_CD_V3_V6_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 14, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
@@ -546,7 +546,7 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
     %6:vgpr_32 = IMAGE_SAMPLE_CD_CL_V1_V7_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 1, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
     %7:vreg_96 = IMAGE_SAMPLE_CD_CL_V3_V7_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 14, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
@@ -566,7 +566,7 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
     %6:vgpr_32 = IMAGE_SAMPLE_CD_CL_O_V1_V8_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 1, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
     %7:vreg_96 = IMAGE_SAMPLE_CD_CL_O_V3_V8_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 14, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
@@ -586,7 +586,7 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
     %6:vgpr_32 = IMAGE_SAMPLE_CD_O_V1_V7_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 1, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
     %7:vreg_96 = IMAGE_SAMPLE_CD_O_V3_V7_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 14, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
@@ -606,7 +606,7 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
     %6:vgpr_32 = IMAGE_SAMPLE_CL_V1_V3_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 1, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
     %7:vreg_96 = IMAGE_SAMPLE_CL_V3_V3_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 14, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
@@ -626,7 +626,7 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
     %6:vgpr_32 = IMAGE_SAMPLE_CL_O_V1_V4_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 1, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
     %7:vreg_96 = IMAGE_SAMPLE_CL_O_V3_V4_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 14, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
@@ -646,7 +646,7 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
     %6:vgpr_32 = IMAGE_SAMPLE_C_B_V1_V4_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 1, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
     %7:vreg_96 = IMAGE_SAMPLE_C_B_V3_V4_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 14, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
@@ -666,7 +666,7 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
     %6:vgpr_32 = IMAGE_SAMPLE_C_B_CL_V1_V5_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 1, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
     %7:vreg_96 = IMAGE_SAMPLE_C_B_CL_V3_V5_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 14, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
@@ -686,7 +686,7 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
     %6:vgpr_32 = IMAGE_SAMPLE_C_B_CL_O_V1_V6_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 1, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
     %7:vreg_96 = IMAGE_SAMPLE_C_B_CL_O_V3_V6_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 14, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
@@ -706,7 +706,7 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
     %6:vgpr_32 = IMAGE_SAMPLE_C_B_O_V1_V5_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 1, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
     %7:vreg_96 = IMAGE_SAMPLE_C_B_O_V3_V5_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 14, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
@@ -726,7 +726,7 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
     %6:vgpr_32 = IMAGE_SAMPLE_C_CD_V1_V7_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 1, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
     %7:vreg_96 = IMAGE_SAMPLE_C_CD_V3_V7_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 14, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
@@ -746,7 +746,7 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
     %6:vgpr_32 = IMAGE_SAMPLE_C_CD_CL_V1_V8_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 1, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
     %7:vreg_96 = IMAGE_SAMPLE_C_CD_CL_V3_V8_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 14, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
@@ -766,7 +766,7 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
     %6:vgpr_32 = IMAGE_SAMPLE_C_CD_CL_O_V1_V9_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 1, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
     %7:vreg_96 = IMAGE_SAMPLE_C_CD_CL_O_V3_V9_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 14, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
@@ -786,7 +786,7 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
     %6:vgpr_32 = IMAGE_SAMPLE_C_CD_O_V1_V8_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 1, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
     %7:vreg_96 = IMAGE_SAMPLE_C_CD_O_V3_V8_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 14, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
@@ -806,7 +806,7 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
     %6:vgpr_32 = IMAGE_SAMPLE_C_CL_V1_V4_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 1, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
     %7:vreg_96 = IMAGE_SAMPLE_C_CL_V3_V4_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 14, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
@@ -826,7 +826,7 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
     %6:vgpr_32 = IMAGE_SAMPLE_C_CL_O_V1_V5_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 1, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
     %7:vreg_96 = IMAGE_SAMPLE_C_CL_O_V3_V5_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 14, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
@@ -846,7 +846,7 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
     %6:vgpr_32 = IMAGE_SAMPLE_C_D_V1_V7_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 1, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
     %7:vreg_96 = IMAGE_SAMPLE_C_D_V3_V7_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 14, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
@@ -866,7 +866,7 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
     %6:vgpr_32 = IMAGE_SAMPLE_C_D_CL_V1_V8_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 1, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
     %7:vreg_96 = IMAGE_SAMPLE_C_D_CL_V3_V8_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 14, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
@@ -886,7 +886,7 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
     %6:vgpr_32 = IMAGE_SAMPLE_C_D_CL_O_V1_V9_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 1, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
     %7:vreg_96 = IMAGE_SAMPLE_C_D_CL_O_V3_V9_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 14, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
@@ -906,7 +906,7 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
     %6:vgpr_32 = IMAGE_SAMPLE_C_D_O_V1_V8_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 1, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
     %7:vreg_96 = IMAGE_SAMPLE_C_D_O_V3_V8_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 14, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
@@ -926,7 +926,7 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
     %6:vgpr_32 = IMAGE_SAMPLE_C_L_V1_V4_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 1, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
     %7:vreg_96 = IMAGE_SAMPLE_C_L_V3_V4_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 14, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
@@ -946,7 +946,7 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
     %6:vgpr_32 = IMAGE_SAMPLE_C_LZ_V1_V3_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 1, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
     %7:vreg_96 = IMAGE_SAMPLE_C_LZ_V3_V3_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 14, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
@@ -966,7 +966,7 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
     %6:vgpr_32 = IMAGE_SAMPLE_C_LZ_O_V1_V4_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 1, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
     %7:vreg_96 = IMAGE_SAMPLE_C_LZ_O_V3_V4_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 14, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
@@ -986,7 +986,7 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
     %6:vgpr_32 = IMAGE_SAMPLE_C_L_O_V1_V5_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 1, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
     %7:vreg_96 = IMAGE_SAMPLE_C_L_O_V3_V5_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 14, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
@@ -1006,7 +1006,7 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
     %6:vgpr_32 = IMAGE_SAMPLE_C_O_V1_V4_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 1, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
     %7:vreg_96 = IMAGE_SAMPLE_C_O_V3_V4_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 14, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
@@ -1026,7 +1026,7 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
     %6:vgpr_32 = IMAGE_SAMPLE_D_V1_V6_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 1, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
     %7:vreg_96 = IMAGE_SAMPLE_D_V3_V6_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 14, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
@@ -1046,7 +1046,7 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
     %6:vgpr_32 = IMAGE_SAMPLE_D_CL_V1_V7_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 1, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
     %7:vreg_96 = IMAGE_SAMPLE_D_CL_V3_V7_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 14, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
@@ -1066,7 +1066,7 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
     %6:vgpr_32 = IMAGE_SAMPLE_D_CL_O_V1_V8_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 1, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
     %7:vreg_96 = IMAGE_SAMPLE_D_CL_O_V3_V8_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 14, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
@@ -1086,7 +1086,7 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
     %6:vgpr_32 = IMAGE_SAMPLE_D_O_V1_V7_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 1, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
     %7:vreg_96 = IMAGE_SAMPLE_D_O_V3_V7_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 14, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
@@ -1106,7 +1106,7 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
     %6:vgpr_32 = IMAGE_SAMPLE_LZ_V1_V2_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 1, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
     %7:vreg_96 = IMAGE_SAMPLE_LZ_V3_V2_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 14, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
@@ -1126,7 +1126,7 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
     %6:vgpr_32 = IMAGE_SAMPLE_LZ_O_V1_V3_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 1, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
     %7:vreg_96 = IMAGE_SAMPLE_LZ_O_V3_V3_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 14, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
@@ -1146,7 +1146,7 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
     %6:vgpr_32 = IMAGE_SAMPLE_L_O_V1_V4_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 1, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
     %7:vreg_96 = IMAGE_SAMPLE_L_O_V3_V4_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 14, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
@@ -1166,7 +1166,7 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
     %6:vgpr_32 = IMAGE_SAMPLE_O_V1_V3_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 1, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
     %7:vreg_96 = IMAGE_SAMPLE_O_V3_V3_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 14, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...

diff  --git a/llvm/test/CodeGen/AMDGPU/merge-image-sample.mir b/llvm/test/CodeGen/AMDGPU/merge-image-sample.mir
index 304b67786fa3..1c51d3a4a104 100644
--- a/llvm/test/CodeGen/AMDGPU/merge-image-sample.mir
+++ b/llvm/test/CodeGen/AMDGPU/merge-image-sample.mir
@@ -1,7 +1,7 @@
 # RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass si-load-store-opt -o - %s | FileCheck -check-prefix=GFX9 %s
 
 # GFX9-LABEL: name: image_sample_l_merged_v1v3
-# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_L_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4)
+# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_L_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4)
 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0
 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3
 
@@ -13,13 +13,14 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
-    %6:vgpr_32 = IMAGE_SAMPLE_L_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
-    %7:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
+    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %6:vgpr_32 = IMAGE_SAMPLE_L_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
+    %7:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
 ---
+
 # GFX9-LABEL: name: image_sample_l_merged_v1v3_reversed
-# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_L_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4)
+# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_L_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4)
 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %8.sub3
 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub0_sub1_sub2
 
@@ -31,14 +32,14 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
-    %6:vgpr_32 = IMAGE_SAMPLE_L_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 8, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
-    %7:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 7, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
+    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %6:vgpr_32 = IMAGE_SAMPLE_L_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 8, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
+    %7:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 7, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
 ---
 
 # GFX9-LABEL: name: image_sample_l_merged_v2v2
-# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_L_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, addrspace 4)
+# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_L_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, addrspace 4)
 # GFX9: %{{[0-9]+}}:vreg_64 = COPY %8.sub0_sub1
 # GFX9: %{{[0-9]+}}:vreg_64 = COPY killed %8.sub2_sub3
 
@@ -50,14 +51,14 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
-    %6:vreg_64 = IMAGE_SAMPLE_L_V2_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 3, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 8, align 16, addrspace 4)
-    %7:vreg_64 = IMAGE_SAMPLE_L_V2_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 12, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 8, align 16, addrspace 4)
+    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %6:vreg_64 = IMAGE_SAMPLE_L_V2_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 3, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 8, align 16, addrspace 4)
+    %7:vreg_64 = IMAGE_SAMPLE_L_V2_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 12, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 8, align 16, addrspace 4)
 ...
 ---
 
 # GFX9-LABEL: name: image_sample_l_merged_v2v2_reversed
-# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_L_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, addrspace 4)
+# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_L_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, addrspace 4)
 # GFX9: %{{[0-9]+}}:vreg_64 = COPY %8.sub2_sub3
 # GFX9: %{{[0-9]+}}:vreg_64 = COPY killed %8.sub0_sub1
 
@@ -69,14 +70,14 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
-    %6:vreg_64 = IMAGE_SAMPLE_L_V2_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 12, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 8, align 16, addrspace 4)
-    %7:vreg_64 = IMAGE_SAMPLE_L_V2_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 3, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 8, align 16, addrspace 4)
+    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %6:vreg_64 = IMAGE_SAMPLE_L_V2_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 12, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 8, align 16, addrspace 4)
+    %7:vreg_64 = IMAGE_SAMPLE_L_V2_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 3, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 8, align 16, addrspace 4)
 ...
 ---
 
 # GFX9-LABEL: name: image_sample_l_merged_v3v1
-# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_L_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, addrspace 4)
+# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_L_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, addrspace 4)
 # GFX9: %{{[0-9]+}}:vreg_96 = COPY %8.sub0_sub1_sub2
 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY killed %8.sub3
 
@@ -88,14 +89,14 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
-    %6:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 7, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
-    %7:vgpr_32 = IMAGE_SAMPLE_L_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 8, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
+    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %6:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 7, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
+    %7:vgpr_32 = IMAGE_SAMPLE_L_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 8, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
 ...
 ---
 
 # GFX9-LABEL: name: image_sample_l_merged_v3v1_reversed
-# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_L_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, addrspace 4)
+# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_L_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, addrspace 4)
 # GFX9: %{{[0-9]+}}:vreg_96 = COPY %8.sub1_sub2_sub3
 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY killed %8.sub0
 
@@ -107,14 +108,14 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
-    %6:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
-    %7:vgpr_32 = IMAGE_SAMPLE_L_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
+    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %6:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
+    %7:vgpr_32 = IMAGE_SAMPLE_L_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
 ...
 ---
 
 # GFX9-LABEL: name: image_sample_l_divided_merged
-# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_L_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4)
+# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_L_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4)
 
 name: image_sample_l_divided_merged
 body:             |
@@ -124,19 +125,19 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
-    %6:vgpr_32 = IMAGE_SAMPLE_L_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 8, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
-    %7:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
-    %8:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
-    %9:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %7:vreg_128, %3:sgpr_256, %2:sgpr_128, 7, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
-    %10:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
-    %11:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 7, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
+    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %6:vgpr_32 = IMAGE_SAMPLE_L_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 8, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
+    %7:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %8:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %9:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %7:vreg_128, %3:sgpr_256, %2:sgpr_128, 7, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
+    %10:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %11:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 7, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
 ---
 
 # GFX9-LABEL: name: image_sample_l_divided_not_merged
-# GFX9: %{{[0-9]+}}:vgpr_32 = IMAGE_SAMPLE_L_V1_V4 %5, %3, %2, 8, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
-# GFX9: %{{[0-9]+}}:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %5, %3, %2, 7, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
+# GFX9: %{{[0-9]+}}:vgpr_32 = IMAGE_SAMPLE_L_V1_V4 %5, %3, %2, 8, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
+# GFX9: %{{[0-9]+}}:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %5, %3, %2, 7, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 
 name: image_sample_l_divided_not_merged
 body:             |
@@ -146,16 +147,16 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vreg_128 = COPY %2
-    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
-    %6:vgpr_32 = IMAGE_SAMPLE_L_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 8, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
-    IMAGE_STORE_V4_V4 %4:vreg_128, %5:vreg_128, %3:sgpr_256, 15, -1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16)
-    %7:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 7, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
+    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %6:vgpr_32 = IMAGE_SAMPLE_L_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 8, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
+    IMAGE_STORE_V4_V4 %4:vreg_128, %5:vreg_128, %3:sgpr_256, 15, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16)
+    %7:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 7, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
 ---
 
 # GFX9-LABEL: name: image_sample_l_dmask_overlapped_not_merged
-# GFX9: %{{[0-9]+}}:vgpr_32 = IMAGE_SAMPLE_L_V1_V4 %5, %3, %2, 4, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
-# GFX9: %{{[0-9]+}}:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %5, %3, %2, 7, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
+# GFX9: %{{[0-9]+}}:vgpr_32 = IMAGE_SAMPLE_L_V1_V4 %5, %3, %2, 4, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
+# GFX9: %{{[0-9]+}}:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %5, %3, %2, 7, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 
 name: image_sample_l_dmask_overlapped_not_merged
 body:             |
@@ -165,15 +166,15 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
-    %6:vgpr_32 = IMAGE_SAMPLE_L_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 4, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
-    %7:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 7, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
+    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %6:vgpr_32 = IMAGE_SAMPLE_L_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 4, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
+    %7:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 7, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
 ---
 
 # GFX9-LABEL: name: image_sample_l_dmask_not_disjoint_not_merged
-# GFX9: %{{[0-9]+}}:vgpr_32 = IMAGE_SAMPLE_L_V1_V4 %5, %3, %2, 4, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
-# GFX9: %{{[0-9]+}}:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %5, %3, %2, 11, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
+# GFX9: %{{[0-9]+}}:vgpr_32 = IMAGE_SAMPLE_L_V1_V4 %5, %3, %2, 4, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
+# GFX9: %{{[0-9]+}}:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %5, %3, %2, 11, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 
 name: image_sample_l_dmask_not_disjoint_not_merged
 body:             |
@@ -183,15 +184,15 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
-    %6:vgpr_32 = IMAGE_SAMPLE_L_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 4, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
-    %7:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 11, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
+    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %6:vgpr_32 = IMAGE_SAMPLE_L_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 4, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
+    %7:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 11, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
 ---
 
 # GFX9-LABEL: name: image_sample_l_not_merged_0
-# GFX9: %{{[0-9]+}}:vgpr_32 = IMAGE_SAMPLE_L_V1_V4 %5, %3, %2, 8, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
-# GFX9: %{{[0-9]+}}:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %6, %3, %2, 7, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
+# GFX9: %{{[0-9]+}}:vgpr_32 = IMAGE_SAMPLE_L_V1_V4 %5, %3, %2, 8, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
+# GFX9: %{{[0-9]+}}:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %6, %3, %2, 7, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 
 name: image_sample_l_not_merged_0
 body:             |
@@ -201,16 +202,16 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
-    %6:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2, 1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
-    %7:vgpr_32 = IMAGE_SAMPLE_L_V1_V4 %5, %3, %2, 8, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
-    %8:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %6, %3, %2, 7, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
+    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %6:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %7:vgpr_32 = IMAGE_SAMPLE_L_V1_V4 %5, %3, %2, 8, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
+    %8:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %6, %3, %2, 7, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
 ---
 
 # GFX9-LABEL: name: image_sample_l_not_merged_1
-# GFX9: %{{[0-9]+}}:vgpr_32 = IMAGE_SAMPLE_L_V1_V4 %6, %3, %2, 8, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
-# GFX9: %{{[0-9]+}}:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %6, %4, %2, 7, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
+# GFX9: %{{[0-9]+}}:vgpr_32 = IMAGE_SAMPLE_L_V1_V4 %6, %3, %2, 8, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
+# GFX9: %{{[0-9]+}}:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %6, %4, %2, 7, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 
 name: image_sample_l_not_merged_1
 body:             |
@@ -221,15 +222,15 @@ body:             |
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %5:vgpr_32 = COPY %2.sub3
-    %6:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
-    %7:vgpr_32 = IMAGE_SAMPLE_L_V1_V4 %6, %3, %2, 8, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
-    %8:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %6, %4, %2, 7, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
+    %6:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %7:vgpr_32 = IMAGE_SAMPLE_L_V1_V4 %6, %3, %2, 8, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
+    %8:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %6, %4, %2, 7, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
 ---
 
 # GFX9-LABEL: name: image_sample_l_not_merged_2
-# GFX9: %{{[0-9]+}}:vgpr_32 = IMAGE_SAMPLE_L_V1_V4 %6, %4, %2, 8, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
-# GFX9: %{{[0-9]+}}:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %6, %4, %3, 7, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
+# GFX9: %{{[0-9]+}}:vgpr_32 = IMAGE_SAMPLE_L_V1_V4 %6, %4, %2, 8, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
+# GFX9: %{{[0-9]+}}:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %6, %4, %3, 7, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 
 name: image_sample_l_not_merged_2
 body:             |
@@ -240,15 +241,15 @@ body:             |
     %3:sgpr_128 = COPY $sgpr92_sgpr93_sgpr94_sgpr95
     %4:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %5:vgpr_32 = COPY %2.sub3
-    %6:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
-    %7:vgpr_32 = IMAGE_SAMPLE_L_V1_V4 %6, %4, %2, 8, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
-    %8:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %6, %4, %3, 7, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
+    %6:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %7:vgpr_32 = IMAGE_SAMPLE_L_V1_V4 %6, %4, %2, 8, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
+    %8:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %6, %4, %3, 7, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
 ---
 
 # GFX9-LABEL: name: image_sample_l_not_merged_3
-# GFX9: %{{[0-9]+}}:vgpr_32 = IMAGE_SAMPLE_L_V1_V4 %5, %3, %2, 8, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
-# GFX9: %{{[0-9]+}}:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %5, %3, %2, 7, 1, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
+# GFX9: %{{[0-9]+}}:vgpr_32 = IMAGE_SAMPLE_L_V1_V4 %5, %3, %2, 8, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
+# GFX9: %{{[0-9]+}}:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %5, %3, %2, 7, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 
 name: image_sample_l_not_merged_3
 body:             |
@@ -258,15 +259,15 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
-    %6:vgpr_32 = IMAGE_SAMPLE_L_V1_V4 %5, %3, %2, 8, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
-    %7:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %5, %3, %2, 7, 1, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
+    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %6:vgpr_32 = IMAGE_SAMPLE_L_V1_V4 %5, %3, %2, 8, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
+    %7:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %5, %3, %2, 7, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
 ---
 
 # GFX9-LABEL: name: image_sample_l_not_merged_4
-# GFX9: %{{[0-9]+}}:vgpr_32 = IMAGE_SAMPLE_L_V1_V4 %5, %3, %2, 8, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
-# GFX9: %{{[0-9]+}}:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %5, %3, %2, 7, 0, 1, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
+# GFX9: %{{[0-9]+}}:vgpr_32 = IMAGE_SAMPLE_L_V1_V4 %5, %3, %2, 8, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
+# GFX9: %{{[0-9]+}}:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %5, %3, %2, 7, 0, 0, 1, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 
 name: image_sample_l_not_merged_4
 body:             |
@@ -276,15 +277,15 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
-    %6:vgpr_32 = IMAGE_SAMPLE_L_V1_V4 %5, %3, %2, 8, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
-    %7:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %5, %3, %2, 7, 0, 1, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
+    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %6:vgpr_32 = IMAGE_SAMPLE_L_V1_V4 %5, %3, %2, 8, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
+    %7:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %5, %3, %2, 7, 0, 0, 1, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
 ---
 
 # GFX9-LABEL: name: image_sample_l_not_merged_5
-# GFX9: %{{[0-9]+}}:vgpr_32 = IMAGE_SAMPLE_L_V1_V4 %5, %3, %2, 8, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
-# GFX9: %{{[0-9]+}}:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %5, %3, %2, 7, 0, 0, 1, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
+# GFX9: %{{[0-9]+}}:vgpr_32 = IMAGE_SAMPLE_L_V1_V4 %5, %3, %2, 8, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
+# GFX9: %{{[0-9]+}}:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %5, %3, %2, 7, 0, 0, 0, 1, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 
 name: image_sample_l_not_merged_5
 body:             |
@@ -294,15 +295,15 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
-    %6:vgpr_32 = IMAGE_SAMPLE_L_V1_V4 %5, %3, %2, 8, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
-    %7:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %5, %3, %2, 7, 0, 0, 1, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
+    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %6:vgpr_32 = IMAGE_SAMPLE_L_V1_V4 %5, %3, %2, 8, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
+    %7:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %5, %3, %2, 7, 0, 0, 0, 1, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
 ---
 
 # GFX9-LABEL: name: image_sample_l_not_merged_6
-# GFX9: %{{[0-9]+}}:vgpr_32 = IMAGE_SAMPLE_L_V1_V4 %5, %3, %2, 8, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
-# GFX9: %{{[0-9]+}}:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %5, %3, %2, 7, 0, 0, 0, 1, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
+# GFX9: %{{[0-9]+}}:vgpr_32 = IMAGE_SAMPLE_L_V1_V4 %5, %3, %2, 8, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
+# GFX9: %{{[0-9]+}}:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %5, %3, %2, 7, 0, 0, 0, 0, 1, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 
 name: image_sample_l_not_merged_6
 body:             |
@@ -312,15 +313,15 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
-    %6:vgpr_32 = IMAGE_SAMPLE_L_V1_V4 %5, %3, %2, 8, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
-    %7:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %5, %3, %2, 7, 0, 0, 0, 1, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
+    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %6:vgpr_32 = IMAGE_SAMPLE_L_V1_V4 %5, %3, %2, 8, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
+    %7:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %5, %3, %2, 7, 0, 0, 0, 0, 1, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
 ---
 
 # GFX9-LABEL: name: image_sample_l_not_merged_7
-# GFX9: %{{[0-9]+}}:vreg_64 = IMAGE_SAMPLE_L_V2_V4 %5, %3, %2, 8, 0, 0, 0, 0, 1, 0, -1, 0, implicit $exec :: (dereferenceable load 8, addrspace 4)
-# GFX9: %{{[0-9]+}}:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %5, %3, %2, 7, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
+# GFX9: %{{[0-9]+}}:vreg_64 = IMAGE_SAMPLE_L_V2_V4 %5, %3, %2, 8, 0, 0, 0, 0, 0, 1, 0, -1, 0, implicit $exec :: (dereferenceable load 8, addrspace 4)
+# GFX9: %{{[0-9]+}}:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %5, %3, %2, 7, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 
 name: image_sample_l_not_merged_7
 body:             |
@@ -330,15 +331,15 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
-    %6:vreg_64 = IMAGE_SAMPLE_L_V2_V4 %5, %3, %2, 8, 0, 0, 0, 0, 1, 0, -1, 0, implicit $exec :: (dereferenceable load 8, addrspace 4)
-    %7:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %5, %3, %2, 7, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
+    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %6:vreg_64 = IMAGE_SAMPLE_L_V2_V4 %5, %3, %2, 8, 0, 0, 0, 0, 0, 1, 0, -1, 0, implicit $exec :: (dereferenceable load 8, addrspace 4)
+    %7:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %5, %3, %2, 7, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
 ---
 
 # GFX9-LABEL: name: image_sample_l_not_merged_8
-# GFX9: %{{[0-9]+}}:vreg_64 = IMAGE_SAMPLE_L_V2_V4 %5, %3, %2, 8, 0, 0, 0, 0, 0, 1, -1, 0, implicit $exec :: (dereferenceable load 8, addrspace 4)
-# GFX9: %{{[0-9]+}}:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %5, %3, %2, 7, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
+# GFX9: %{{[0-9]+}}:vreg_64 = IMAGE_SAMPLE_L_V2_V4 %5, %3, %2, 8, 0, 0, 0, 0, 0, 0, 1, -1, 0, implicit $exec :: (dereferenceable load 8, addrspace 4)
+# GFX9: %{{[0-9]+}}:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %5, %3, %2, 7, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 
 name: image_sample_l_not_merged_8
 body:             |
@@ -348,15 +349,15 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
-    %6:vreg_64 = IMAGE_SAMPLE_L_V2_V4 %5, %3, %2, 8, 0, 0, 0, 0, 0, 1, -1, 0, implicit $exec :: (dereferenceable load 8, addrspace 4)
-    %7:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %5, %3, %2, 7, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
+    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %6:vreg_64 = IMAGE_SAMPLE_L_V2_V4 %5, %3, %2, 8, 0, 0, 0, 0, 0, 0, 1, -1, 0, implicit $exec :: (dereferenceable load 8, addrspace 4)
+    %7:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %5, %3, %2, 7, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
 ---
 
 # GFX9-LABEL: name: image_sample_l_not_merged_9
-# GFX9: %{{[0-9]+}}:vgpr_32 = IMAGE_SAMPLE_L_V1_V4 %5, %3, %2, 8, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
-# GFX9: %{{[0-9]+}}:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %5, %3, %2, 7, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
+# GFX9: %{{[0-9]+}}:vgpr_32 = IMAGE_SAMPLE_L_V1_V4 %5, %3, %2, 8, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
+# GFX9: %{{[0-9]+}}:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %5, %3, %2, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 
 name: image_sample_l_not_merged_9
 body:             |
@@ -366,15 +367,15 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
-    %6:vgpr_32 = IMAGE_SAMPLE_L_V1_V4 %5, %3, %2, 8, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
-    %7:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %5, %3, %2, 7, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
+    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %6:vgpr_32 = IMAGE_SAMPLE_L_V1_V4 %5, %3, %2, 8, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
+    %7:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %5, %3, %2, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
 ---
 
 # GFX9-LABEL: name: image_sample_l_not_merged_10
-# GFX9: %{{[0-9]+}}:vgpr_32 = IMAGE_SAMPLE_L_V1_V4 %5, %3, %2, 8, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
-# GFX9: %{{[0-9]+}}:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %5, %3, %2, 7, 0, 0, 0, 0, 0, 0, -1, 1, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
+# GFX9: %{{[0-9]+}}:vgpr_32 = IMAGE_SAMPLE_L_V1_V4 %5, %3, %2, 8, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
+# GFX9: %{{[0-9]+}}:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %5, %3, %2, 7, 0, 0, 0, 0, 0, 0, 0, -1, 1, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 
 name: image_sample_l_not_merged_10
 body:             |
@@ -384,17 +385,14 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
-    %6:vgpr_32 = IMAGE_SAMPLE_L_V1_V4 %5, %3, %2, 8, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
-    %7:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %5, %3, %2, 7, 0, 0, 0, 0, 0, 0, -1, 1, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
+    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %6:vgpr_32 = IMAGE_SAMPLE_L_V1_V4 %5, %3, %2, 8, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
+    %7:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %5, %3, %2, 7, 0, 0, 0, 0, 0, 0, 0, -1, 1, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
 ---
 
-
-
-
 # GFX9-LABEL: name: image_sample_merged_v1v3
-# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4)
+# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4)
 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0
 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3
 
@@ -406,15 +404,14 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
-    %6:vgpr_32 = IMAGE_SAMPLE_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
-    %7:vreg_96 = IMAGE_SAMPLE_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
+    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %6:vgpr_32 = IMAGE_SAMPLE_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
+    %7:vreg_96 = IMAGE_SAMPLE_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
 ---
 
-
 # GFX9-LABEL: name: image_sample_b_merged_v1v3
-# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_B_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4)
+# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_B_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4)
 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0
 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3
 
@@ -426,15 +423,14 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
-    %6:vgpr_32 = IMAGE_SAMPLE_B_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
-    %7:vreg_96 = IMAGE_SAMPLE_B_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
+    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %6:vgpr_32 = IMAGE_SAMPLE_B_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
+    %7:vreg_96 = IMAGE_SAMPLE_B_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
 ---
 
-
 # GFX9-LABEL: name: image_sample_b_cl_merged_v1v3
-# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_B_CL_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4)
+# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_B_CL_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4)
 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0
 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3
 
@@ -446,15 +442,14 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
-    %6:vgpr_32 = IMAGE_SAMPLE_B_CL_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
-    %7:vreg_96 = IMAGE_SAMPLE_B_CL_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
+    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %6:vgpr_32 = IMAGE_SAMPLE_B_CL_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
+    %7:vreg_96 = IMAGE_SAMPLE_B_CL_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
 ---
 
-
 # GFX9-LABEL: name: image_sample_b_cl_o_merged_v1v3
-# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_B_CL_O_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4)
+# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_B_CL_O_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4)
 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0
 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3
 
@@ -466,15 +461,14 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
-    %6:vgpr_32 = IMAGE_SAMPLE_B_CL_O_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
-    %7:vreg_96 = IMAGE_SAMPLE_B_CL_O_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
+    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %6:vgpr_32 = IMAGE_SAMPLE_B_CL_O_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
+    %7:vreg_96 = IMAGE_SAMPLE_B_CL_O_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
 ---
 
-
 # GFX9-LABEL: name: image_sample_b_o_merged_v1v3
-# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_B_O_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4)
+# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_B_O_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4)
 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0
 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3
 
@@ -486,15 +480,14 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
-    %6:vgpr_32 = IMAGE_SAMPLE_B_O_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
-    %7:vreg_96 = IMAGE_SAMPLE_B_O_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
+    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %6:vgpr_32 = IMAGE_SAMPLE_B_O_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
+    %7:vreg_96 = IMAGE_SAMPLE_B_O_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
 ---
 
-
 # GFX9-LABEL: name: image_sample_c_merged_v1v3
-# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_C_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4)
+# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_C_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4)
 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0
 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3
 
@@ -506,15 +499,14 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
-    %6:vgpr_32 = IMAGE_SAMPLE_C_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
-    %7:vreg_96 = IMAGE_SAMPLE_C_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
+    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %6:vgpr_32 = IMAGE_SAMPLE_C_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
+    %7:vreg_96 = IMAGE_SAMPLE_C_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
 ---
 
-
 # GFX9-LABEL: name: image_sample_cd_merged_v1v3
-# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_CD_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4)
+# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_CD_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4)
 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0
 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3
 
@@ -526,15 +518,14 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
-    %6:vgpr_32 = IMAGE_SAMPLE_CD_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
-    %7:vreg_96 = IMAGE_SAMPLE_CD_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
+    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %6:vgpr_32 = IMAGE_SAMPLE_CD_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
+    %7:vreg_96 = IMAGE_SAMPLE_CD_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
 ---
 
-
 # GFX9-LABEL: name: image_sample_cd_cl_merged_v1v3
-# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_CD_CL_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4)
+# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_CD_CL_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4)
 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0
 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3
 
@@ -546,15 +537,14 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
-    %6:vgpr_32 = IMAGE_SAMPLE_CD_CL_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
-    %7:vreg_96 = IMAGE_SAMPLE_CD_CL_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
+    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %6:vgpr_32 = IMAGE_SAMPLE_CD_CL_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
+    %7:vreg_96 = IMAGE_SAMPLE_CD_CL_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
 ---
 
-
 # GFX9-LABEL: name: image_sample_cd_cl_o_merged_v1v3
-# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_CD_CL_O_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4)
+# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_CD_CL_O_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4)
 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0
 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3
 
@@ -566,15 +556,14 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
-    %6:vgpr_32 = IMAGE_SAMPLE_CD_CL_O_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
-    %7:vreg_96 = IMAGE_SAMPLE_CD_CL_O_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
+    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %6:vgpr_32 = IMAGE_SAMPLE_CD_CL_O_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
+    %7:vreg_96 = IMAGE_SAMPLE_CD_CL_O_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
 ---
 
-
 # GFX9-LABEL: name: image_sample_cd_o_merged_v1v3
-# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_CD_O_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4)
+# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_CD_O_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4)
 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0
 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3
 
@@ -586,15 +575,14 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
-    %6:vgpr_32 = IMAGE_SAMPLE_CD_O_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
-    %7:vreg_96 = IMAGE_SAMPLE_CD_O_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
+    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %6:vgpr_32 = IMAGE_SAMPLE_CD_O_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
+    %7:vreg_96 = IMAGE_SAMPLE_CD_O_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
 ---
 
-
 # GFX9-LABEL: name: image_sample_cl_merged_v1v3
-# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_CL_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4)
+# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_CL_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4)
 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0
 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3
 
@@ -606,15 +594,14 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
-    %6:vgpr_32 = IMAGE_SAMPLE_CL_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
-    %7:vreg_96 = IMAGE_SAMPLE_CL_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
+    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %6:vgpr_32 = IMAGE_SAMPLE_CL_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
+    %7:vreg_96 = IMAGE_SAMPLE_CL_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
 ---
 
-
 # GFX9-LABEL: name: image_sample_cl_o_merged_v1v3
-# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_CL_O_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4)
+# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_CL_O_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4)
 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0
 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3
 
@@ -626,15 +613,14 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
-    %6:vgpr_32 = IMAGE_SAMPLE_CL_O_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
-    %7:vreg_96 = IMAGE_SAMPLE_CL_O_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
+    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %6:vgpr_32 = IMAGE_SAMPLE_CL_O_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
+    %7:vreg_96 = IMAGE_SAMPLE_CL_O_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
 ---
 
-
 # GFX9-LABEL: name: image_sample_c_b_merged_v1v3
-# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_C_B_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4)
+# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_C_B_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4)
 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0
 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3
 
@@ -646,15 +632,14 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
-    %6:vgpr_32 = IMAGE_SAMPLE_C_B_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
-    %7:vreg_96 = IMAGE_SAMPLE_C_B_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
+    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %6:vgpr_32 = IMAGE_SAMPLE_C_B_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
+    %7:vreg_96 = IMAGE_SAMPLE_C_B_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
 ---
 
-
 # GFX9-LABEL: name: image_sample_c_b_cl_merged_v1v3
-# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_C_B_CL_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4)
+# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_C_B_CL_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4)
 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0
 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3
 
@@ -666,15 +651,14 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
-    %6:vgpr_32 = IMAGE_SAMPLE_C_B_CL_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
-    %7:vreg_96 = IMAGE_SAMPLE_C_B_CL_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
+    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %6:vgpr_32 = IMAGE_SAMPLE_C_B_CL_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
+    %7:vreg_96 = IMAGE_SAMPLE_C_B_CL_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
 ---
 
-
 # GFX9-LABEL: name: image_sample_c_b_cl_o_merged_v1v3
-# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_C_B_CL_O_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4)
+# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_C_B_CL_O_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4)
 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0
 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3
 
@@ -686,15 +670,14 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
-    %6:vgpr_32 = IMAGE_SAMPLE_C_B_CL_O_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
-    %7:vreg_96 = IMAGE_SAMPLE_C_B_CL_O_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
+    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %6:vgpr_32 = IMAGE_SAMPLE_C_B_CL_O_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
+    %7:vreg_96 = IMAGE_SAMPLE_C_B_CL_O_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
 ---
 
-
 # GFX9-LABEL: name: image_sample_c_b_o_merged_v1v3
-# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_C_B_O_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4)
+# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_C_B_O_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4)
 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0
 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3
 
@@ -706,15 +689,14 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
-    %6:vgpr_32 = IMAGE_SAMPLE_C_B_O_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
-    %7:vreg_96 = IMAGE_SAMPLE_C_B_O_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
+    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %6:vgpr_32 = IMAGE_SAMPLE_C_B_O_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
+    %7:vreg_96 = IMAGE_SAMPLE_C_B_O_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
 ---
 
-
 # GFX9-LABEL: name: image_sample_c_cd_merged_v1v3
-# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_C_CD_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4)
+# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_C_CD_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4)
 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0
 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3
 
@@ -726,15 +708,14 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
-    %6:vgpr_32 = IMAGE_SAMPLE_C_CD_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
-    %7:vreg_96 = IMAGE_SAMPLE_C_CD_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
+    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %6:vgpr_32 = IMAGE_SAMPLE_C_CD_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
+    %7:vreg_96 = IMAGE_SAMPLE_C_CD_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
 ---
 
-
 # GFX9-LABEL: name: image_sample_c_cd_cl_merged_v1v3
-# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_C_CD_CL_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4)
+# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_C_CD_CL_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4)
 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0
 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3
 
@@ -746,15 +727,14 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
-    %6:vgpr_32 = IMAGE_SAMPLE_C_CD_CL_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
-    %7:vreg_96 = IMAGE_SAMPLE_C_CD_CL_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
+    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %6:vgpr_32 = IMAGE_SAMPLE_C_CD_CL_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
+    %7:vreg_96 = IMAGE_SAMPLE_C_CD_CL_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
 ---
 
-
 # GFX9-LABEL: name: image_sample_c_cd_cl_o_merged_v1v3
-# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_C_CD_CL_O_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4)
+# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_C_CD_CL_O_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4)
 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0
 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3
 
@@ -766,15 +746,14 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
-    %6:vgpr_32 = IMAGE_SAMPLE_C_CD_CL_O_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
-    %7:vreg_96 = IMAGE_SAMPLE_C_CD_CL_O_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
+    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %6:vgpr_32 = IMAGE_SAMPLE_C_CD_CL_O_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
+    %7:vreg_96 = IMAGE_SAMPLE_C_CD_CL_O_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
 ---
 
-
 # GFX9-LABEL: name: image_sample_c_cd_o_merged_v1v3
-# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_C_CD_O_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4)
+# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_C_CD_O_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4)
 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0
 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3
 
@@ -786,15 +765,14 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
-    %6:vgpr_32 = IMAGE_SAMPLE_C_CD_O_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
-    %7:vreg_96 = IMAGE_SAMPLE_C_CD_O_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
+    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %6:vgpr_32 = IMAGE_SAMPLE_C_CD_O_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
+    %7:vreg_96 = IMAGE_SAMPLE_C_CD_O_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
 ---
 
-
 # GFX9-LABEL: name: image_sample_c_cl_merged_v1v3
-# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_C_CL_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4)
+# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_C_CL_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4)
 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0
 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3
 
@@ -806,15 +784,14 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
-    %6:vgpr_32 = IMAGE_SAMPLE_C_CL_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
-    %7:vreg_96 = IMAGE_SAMPLE_C_CL_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
+    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %6:vgpr_32 = IMAGE_SAMPLE_C_CL_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
+    %7:vreg_96 = IMAGE_SAMPLE_C_CL_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
 ---
 
-
 # GFX9-LABEL: name: image_sample_c_cl_o_merged_v1v3
-# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_C_CL_O_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4)
+# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_C_CL_O_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4)
 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0
 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3
 
@@ -826,15 +803,14 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
-    %6:vgpr_32 = IMAGE_SAMPLE_C_CL_O_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
-    %7:vreg_96 = IMAGE_SAMPLE_C_CL_O_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
+    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %6:vgpr_32 = IMAGE_SAMPLE_C_CL_O_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
+    %7:vreg_96 = IMAGE_SAMPLE_C_CL_O_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
 ---
 
-
 # GFX9-LABEL: name: image_sample_c_d_merged_v1v3
-# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_C_D_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4)
+# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_C_D_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4)
 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0
 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3
 
@@ -846,15 +822,14 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
-    %6:vgpr_32 = IMAGE_SAMPLE_C_D_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
-    %7:vreg_96 = IMAGE_SAMPLE_C_D_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
+    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %6:vgpr_32 = IMAGE_SAMPLE_C_D_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
+    %7:vreg_96 = IMAGE_SAMPLE_C_D_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
 ---
 
-
 # GFX9-LABEL: name: image_sample_c_d_cl_merged_v1v3
-# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_C_D_CL_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4)
+# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_C_D_CL_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4)
 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0
 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3
 
@@ -866,15 +841,14 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
-    %6:vgpr_32 = IMAGE_SAMPLE_C_D_CL_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
-    %7:vreg_96 = IMAGE_SAMPLE_C_D_CL_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
+    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %6:vgpr_32 = IMAGE_SAMPLE_C_D_CL_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
+    %7:vreg_96 = IMAGE_SAMPLE_C_D_CL_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
 ---
 
-
 # GFX9-LABEL: name: image_sample_c_d_cl_o_merged_v1v3
-# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_C_D_CL_O_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4)
+# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_C_D_CL_O_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4)
 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0
 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3
 
@@ -886,15 +860,14 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
-    %6:vgpr_32 = IMAGE_SAMPLE_C_D_CL_O_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
-    %7:vreg_96 = IMAGE_SAMPLE_C_D_CL_O_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
+    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %6:vgpr_32 = IMAGE_SAMPLE_C_D_CL_O_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
+    %7:vreg_96 = IMAGE_SAMPLE_C_D_CL_O_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
 ---
 
-
 # GFX9-LABEL: name: image_sample_c_d_o_merged_v1v3
-# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_C_D_O_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4)
+# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_C_D_O_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4)
 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0
 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3
 
@@ -906,15 +879,14 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
-    %6:vgpr_32 = IMAGE_SAMPLE_C_D_O_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
-    %7:vreg_96 = IMAGE_SAMPLE_C_D_O_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
+    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %6:vgpr_32 = IMAGE_SAMPLE_C_D_O_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
+    %7:vreg_96 = IMAGE_SAMPLE_C_D_O_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
 ---
 
-
 # GFX9-LABEL: name: image_sample_c_l_merged_v1v3
-# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_C_L_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4)
+# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_C_L_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4)
 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0
 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3
 
@@ -926,15 +898,14 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
-    %6:vgpr_32 = IMAGE_SAMPLE_C_L_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
-    %7:vreg_96 = IMAGE_SAMPLE_C_L_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
+    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %6:vgpr_32 = IMAGE_SAMPLE_C_L_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
+    %7:vreg_96 = IMAGE_SAMPLE_C_L_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
 ---
 
-
 # GFX9-LABEL: name: image_sample_c_lz_merged_v1v3
-# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_C_LZ_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4)
+# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_C_LZ_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4)
 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0
 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3
 
@@ -946,15 +917,14 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
-    %6:vgpr_32 = IMAGE_SAMPLE_C_LZ_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
-    %7:vreg_96 = IMAGE_SAMPLE_C_LZ_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
+    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %6:vgpr_32 = IMAGE_SAMPLE_C_LZ_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
+    %7:vreg_96 = IMAGE_SAMPLE_C_LZ_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
 ---
 
-
 # GFX9-LABEL: name: image_sample_c_lz_o_merged_v1v3
-# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_C_LZ_O_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4)
+# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_C_LZ_O_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4)
 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0
 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3
 
@@ -966,15 +936,14 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
-    %6:vgpr_32 = IMAGE_SAMPLE_C_LZ_O_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
-    %7:vreg_96 = IMAGE_SAMPLE_C_LZ_O_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
+    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %6:vgpr_32 = IMAGE_SAMPLE_C_LZ_O_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
+    %7:vreg_96 = IMAGE_SAMPLE_C_LZ_O_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
 ---
 
-
 # GFX9-LABEL: name: image_sample_c_l_o_merged_v1v3
-# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_C_L_O_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4)
+# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_C_L_O_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4)
 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0
 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3
 
@@ -986,15 +955,14 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
-    %6:vgpr_32 = IMAGE_SAMPLE_C_L_O_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
-    %7:vreg_96 = IMAGE_SAMPLE_C_L_O_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
+    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %6:vgpr_32 = IMAGE_SAMPLE_C_L_O_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
+    %7:vreg_96 = IMAGE_SAMPLE_C_L_O_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
 ---
 
-
 # GFX9-LABEL: name: image_sample_c_o_merged_v1v3
-# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_C_O_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4)
+# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_C_O_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4)
 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0
 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3
 
@@ -1006,15 +974,14 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
-    %6:vgpr_32 = IMAGE_SAMPLE_C_O_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
-    %7:vreg_96 = IMAGE_SAMPLE_C_O_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
+    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %6:vgpr_32 = IMAGE_SAMPLE_C_O_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
+    %7:vreg_96 = IMAGE_SAMPLE_C_O_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
 ---
 
-
 # GFX9-LABEL: name: image_sample_d_merged_v1v3
-# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_D_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4)
+# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_D_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4)
 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0
 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3
 
@@ -1026,15 +993,14 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
-    %6:vgpr_32 = IMAGE_SAMPLE_D_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
-    %7:vreg_96 = IMAGE_SAMPLE_D_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
+    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %6:vgpr_32 = IMAGE_SAMPLE_D_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
+    %7:vreg_96 = IMAGE_SAMPLE_D_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
 ---
 
-
 # GFX9-LABEL: name: image_sample_d_cl_merged_v1v3
-# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_D_CL_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4)
+# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_D_CL_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4)
 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0
 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3
 
@@ -1046,15 +1012,14 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
-    %6:vgpr_32 = IMAGE_SAMPLE_D_CL_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
-    %7:vreg_96 = IMAGE_SAMPLE_D_CL_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
+    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %6:vgpr_32 = IMAGE_SAMPLE_D_CL_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
+    %7:vreg_96 = IMAGE_SAMPLE_D_CL_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
 ---
 
-
 # GFX9-LABEL: name: image_sample_d_cl_o_merged_v1v3
-# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_D_CL_O_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4)
+# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_D_CL_O_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4)
 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0
 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3
 
@@ -1066,15 +1031,14 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
-    %6:vgpr_32 = IMAGE_SAMPLE_D_CL_O_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
-    %7:vreg_96 = IMAGE_SAMPLE_D_CL_O_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
+    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %6:vgpr_32 = IMAGE_SAMPLE_D_CL_O_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
+    %7:vreg_96 = IMAGE_SAMPLE_D_CL_O_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
 ---
 
-
 # GFX9-LABEL: name: image_sample_d_o_merged_v1v3
-# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_D_O_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4)
+# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_D_O_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4)
 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0
 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3
 
@@ -1086,15 +1050,14 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
-    %6:vgpr_32 = IMAGE_SAMPLE_D_O_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
-    %7:vreg_96 = IMAGE_SAMPLE_D_O_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
+    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %6:vgpr_32 = IMAGE_SAMPLE_D_O_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
+    %7:vreg_96 = IMAGE_SAMPLE_D_O_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
 ---
 
-
 # GFX9-LABEL: name: image_sample_lz_merged_v1v3
-# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_LZ_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4)
+# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_LZ_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4)
 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0
 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3
 
@@ -1106,15 +1069,14 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
-    %6:vgpr_32 = IMAGE_SAMPLE_LZ_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
-    %7:vreg_96 = IMAGE_SAMPLE_LZ_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
+    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %6:vgpr_32 = IMAGE_SAMPLE_LZ_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
+    %7:vreg_96 = IMAGE_SAMPLE_LZ_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
 ---
 
-
 # GFX9-LABEL: name: image_sample_lz_o_merged_v1v3
-# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_LZ_O_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4)
+# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_LZ_O_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4)
 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0
 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3
 
@@ -1126,15 +1088,14 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
-    %6:vgpr_32 = IMAGE_SAMPLE_LZ_O_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
-    %7:vreg_96 = IMAGE_SAMPLE_LZ_O_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
+    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %6:vgpr_32 = IMAGE_SAMPLE_LZ_O_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
+    %7:vreg_96 = IMAGE_SAMPLE_LZ_O_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
 ---
 
-
 # GFX9-LABEL: name: image_sample_l_o_merged_v1v3
-# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_L_O_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4)
+# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_L_O_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4)
 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0
 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3
 
@@ -1146,15 +1107,14 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
-    %6:vgpr_32 = IMAGE_SAMPLE_L_O_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
-    %7:vreg_96 = IMAGE_SAMPLE_L_O_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
+    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %6:vgpr_32 = IMAGE_SAMPLE_L_O_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
+    %7:vreg_96 = IMAGE_SAMPLE_L_O_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
 ---
 
-
 # GFX9-LABEL: name: image_sample_o_merged_v1v3
-# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_O_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4)
+# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_O_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4)
 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0
 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3
 
@@ -1166,8 +1126,8 @@ body:             |
     %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
-    %6:vgpr_32 = IMAGE_SAMPLE_O_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
-    %7:vreg_96 = IMAGE_SAMPLE_O_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
+    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %6:vgpr_32 = IMAGE_SAMPLE_O_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
+    %7:vreg_96 = IMAGE_SAMPLE_O_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
 ---

diff  --git a/llvm/test/CodeGen/AMDGPU/merge-load-store-agpr.mir b/llvm/test/CodeGen/AMDGPU/merge-load-store-agpr.mir
new file mode 100644
index 000000000000..7f04cd2f9a27
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/merge-load-store-agpr.mir
@@ -0,0 +1,94 @@
+# RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs -run-pass si-load-store-opt -o - %s | FileCheck -enable-var-scope -check-prefix=GCN %s
+
+# GCN-LABEL: name: ds_read_b32_v_v
+# GCN: vreg_64 = DS_READ2_B32
+name: ds_read_b32_v_v
+body:             |
+  bb.0:
+
+    %0:vgpr_32 = IMPLICIT_DEF
+    %1:vgpr_32 = DS_READ_B32_gfx9 %0, 0, 0, implicit $exec :: (load 4 from `i32 addrspace(3)* undef`)
+    %2:vgpr_32 = DS_READ_B32_gfx9 %0, 8, 0, implicit $exec :: (load 4 from `i32 addrspace(3)* undef`)
+...
+
+# GCN-LABEL: name: ds_read_b32_a_a
+# GCN: areg_64 = DS_READ2_B32
+name: ds_read_b32_a_a
+body:             |
+  bb.0:
+
+    %0:vgpr_32 = IMPLICIT_DEF
+    %1:agpr_32 = DS_READ_B32_gfx9 %0, 0, 0, implicit $exec :: (load 4 from `i32 addrspace(3)* undef`)
+    %2:agpr_32 = DS_READ_B32_gfx9 %0, 8, 0, implicit $exec :: (load 4 from `i32 addrspace(3)* undef`)
+...
+
+# GCN-LABEL: name: ds_read_b32_v_a
+# GCN: vgpr_32 = DS_READ_B32
+# GCN: agpr_32 = DS_READ_B32
+name: ds_read_b32_v_a
+body:             |
+  bb.0:
+
+    %0:vgpr_32 = IMPLICIT_DEF
+    %1:vgpr_32 = DS_READ_B32_gfx9 %0, 0, 0, implicit $exec :: (load 4 from `i32 addrspace(3)* undef`)
+    %2:agpr_32 = DS_READ_B32_gfx9 %0, 8, 0, implicit $exec :: (load 4 from `i32 addrspace(3)* undef`)
+...
+
+# GCN-LABEL: name: ds_read_b32_a_v
+# GCN: agpr_32 = DS_READ_B32
+# GCN: vgpr_32 = DS_READ_B32
+name: ds_read_b32_a_v
+body:             |
+  bb.0:
+
+    %0:vgpr_32 = IMPLICIT_DEF
+    %1:agpr_32 = DS_READ_B32_gfx9 %0, 8, 0, implicit $exec :: (load 4 from `i32 addrspace(3)* undef`)
+    %2:vgpr_32 = DS_READ_B32_gfx9 %0, 0, 0, implicit $exec :: (load 4 from `i32 addrspace(3)* undef`)
+...
+
+# GCN-LABEL: name: ds_write_b32_v_v
+# GCN: DS_WRITE2_B32_gfx9 %0, undef %1:vgpr_32, undef %2:vgpr_32
+name: ds_write_b32_v_v
+body:             |
+  bb.0:
+
+    %0:vgpr_32 = IMPLICIT_DEF
+    DS_WRITE_B32_gfx9 %0, undef %1:vgpr_32, 0, 0, implicit $exec :: (store 4 into `i32 addrspace(3)* undef`)
+    DS_WRITE_B32_gfx9 %0, undef %2:vgpr_32, 8, 0, implicit $exec :: (store 4 into `i32 addrspace(3)* undef`)
+...
+
+# GCN-LABEL: name: ds_write_b32_a_a
+# GCN: DS_WRITE_B32_gfx9 %0, undef %1:agpr_32
+# GCN: DS_WRITE_B32_gfx9 %0, undef %2:agpr_32
+name: ds_write_b32_a_a
+body:             |
+  bb.0:
+
+    %0:vgpr_32 = IMPLICIT_DEF
+    DS_WRITE_B32_gfx9 %0, undef %1:agpr_32, 0, 0, implicit $exec :: (store 4 into `i32 addrspace(3)* undef`)
+    DS_WRITE_B32_gfx9 %0, undef %2:agpr_32, 8, 0, implicit $exec :: (store 4 into `i32 addrspace(3)* undef`)
+...
+
+# GCN-LABEL: name: ds_write_b32_v_a
+# GCN: DS_WRITE_B32_gfx9 %0, undef %1:vgpr_32
+# GCN: DS_WRITE_B32_gfx9 %0, undef %2:agpr_32
+name: ds_write_b32_v_a
+body:             |
+  bb.0:
+
+    %0:vgpr_32 = IMPLICIT_DEF
+    DS_WRITE_B32_gfx9 %0, undef %1:vgpr_32, 0, 0, implicit $exec :: (store 4 into `i32 addrspace(3)* undef`)
+    DS_WRITE_B32_gfx9 %0, undef %2:agpr_32, 8, 0, implicit $exec :: (store 4 into `i32 addrspace(3)* undef`)
+...
+
+# GCN-LABEL: name: ds_write_b32_a_v
+# GCN: DS_WRITE_B32_gfx9 %0, undef %1:agpr_32
+# GCN: DS_WRITE_B32_gfx9 %0, undef %2:vgpr_32
+name: ds_write_b32_a_v
+body:             |
+  bb.0:
+
+    %0:vgpr_32 = IMPLICIT_DEF
+    DS_WRITE_B32_gfx9 %0, undef %1:agpr_32, 0, 0, implicit $exec :: (store 4 into `i32 addrspace(3)* undef`)
+    DS_WRITE_B32_gfx9 %0, undef %2:vgpr_32, 8, 0, implicit $exec :: (store 4 into `i32 addrspace(3)* undef`)
+...

diff  --git a/llvm/test/CodeGen/AMDGPU/merge-load-store.mir b/llvm/test/CodeGen/AMDGPU/merge-load-store.mir
index 743594b91bbe..ac78dbdc40e1 100644
--- a/llvm/test/CodeGen/AMDGPU/merge-load-store.mir
+++ b/llvm/test/CodeGen/AMDGPU/merge-load-store.mir
@@ -170,10 +170,10 @@ body:             |
 ---
 # CHECK-LABEL: merge_mmos
 # CHECK: S_BUFFER_LOAD_DWORDX2_IMM %0, 0, 0, 0 :: (dereferenceable invariant load 8, align 4)
-# CHECK: BUFFER_LOAD_DWORDX2_OFFSET %0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8, align 4)
-# CHECK: BUFFER_STORE_DWORDX2_OFFSET_exact killed %{{[0-9]+}}, %0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8, align 4)
-# CHECK: BUFFER_LOAD_DWORDX2_OFFSET %0, 0, 64, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8 from %ir.ptr_addr1 + 64, align 4
-# CHECK: BUFFER_STORE_DWORDX2_OFFSET_exact killed %{{[0-9]+}}, %0, 0, 64, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into %ir.ptr_addr1 + 64, align 4
+# CHECK: BUFFER_LOAD_DWORDX2_OFFSET %0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8, align 4)
+# CHECK: BUFFER_STORE_DWORDX2_OFFSET_exact killed %{{[0-9]+}}, %0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8, align 4)
+# CHECK: BUFFER_LOAD_DWORDX2_OFFSET %0, 0, 64, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8 from %ir.ptr_addr1 + 64, align 4
+# CHECK: BUFFER_STORE_DWORDX2_OFFSET_exact killed %{{[0-9]+}}, %0, 0, 64, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into %ir.ptr_addr1 + 64, align 4
 name: merge_mmos
 tracksRegLiveness: true
 body: |
@@ -183,22 +183,22 @@ body: |
     %0:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
     %1:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0, 0, 0, 0 :: (dereferenceable invariant load 4)
     %2:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0, 1, 0, 0 :: (dereferenceable invariant load 4)
-    %3:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4)
-    %4:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %0, 0, 4, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4)
-    BUFFER_STORE_DWORD_OFFSET_exact %3, %0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4)
-    BUFFER_STORE_DWORD_OFFSET_exact %4, %0, 0, 4, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4)
-    %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %0, 0, 64, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from %ir.ptr_addr1 + 64)
-    %6:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %0, 0, 68, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from %ir.ptr_addr1 + 68)
-    BUFFER_STORE_DWORD_OFFSET_exact %5, %0, 0, 64, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into %ir.ptr_addr1 + 64)
-    BUFFER_STORE_DWORD_OFFSET_exact %6, %0, 0, 68, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into %ir.ptr_addr1 + 68)
+    %3:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4)
+    %4:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %0, 0, 4, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4)
+    BUFFER_STORE_DWORD_OFFSET_exact %3, %0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4)
+    BUFFER_STORE_DWORD_OFFSET_exact %4, %0, 0, 4, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4)
+    %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %0, 0, 64, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from %ir.ptr_addr1 + 64)
+    %6:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %0, 0, 68, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from %ir.ptr_addr1 + 68)
+    BUFFER_STORE_DWORD_OFFSET_exact %5, %0, 0, 64, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into %ir.ptr_addr1 + 64)
+    BUFFER_STORE_DWORD_OFFSET_exact %6, %0, 0, 68, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into %ir.ptr_addr1 + 68)
 
     S_ENDPGM 0
 
 ...
 ---
 # CHECK-LABEL: reorder_offsets
-# CHECK-DAG: BUFFER_STORE_DWORDX2_OFFSET_exact killed %{{[0-9]+}}, %0, 0, 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into %ir.reorder_addr1 + 16, align 4, addrspace 1)
-# CHECK-DAG: BUFFER_STORE_DWORDX4_OFFSET_exact killed %{{[0-9]+}}, %0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into %ir.reorder_addr1, align 4, addrspace 1)
+# CHECK-DAG: BUFFER_STORE_DWORDX2_OFFSET_exact killed %{{[0-9]+}}, %0, 0, 16, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into %ir.reorder_addr1 + 16, align 4, addrspace 1)
+# CHECK-DAG: BUFFER_STORE_DWORDX4_OFFSET_exact killed %{{[0-9]+}}, %0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into %ir.reorder_addr1, align 4, addrspace 1)
 
 name: reorder_offsets
 tracksRegLiveness: true
@@ -208,12 +208,12 @@ body: |
 
     %0:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
     %1:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    BUFFER_STORE_DWORD_OFFSET_exact %1, %0, 0, 4, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into %ir.reorder_addr1 + 4)
-    BUFFER_STORE_DWORD_OFFSET_exact %1, %0, 0, 8, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into %ir.reorder_addr1 + 8)
-    BUFFER_STORE_DWORD_OFFSET_exact %1, %0, 0, 12, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into %ir.reorder_addr1 + 12)
-    BUFFER_STORE_DWORD_OFFSET_exact %1, %0, 0, 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into %ir.reorder_addr1 + 16)
-    BUFFER_STORE_DWORD_OFFSET_exact %1, %0, 0, 20, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into %ir.reorder_addr1 + 20)
-    BUFFER_STORE_DWORD_OFFSET_exact %1, %0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into %ir.reorder_addr1)
+    BUFFER_STORE_DWORD_OFFSET_exact %1, %0, 0, 4, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into %ir.reorder_addr1 + 4)
+    BUFFER_STORE_DWORD_OFFSET_exact %1, %0, 0, 8, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into %ir.reorder_addr1 + 8)
+    BUFFER_STORE_DWORD_OFFSET_exact %1, %0, 0, 12, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into %ir.reorder_addr1 + 12)
+    BUFFER_STORE_DWORD_OFFSET_exact %1, %0, 0, 16, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into %ir.reorder_addr1 + 16)
+    BUFFER_STORE_DWORD_OFFSET_exact %1, %0, 0, 20, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into %ir.reorder_addr1 + 20)
+    BUFFER_STORE_DWORD_OFFSET_exact %1, %0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into %ir.reorder_addr1)
     S_ENDPGM 0
 
 

diff  --git a/llvm/test/CodeGen/AMDGPU/merge-tbuffer.mir b/llvm/test/CodeGen/AMDGPU/merge-tbuffer.mir
index 1f10a047f7d9..11395e99e523 100644
--- a/llvm/test/CodeGen/AMDGPU/merge-tbuffer.mir
+++ b/llvm/test/CodeGen/AMDGPU/merge-tbuffer.mir
@@ -6,7 +6,7 @@
 #
 
 # GFX9-LABEL: name: gfx9_tbuffer_load_x_xyz
-# GFX9: %{{[0-9]+}}:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFSET %4, 0, 4, 126, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16, align 1, addrspace 4)
+# GFX9: %{{[0-9]+}}:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFSET %4, 0, 4, 126, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16, align 1, addrspace 4)
 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %7.sub0
 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %7.sub1_sub2_sub3
 name: gfx9_tbuffer_load_x_xyz
@@ -17,13 +17,13 @@ body:             |
     %2:sgpr_32 = COPY $sgpr2
     %3:sgpr_32 = COPY $sgpr3
     %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
-    %8:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET %5:sgpr_128, 0, 8, 125, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 1, addrspace 4)
+    %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+    %8:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET %5:sgpr_128, 0, 8, 125, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 1, addrspace 4)
 ...
 ---
 
 # GFX9-LABEL: name: gfx9_tbuffer_load_xyz_x
-# GFX9: %{{[0-9]+}}:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFSET %4, 0, 4, 126, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16, align 1, addrspace 4)
+# GFX9: %{{[0-9]+}}:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFSET %4, 0, 4, 126, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16, align 1, addrspace 4)
 # GFX9: %{{[0-9]+}}:vreg_96 = COPY %7.sub0_sub1_sub2
 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY killed %7.sub3
 name: gfx9_tbuffer_load_xyz_x
@@ -34,13 +34,13 @@ body:             |
     %2:sgpr_32 = COPY $sgpr2
     %3:sgpr_32 = COPY $sgpr3
     %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    %7:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET %5:sgpr_128, 0, 4, 125, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 1, addrspace 4)
-    %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 16, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+    %7:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET %5:sgpr_128, 0, 4, 125, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 1, addrspace 4)
+    %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 16, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
 ...
 ---
 
 # GFX9-LABEL: name: gfx9_tbuffer_load_xy_xy
-# GFX9: %{{[0-9]+}}:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFSET %4, 0, 4, 126, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16, align 1, addrspace 4)
+# GFX9: %{{[0-9]+}}:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFSET %4, 0, 4, 126, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16, align 1, addrspace 4)
 # GFX9: %{{[0-9]+}}:vreg_64 = COPY %7.sub0_sub1
 # GFX9: %{{[0-9]+}}:vreg_64 = COPY killed %7.sub2_sub3
 name: gfx9_tbuffer_load_xy_xy
@@ -51,13 +51,13 @@ body:             |
     %2:sgpr_32 = COPY $sgpr2
     %3:sgpr_32 = COPY $sgpr3
     %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    %7:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET %5:sgpr_128, 0, 4, 123, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8, align 1, addrspace 4)
-    %8:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET %5:sgpr_128, 0, 12, 123, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8, align 1, addrspace 4)
+    %7:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET %5:sgpr_128, 0, 4, 123, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8, align 1, addrspace 4)
+    %8:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET %5:sgpr_128, 0, 12, 123, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8, align 1, addrspace 4)
 ...
 ---
 
 # GFX9-LABEL: name: gfx9_tbuffer_load_x_xy
-# GFX9: %{{[0-9]+}}:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET %4, 0, 4, 125, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 1, addrspace 4)
+# GFX9: %{{[0-9]+}}:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET %4, 0, 4, 125, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 1, addrspace 4)
 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %7.sub0
 # GFX9: %{{[0-9]+}}:vreg_64 = COPY killed %7.sub1_sub2
 name: gfx9_tbuffer_load_x_xy
@@ -68,13 +68,13 @@ body:             |
     %2:sgpr_32 = COPY $sgpr2
     %3:sgpr_32 = COPY $sgpr3
     %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
-    %8:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET %5:sgpr_128, 0, 8, 123, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8, align 1, addrspace 4)
+    %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+    %8:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET %5:sgpr_128, 0, 8, 123, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8, align 1, addrspace 4)
 ...
 ---
 
 # GFX9-LABEL: name: gfx9_tbuffer_load_xy_x
-# GFX9: %{{[0-9]+}}:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET %4, 0, 4, 125, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 1, addrspace 4)
+# GFX9: %{{[0-9]+}}:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET %4, 0, 4, 125, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 1, addrspace 4)
 # GFX9: %{{[0-9]+}}:vreg_64 = COPY %7.sub0_sub1
 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY killed %7.sub2
 name: gfx9_tbuffer_load_xy_x
@@ -85,14 +85,14 @@ body:             |
     %2:sgpr_32 = COPY $sgpr2
     %3:sgpr_32 = COPY $sgpr3
     %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    %7:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET %5:sgpr_128, 0, 4, 123, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8, align 1, addrspace 4)
-    %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 12, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+    %7:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET %5:sgpr_128, 0, 4, 123, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8, align 1, addrspace 4)
+    %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 12, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
 ...
 ---
 
 
 # GFX9-LABEL: name: gfx9_tbuffer_load_x_x
-# GFX9: %{{[0-9]+}}:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET %4, 0, 4, 123, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8, align 1, addrspace 4)
+# GFX9: %{{[0-9]+}}:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET %4, 0, 4, 123, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8, align 1, addrspace 4)
 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %7.sub0
 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY killed %7.sub1
 
@@ -104,13 +104,13 @@ body:             |
     %2:sgpr_32 = COPY $sgpr2
     %3:sgpr_32 = COPY $sgpr3
     %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
-    %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 8, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+    %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+    %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 8, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
 ...
 ---
 
 # GFX9-LABEL: name: gfx9_tbuffer_load_x_x_format_32_32_32_32
-# GFX9: %{{[0-9]+}}:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET %4, 0, 4, 123, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8, align 1, addrspace 4)
+# GFX9: %{{[0-9]+}}:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET %4, 0, 4, 123, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8, align 1, addrspace 4)
 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %7.sub0
 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY killed %7.sub1
 
@@ -122,24 +122,24 @@ body:             |
     %2:sgpr_32 = COPY $sgpr2
     %3:sgpr_32 = COPY $sgpr3
     %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 126, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
-    %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 8, 126, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+    %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 126, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+    %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 8, 126, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
 ...
 ---
 
 
 # GFX9-LABEL: name: gfx9_tbuffer_load_float_32
-# GFX9: %{{[0-9]+}}:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET %4, 0, 4, 123, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8, align 1, addrspace 4)
+# GFX9: %{{[0-9]+}}:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET %4, 0, 4, 123, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8, align 1, addrspace 4)
 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %14.sub0
 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY killed %14.sub1
-# GFX9: %{{[0-9]+}}:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFSET %4, 0, 16, 126, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16, align 1, addrspace 4)
+# GFX9: %{{[0-9]+}}:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFSET %4, 0, 16, 126, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16, align 1, addrspace 4)
 # GFX9: %{{[0-9]+}}:vreg_96 = COPY %17.sub0_sub1_sub2
 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY killed %17.sub3
 # GFX9: %{{[0-9]+}}:vreg_64 = COPY %16.sub0_sub1
 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY killed %16.sub2
 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %15.sub0
 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY killed %15.sub1
-# GFX9: %{{[0-9]+}}:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET %4, 0, 36, 125, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 1, addrspace 4)
+# GFX9: %{{[0-9]+}}:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET %4, 0, 36, 125, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 1, addrspace 4)
 # GFX9: %{{[0-9]+}}:vreg_64 = COPY %19.sub0_sub1
 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY killed %19.sub2
 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %18.sub0
@@ -153,30 +153,30 @@ body:             |
     %2:sgpr_32 = COPY $sgpr2
     %3:sgpr_32 = COPY $sgpr3
     %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
-    %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 8, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
-    %9:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 16, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
-    %10:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 20, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
-    %11:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 24, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
-    %12:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 28, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
-    %13:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 36, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
-    %14:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 40, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
-    %15:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 44, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+    %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+    %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 8, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+    %9:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 16, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+    %10:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 20, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+    %11:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 24, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+    %12:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 28, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+    %13:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 36, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+    %14:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 40, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+    %15:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 44, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
 ...
 ---
 
 # GFX9-LABEL: name: gfx9_tbuffer_load_sint_32
-# GFX9: %{{[0-9]+}}:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET %4, 0, 4, 91, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8, align 1, addrspace 4)
+# GFX9: %{{[0-9]+}}:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET %4, 0, 4, 91, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8, align 1, addrspace 4)
 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %14.sub0
 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY killed %14.sub1
-# GFX9: %{{[0-9]+}}:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFSET %4, 0, 16, 94, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16, align 1, addrspace 4)
+# GFX9: %{{[0-9]+}}:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFSET %4, 0, 16, 94, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16, align 1, addrspace 4)
 # GFX9: %{{[0-9]+}}:vreg_96 = COPY %17.sub0_sub1_sub2
 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY killed %17.sub3
 # GFX9: %{{[0-9]+}}:vreg_64 = COPY %16.sub0_sub1
 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY killed %16.sub2
 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %15.sub0
 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY killed %15.sub1
-# GFX9: %{{[0-9]+}}:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET %4, 0, 36, 93, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 1, addrspace 4)
+# GFX9: %{{[0-9]+}}:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET %4, 0, 36, 93, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 1, addrspace 4)
 # GFX9: %{{[0-9]+}}:vreg_64 = COPY %19.sub0_sub1
 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY killed %19.sub2
 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %18.sub0
@@ -190,30 +190,30 @@ body:             |
     %2:sgpr_32 = COPY $sgpr2
     %3:sgpr_32 = COPY $sgpr3
     %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 84, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
-    %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 8, 84, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
-    %9:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 16, 84, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
-    %10:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 20, 84, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
-    %11:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 24, 84, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
-    %12:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 28, 84, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
-    %13:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 36, 84, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
-    %14:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 40, 84, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
-    %15:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 44, 84, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+    %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 84, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+    %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 8, 84, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+    %9:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 16, 84, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+    %10:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 20, 84, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+    %11:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 24, 84, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+    %12:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 28, 84, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+    %13:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 36, 84, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+    %14:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 40, 84, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+    %15:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 44, 84, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
 ...
 ---
 
 # GFX9-LABEL: name: gfx9_tbuffer_load_uint_32
-# GFX9: %{{[0-9]+}}:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET %4, 0, 4, 75, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8, align 1, addrspace 4)
+# GFX9: %{{[0-9]+}}:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET %4, 0, 4, 75, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8, align 1, addrspace 4)
 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %14.sub0
 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY killed %14.sub1
-# GFX9: %{{[0-9]+}}:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFSET %4, 0, 16, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16, align 1, addrspace 4)
+# GFX9: %{{[0-9]+}}:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFSET %4, 0, 16, 78, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16, align 1, addrspace 4)
 # GFX9: %{{[0-9]+}}:vreg_96 = COPY %17.sub0_sub1_sub2
 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY killed %17.sub3
 # GFX9: %{{[0-9]+}}:vreg_64 = COPY %16.sub0_sub1
 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY killed %16.sub2
 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %15.sub0
 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY killed %15.sub1
-# GFX9: %{{[0-9]+}}:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET %4, 0, 36, 77, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 1, addrspace 4)
+# GFX9: %{{[0-9]+}}:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET %4, 0, 36, 77, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 1, addrspace 4)
 # GFX9: %{{[0-9]+}}:vreg_64 = COPY %19.sub0_sub1
 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY killed %19.sub2
 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %18.sub0
@@ -227,15 +227,15 @@ body:             |
     %2:sgpr_32 = COPY $sgpr2
     %3:sgpr_32 = COPY $sgpr3
     %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 68, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
-    %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 8, 68, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
-    %9:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 16, 68, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
-    %10:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 20, 68, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
-    %11:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 24, 68, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
-    %12:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 28, 68, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
-    %13:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 36, 68, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
-    %14:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 40, 68, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
-    %15:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 44, 68, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+    %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 68, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+    %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 8, 68, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+    %9:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 16, 68, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+    %10:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 20, 68, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+    %11:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 24, 68, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+    %12:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 28, 68, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+    %13:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 36, 68, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+    %14:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 40, 68, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+    %15:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 44, 68, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
 ...
 ---
 
@@ -245,15 +245,15 @@ body:             |
 # GFX9: %{{[0-9]+}}:sgpr_32 = COPY $sgpr2
 # GFX9: %{{[0-9]+}}:sgpr_32 = COPY $sgpr3
 # GFX9: %{{[0-9]+}}:sgpr_128 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1, %2, %subreg.sub2, %3, %subreg.sub3
-# GFX9: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 4, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
-# GFX9: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 8, 114, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
-# GFX9: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 16, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
-# GFX9: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 20, 114, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
-# GFX9: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 24, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
-# GFX9: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 28, 114, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
-# GFX9: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 36, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
-# GFX9: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 40, 114, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
-# GFX9: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 44, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+# GFX9: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 4, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+# GFX9: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 8, 114, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+# GFX9: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 16, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+# GFX9: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 20, 114, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+# GFX9: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 24, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+# GFX9: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 28, 114, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+# GFX9: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 36, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+# GFX9: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 40, 114, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+# GFX9: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 44, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
 
 
 name: gfx9_tbuffer_load_not_merged_data_format_mismatch
@@ -264,15 +264,15 @@ body:             |
     %2:sgpr_32 = COPY $sgpr2
     %3:sgpr_32 = COPY $sgpr3
     %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
-    %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 8, 114, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
-    %9:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 16, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
-    %10:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 20, 114, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
-    %11:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 24, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
-    %12:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 28, 114, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
-    %13:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 36, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
-    %14:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 40, 114, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
-    %15:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 44, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+    %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+    %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 8, 114, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+    %9:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 16, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+    %10:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 20, 114, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+    %11:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 24, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+    %12:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 28, 114, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+    %13:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 36, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+    %14:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 40, 114, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+    %15:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 44, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
 ...
 ---
 
@@ -282,15 +282,15 @@ body:             |
 # GFX9: %{{[0-9]+}}:sgpr_32 = COPY $sgpr2
 # GFX9: %{{[0-9]+}}:sgpr_32 = COPY $sgpr3
 # GFX9: %{{[0-9]+}}:sgpr_128 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1, %2, %subreg.sub2, %3, %subreg.sub3
-# GFX9: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 4, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
-# GFX9: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 8, 84, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
-# GFX9: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 16, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
-# GFX9: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 20, 84, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
-# GFX9: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 24, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
-# GFX9: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 28, 84, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
-# GFX9: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 36, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
-# GFX9: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 40, 84, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
-# GFX9: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 44, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+# GFX9: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 4, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+# GFX9: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 8, 84, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+# GFX9: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 16, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+# GFX9: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 20, 84, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+# GFX9: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 24, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+# GFX9: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 28, 84, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+# GFX9: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 36, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+# GFX9: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 40, 84, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+# GFX9: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 44, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
 name: gfx9_tbuffer_load_not_merged_num_format_mismatch
 body:             |
   bb.0.entry:
@@ -299,22 +299,22 @@ body:             |
     %2:sgpr_32 = COPY $sgpr2
     %3:sgpr_32 = COPY $sgpr3
     %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
-    %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 8, 84, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
-    %9:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 16, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
-    %10:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 20, 84, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
-    %11:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 24, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
-    %12:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 28, 84, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
-    %13:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 36, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
-    %14:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 40, 84, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
-    %15:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 44, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+    %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+    %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 8, 84, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+    %9:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 16, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+    %10:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 20, 84, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+    %11:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 24, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+    %12:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 28, 84, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+    %13:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 36, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+    %14:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 40, 84, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+    %15:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 44, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
 ...
 ---
 
 # GFX9-LABEL: name: gfx9_tbuffer_store_x_xyz
 # GFX9: %{{[0-9]+}}:vreg_96 = REG_SEQUENCE %3, %subreg.sub0, %2, %subreg.sub1, %1, %subreg.sub2
 # GFX9: %{{[0-9]+}}:vreg_128 = REG_SEQUENCE %0, %subreg.sub0, %9, %subreg.sub1_sub2_sub3
-# GFX9: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed %10, %8, 0, 4, 126, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16, align 1, addrspace 4)
+# GFX9: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed %10, %8, 0, 4, 126, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16, align 1, addrspace 4)
 name: gfx9_tbuffer_store_x_xyz
 body:             |
   bb.0.entry:
@@ -329,8 +329,8 @@ body:             |
     %0:sgpr_32 = COPY $sgpr0
     %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
     %14:vreg_96 = REG_SEQUENCE %4:vgpr_32, %subreg.sub0, %5:vgpr_32, %subreg.sub1, %6:vgpr_32, %subreg.sub2
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 4, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact %14:vreg_96, %13:sgpr_128, 0, 8, 125, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12, align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 4, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact %14:vreg_96, %13:sgpr_128, 0, 8, 125, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12, align 1, addrspace 4)
 ...
 ---
 
@@ -339,7 +339,7 @@ body:             |
 # GFX9-LABEL: name: gfx9_tbuffer_store_xyz_x
 # GFX9: %{{[0-9]+}}:vreg_96 = REG_SEQUENCE %3, %subreg.sub0, %2, %subreg.sub1, %1, %subreg.sub2
 # GFX9: %{{[0-9]+}}:vreg_128 = REG_SEQUENCE %9, %subreg.sub0_sub1_sub2, %0, %subreg.sub3
-# GFX9: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed %10, %8, 0, 4, 126, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16, align 1, addrspace 4)
+# GFX9: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed %10, %8, 0, 4, 126, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16, align 1, addrspace 4)
 name: gfx9_tbuffer_store_xyz_x
 body:             |
   bb.0.entry:
@@ -354,8 +354,8 @@ body:             |
     %0:sgpr_32 = COPY $sgpr0
     %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
     %14:vreg_96 = REG_SEQUENCE %4:vgpr_32, %subreg.sub0, %5:vgpr_32, %subreg.sub1, %6:vgpr_32, %subreg.sub2
-    TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact %14:vreg_96, %13:sgpr_128, 0, 4, 125, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12, align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 16, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact %14:vreg_96, %13:sgpr_128, 0, 4, 125, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12, align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 16, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
 ...
 ---
 
@@ -363,7 +363,7 @@ body:             |
 # GFX9: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %3, %subreg.sub0, %2, %subreg.sub1
 # GFX9: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %1, %subreg.sub0, %0, %subreg.sub1
 # GFX9: %{{[0-9]+}}:vreg_128 = REG_SEQUENCE %9, %subreg.sub0_sub1, %10, %subreg.sub2_sub3
-# GFX9: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed %11, %8, 0, 4, 126, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16, align 1, addrspace 4)
+# GFX9: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed %11, %8, 0, 4, 126, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16, align 1, addrspace 4)
 name: gfx9_tbuffer_store_xy_xy
 body:             |
   bb.0.entry:
@@ -379,15 +379,15 @@ body:             |
     %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
     %14:vreg_64 = REG_SEQUENCE %4:vgpr_32, %subreg.sub0, %5:vgpr_32, %subreg.sub1
     %15:vreg_64 = REG_SEQUENCE %6:vgpr_32, %subreg.sub0, %7:vgpr_32, %subreg.sub1
-    TBUFFER_STORE_FORMAT_XY_OFFSET_exact %14:vreg_64, %13:sgpr_128, 0, 4, 123, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8, align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_XY_OFFSET_exact %15:vreg_64, %13:sgpr_128, 0, 12, 123, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8, align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_XY_OFFSET_exact %14:vreg_64, %13:sgpr_128, 0, 4, 123, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8, align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_XY_OFFSET_exact %15:vreg_64, %13:sgpr_128, 0, 12, 123, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8, align 1, addrspace 4)
 ...
 ---
 
 # GFX9-LABEL: name: gfx9_tbuffer_store_x_xy
 # GFX9: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %3, %subreg.sub0, %2, %subreg.sub1
 # GFX9: %{{[0-9]+}}:vreg_64, %subreg.sub1_sub2
-# GFX9: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact killed %11, %8, 0, 4, 125, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12, align 1, addrspace 4)
+# GFX9: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact killed %11, %8, 0, 4, 125, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12, align 1, addrspace 4)
 name: gfx9_tbuffer_store_x_xy
 body:             |
   bb.0.entry:
@@ -402,15 +402,15 @@ body:             |
     %0:sgpr_32 = COPY $sgpr0
     %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
     %14:vreg_64 = REG_SEQUENCE %4:vgpr_32, %subreg.sub0, %5:vgpr_32, %subreg.sub1
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 4, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_XY_OFFSET_exact %15:vreg_64, %13:sgpr_128, 0, 8, 123, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8, align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 4, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_XY_OFFSET_exact %15:vreg_64, %13:sgpr_128, 0, 8, 123, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8, align 1, addrspace 4)
 ...
 ---
 
 # GFX9-LABEL: name: gfx9_tbuffer_store_xy_x
 # GFX9: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %3, %subreg.sub0, %2, %subreg.sub1
 # GFX9: %{{[0-9]+}}:vreg_96 = REG_SEQUENCE %9, %subreg.sub0_sub1, %0, %subreg.sub2
-# GFX9: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact killed %10, %8, 0, 4, 125, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12, align 1, addrspace 4)
+# GFX9: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact killed %10, %8, 0, 4, 125, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12, align 1, addrspace 4)
 name: gfx9_tbuffer_store_xy_x
 body:             |
   bb.0.entry:
@@ -426,15 +426,15 @@ body:             |
     %0:sgpr_32 = COPY $sgpr0
     %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
     %14:vreg_64 = REG_SEQUENCE %4:vgpr_32, %subreg.sub0, %5:vgpr_32, %subreg.sub1
-    TBUFFER_STORE_FORMAT_XY_OFFSET_exact %14:vreg_64, %13:sgpr_128, 0, 4, 123, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8, align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 12, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_XY_OFFSET_exact %14:vreg_64, %13:sgpr_128, 0, 4, 123, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8, align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 12, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
 ...
 ---
 
 
 # GFX9-LABEL: name: gfx9_tbuffer_store_x_x
 # GFX9: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %1, %subreg.sub0, %0, %subreg.sub1
-# GFX9: TBUFFER_STORE_FORMAT_XY_OFFSET_exact killed %9, %8, 0, 4, 123, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8, align 1, addrspace 4)
+# GFX9: TBUFFER_STORE_FORMAT_XY_OFFSET_exact killed %9, %8, 0, 4, 123, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8, align 1, addrspace 4)
 name: gfx9_tbuffer_store_x_x
 body:             |
   bb.0.entry:
@@ -448,14 +448,14 @@ body:             |
     %1:sgpr_32 = COPY $sgpr1
     %0:sgpr_32 = COPY $sgpr0
     %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %6:vgpr_32, %13:sgpr_128, 0, 4, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 8, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %6:vgpr_32, %13:sgpr_128, 0, 4, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 8, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
 ...
 ---
 
 # GFX9-LABEL: name: gfx9_tbuffer_store_x_x_format_32_32_32_32
 # GFX9: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %1, %subreg.sub0, %0, %subreg.sub1
-# GFX9: TBUFFER_STORE_FORMAT_XY_OFFSET_exact killed %9, %8, 0, 4, 123, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8, align 1, addrspace 4)
+# GFX9: TBUFFER_STORE_FORMAT_XY_OFFSET_exact killed %9, %8, 0, 4, 123, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8, align 1, addrspace 4)
 name: gfx9_tbuffer_store_x_x_format_32_32_32_32
 body:             |
   bb.0.entry:
@@ -469,8 +469,8 @@ body:             |
     %1:sgpr_32 = COPY $sgpr1
     %0:sgpr_32 = COPY $sgpr0
     %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %6:vgpr_32, %13:sgpr_128, 0, 4, 126, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 8, 126, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %6:vgpr_32, %13:sgpr_128, 0, 4, 126, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 8, 126, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
 ...
 ---
 
@@ -490,14 +490,14 @@ body:             |
 # GFX9: %{{[0-9]+}}:sgpr_32 = COPY $sgpr0
 # GFX9: %{{[0-9]+}}:sgpr_128 = REG_SEQUENCE %12, %subreg.sub0, %11, %subreg.sub1, %10, %subreg.sub2, %9, %subreg.sub3
 # GFX9: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %8, %subreg.sub0, %7, %subreg.sub1
-# GFX9: TBUFFER_STORE_FORMAT_XY_OFFSET_exact killed %14, %13, 0, 4, 123, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8, align 1, addrspace 4)
+# GFX9: TBUFFER_STORE_FORMAT_XY_OFFSET_exact killed %14, %13, 0, 4, 123, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8, align 1, addrspace 4)
 # GFX9: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %6, %subreg.sub0, %5, %subreg.sub1
 # GFX9: %{{[0-9]+}}:vreg_96 = REG_SEQUENCE killed %15, %subreg.sub0_sub1, %4, %subreg.sub2
 # GFX9: %{{[0-9]+}}:vreg_128 = REG_SEQUENCE killed %16, %subreg.sub0_sub1_sub2, %3, %subreg.sub3
-# GFX9: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed %17, %13, 0, 16, 126, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16, align 1, addrspace 4)
+# GFX9: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed %17, %13, 0, 16, 126, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16, align 1, addrspace 4)
 # GFX9: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %2, %subreg.sub0, %1, %subreg.sub1
 # GFX9: %{{[0-9]+}}:vreg_96 = REG_SEQUENCE killed %18, %subreg.sub0_sub1, %0, %subreg.sub2
-# GFX9: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact killed %19, %13, 0, 36, 125, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12, align 1, addrspace 4)
+# GFX9: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact killed %19, %13, 0, 36, 125, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12, align 1, addrspace 4)
 name: gfx9_tbuffer_store_float32
 body:             |
   bb.0.entry:
@@ -516,15 +516,15 @@ body:             |
     %1:sgpr_32 = COPY $sgpr1
     %0:sgpr_32 = COPY $sgpr0
     %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %4:vgpr_32, %13:sgpr_128, 0, 4, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %5:vgpr_32, %13:sgpr_128, 0, 8, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %6:vgpr_32, %13:sgpr_128, 0, 16, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 20, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %8:vgpr_32, %13:sgpr_128, 0, 24, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %9:vgpr_32, %13:sgpr_128, 0, 28, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %10:vgpr_32, %13:sgpr_128, 0, 36, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %11:vgpr_32, %13:sgpr_128, 0, 40, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %12:vgpr_32, %13:sgpr_128, 0, 44, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %4:vgpr_32, %13:sgpr_128, 0, 4, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %5:vgpr_32, %13:sgpr_128, 0, 8, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %6:vgpr_32, %13:sgpr_128, 0, 16, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 20, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %8:vgpr_32, %13:sgpr_128, 0, 24, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %9:vgpr_32, %13:sgpr_128, 0, 28, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %10:vgpr_32, %13:sgpr_128, 0, 36, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %11:vgpr_32, %13:sgpr_128, 0, 40, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %12:vgpr_32, %13:sgpr_128, 0, 44, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
 ...
 ---
 
@@ -544,14 +544,14 @@ body:             |
 # GFX9: %{{[0-9]+}}:sgpr_32 = COPY $sgpr0
 # GFX9: %{{[0-9]+}}:sgpr_128 = REG_SEQUENCE %12, %subreg.sub0, %11, %subreg.sub1, %10, %subreg.sub2, %9, %subreg.sub3
 # GFX9: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %8, %subreg.sub0, %7, %subreg.sub1
-# GFX9: TBUFFER_STORE_FORMAT_XY_OFFSET_exact killed %14, %13, 0, 4, 91, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8, align 1, addrspace 4)
+# GFX9: TBUFFER_STORE_FORMAT_XY_OFFSET_exact killed %14, %13, 0, 4, 91, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8, align 1, addrspace 4)
 # GFX9: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %6, %subreg.sub0, %5, %subreg.sub1
 # GFX9: %{{[0-9]+}}:vreg_96 = REG_SEQUENCE killed %15, %subreg.sub0_sub1, %4, %subreg.sub2
 # GFX9: %{{[0-9]+}}:vreg_128 = REG_SEQUENCE killed %16, %subreg.sub0_sub1_sub2, %3, %subreg.sub3
-# GFX9: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed %17, %13, 0, 16, 94, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16, align 1, addrspace 4)
+# GFX9: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed %17, %13, 0, 16, 94, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16, align 1, addrspace 4)
 # GFX9: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %2, %subreg.sub0, %1, %subreg.sub1
 # GFX9: %{{[0-9]+}}:vreg_96 = REG_SEQUENCE killed %18, %subreg.sub0_sub1, %0, %subreg.sub2
-# GFX9: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact killed %19, %13, 0, 36, 93, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12, align 1, addrspace 4)
+# GFX9: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact killed %19, %13, 0, 36, 93, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12, align 1, addrspace 4)
 name: gfx9_tbuffer_store_sint32
 body:             |
   bb.0.entry:
@@ -570,15 +570,15 @@ body:             |
     %1:sgpr_32 = COPY $sgpr1
     %0:sgpr_32 = COPY $sgpr0
     %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %4:vgpr_32, %13:sgpr_128, 0, 4, 84, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %5:vgpr_32, %13:sgpr_128, 0, 8, 84, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %6:vgpr_32, %13:sgpr_128, 0, 16, 84, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 20, 84, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %8:vgpr_32, %13:sgpr_128, 0, 24, 84, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %9:vgpr_32, %13:sgpr_128, 0, 28, 84, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %10:vgpr_32, %13:sgpr_128, 0, 36, 84, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %11:vgpr_32, %13:sgpr_128, 0, 40, 84, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %12:vgpr_32, %13:sgpr_128, 0, 44, 84, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %4:vgpr_32, %13:sgpr_128, 0, 4, 84, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %5:vgpr_32, %13:sgpr_128, 0, 8, 84, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %6:vgpr_32, %13:sgpr_128, 0, 16, 84, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 20, 84, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %8:vgpr_32, %13:sgpr_128, 0, 24, 84, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %9:vgpr_32, %13:sgpr_128, 0, 28, 84, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %10:vgpr_32, %13:sgpr_128, 0, 36, 84, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %11:vgpr_32, %13:sgpr_128, 0, 40, 84, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %12:vgpr_32, %13:sgpr_128, 0, 44, 84, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
 ...
 ---
 
@@ -598,14 +598,14 @@ body:             |
 # GFX9: %{{[0-9]+}}:sgpr_32 = COPY $sgpr0
 # GFX9: %{{[0-9]+}}:sgpr_128 = REG_SEQUENCE %12, %subreg.sub0, %11, %subreg.sub1, %10, %subreg.sub2, %9, %subreg.sub3
 # GFX9: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %8, %subreg.sub0, %7, %subreg.sub1
-# GFX9: TBUFFER_STORE_FORMAT_XY_OFFSET_exact killed %14, %13, 0, 4, 75, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8, align 1, addrspace 4)
+# GFX9: TBUFFER_STORE_FORMAT_XY_OFFSET_exact killed %14, %13, 0, 4, 75, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8, align 1, addrspace 4)
 # GFX9: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %6, %subreg.sub0, %5, %subreg.sub1
 # GFX9: %{{[0-9]+}}:vreg_96 = REG_SEQUENCE killed %15, %subreg.sub0_sub1, %4, %subreg.sub2
 # GFX9: %{{[0-9]+}}:vreg_128 = REG_SEQUENCE killed %16, %subreg.sub0_sub1_sub2, %3, %subreg.sub3
-# GFX9: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed %17, %13, 0, 16, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16, align 1, addrspace 4)
+# GFX9: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed %17, %13, 0, 16, 78, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16, align 1, addrspace 4)
 # GFX9: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %2, %subreg.sub0, %1, %subreg.sub1
 # GFX9: %{{[0-9]+}}:vreg_96 = REG_SEQUENCE killed %18, %subreg.sub0_sub1, %0, %subreg.sub2
-# GFX9: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact killed %19, %13, 0, 36, 77, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12, align 1, addrspace 4)
+# GFX9: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact killed %19, %13, 0, 36, 77, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12, align 1, addrspace 4)
 name: gfx9_tbuffer_store_uint32
 body:             |
   bb.0.entry:
@@ -624,15 +624,15 @@ body:             |
     %1:sgpr_32 = COPY $sgpr1
     %0:sgpr_32 = COPY $sgpr0
     %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %4:vgpr_32, %13:sgpr_128, 0, 4, 68, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %5:vgpr_32, %13:sgpr_128, 0, 8, 68, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %6:vgpr_32, %13:sgpr_128, 0, 16, 68, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 20, 68, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %8:vgpr_32, %13:sgpr_128, 0, 24, 68, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %9:vgpr_32, %13:sgpr_128, 0, 28, 68, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %10:vgpr_32, %13:sgpr_128, 0, 36, 68, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %11:vgpr_32, %13:sgpr_128, 0, 40, 68, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %12:vgpr_32, %13:sgpr_128, 0, 44, 68, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %4:vgpr_32, %13:sgpr_128, 0, 4, 68, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %5:vgpr_32, %13:sgpr_128, 0, 8, 68, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %6:vgpr_32, %13:sgpr_128, 0, 16, 68, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 20, 68, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %8:vgpr_32, %13:sgpr_128, 0, 24, 68, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %9:vgpr_32, %13:sgpr_128, 0, 28, 68, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %10:vgpr_32, %13:sgpr_128, 0, 36, 68, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %11:vgpr_32, %13:sgpr_128, 0, 40, 68, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %12:vgpr_32, %13:sgpr_128, 0, 44, 68, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
 ...
 ---
 
@@ -651,15 +651,15 @@ body:             |
 # GFX9: %{{[0-9]+}}:sgpr_32 = COPY $sgpr1
 # GFX9: %{{[0-9]+}}:sgpr_32 = COPY $sgpr0
 # GFX9: %{{[0-9]+}}:sgpr_128 = REG_SEQUENCE %12, %subreg.sub0, %11, %subreg.sub1, %10, %subreg.sub2, %9, %subreg.sub3
-# GFX9: TBUFFER_STORE_FORMAT_X_OFFSET_exact %8, %13, 0, 4, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-# GFX9: TBUFFER_STORE_FORMAT_X_OFFSET_exact %7, %13, 0, 8, 84, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-# GFX9: TBUFFER_STORE_FORMAT_X_OFFSET_exact %6, %13, 0, 16, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-# GFX9: TBUFFER_STORE_FORMAT_X_OFFSET_exact %5, %13, 0, 20, 84, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-# GFX9: TBUFFER_STORE_FORMAT_X_OFFSET_exact %4, %13, 0, 24, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-# GFX9: TBUFFER_STORE_FORMAT_X_OFFSET_exact %3, %13, 0, 28, 84, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-# GFX9: TBUFFER_STORE_FORMAT_X_OFFSET_exact %2, %13, 0, 36, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-# GFX9: TBUFFER_STORE_FORMAT_X_OFFSET_exact %1, %13, 0, 40, 84, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-# GFX9: TBUFFER_STORE_FORMAT_X_OFFSET_exact %0, %13, 0, 44, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
+# GFX9: TBUFFER_STORE_FORMAT_X_OFFSET_exact %8, %13, 0, 4, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
+# GFX9: TBUFFER_STORE_FORMAT_X_OFFSET_exact %7, %13, 0, 8, 84, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
+# GFX9: TBUFFER_STORE_FORMAT_X_OFFSET_exact %6, %13, 0, 16, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
+# GFX9: TBUFFER_STORE_FORMAT_X_OFFSET_exact %5, %13, 0, 20, 84, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
+# GFX9: TBUFFER_STORE_FORMAT_X_OFFSET_exact %4, %13, 0, 24, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
+# GFX9: TBUFFER_STORE_FORMAT_X_OFFSET_exact %3, %13, 0, 28, 84, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
+# GFX9: TBUFFER_STORE_FORMAT_X_OFFSET_exact %2, %13, 0, 36, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
+# GFX9: TBUFFER_STORE_FORMAT_X_OFFSET_exact %1, %13, 0, 40, 84, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
+# GFX9: TBUFFER_STORE_FORMAT_X_OFFSET_exact %0, %13, 0, 44, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
 name: gfx9_tbuffer_store_not_merged_data_format_mismatch
 body:             |
   bb.0.entry:
@@ -678,15 +678,15 @@ body:             |
     %1:sgpr_32 = COPY $sgpr1
     %0:sgpr_32 = COPY $sgpr0
     %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %4:vgpr_32, %13:sgpr_128, 0, 4, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %5:vgpr_32, %13:sgpr_128, 0, 8, 84, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %6:vgpr_32, %13:sgpr_128, 0, 16, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 20, 84, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %8:vgpr_32, %13:sgpr_128, 0, 24, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %9:vgpr_32, %13:sgpr_128, 0, 28, 84, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %10:vgpr_32, %13:sgpr_128, 0, 36, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %11:vgpr_32, %13:sgpr_128, 0, 40, 84, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %12:vgpr_32, %13:sgpr_128, 0, 44, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %4:vgpr_32, %13:sgpr_128, 0, 4, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %5:vgpr_32, %13:sgpr_128, 0, 8, 84, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %6:vgpr_32, %13:sgpr_128, 0, 16, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 20, 84, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %8:vgpr_32, %13:sgpr_128, 0, 24, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %9:vgpr_32, %13:sgpr_128, 0, 28, 84, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %10:vgpr_32, %13:sgpr_128, 0, 36, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %11:vgpr_32, %13:sgpr_128, 0, 40, 84, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %12:vgpr_32, %13:sgpr_128, 0, 44, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
 ...
 ---
 
@@ -705,15 +705,15 @@ body:             |
 # GFX9: %{{[0-9]+}}:sgpr_32 = COPY $sgpr1
 # GFX9: %{{[0-9]+}}:sgpr_32 = COPY $sgpr0
 # GFX9: %{{[0-9]+}}:sgpr_128 = REG_SEQUENCE %12, %subreg.sub0, %11, %subreg.sub1, %10, %subreg.sub2, %9, %subreg.sub3
-# GFX9: TBUFFER_STORE_FORMAT_X_OFFSET_exact %8, %13, 0, 4, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-# GFX9: TBUFFER_STORE_FORMAT_X_OFFSET_exact %7, %13, 0, 8, 114, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-# GFX9: TBUFFER_STORE_FORMAT_X_OFFSET_exact %6, %13, 0, 16, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-# GFX9: TBUFFER_STORE_FORMAT_X_OFFSET_exact %5, %13, 0, 20, 114, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-# GFX9: TBUFFER_STORE_FORMAT_X_OFFSET_exact %4, %13, 0, 24, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-# GFX9: TBUFFER_STORE_FORMAT_X_OFFSET_exact %3, %13, 0, 28, 114, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-# GFX9: TBUFFER_STORE_FORMAT_X_OFFSET_exact %2, %13, 0, 36, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-# GFX9: TBUFFER_STORE_FORMAT_X_OFFSET_exact %1, %13, 0, 40, 114, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-# GFX9: TBUFFER_STORE_FORMAT_X_OFFSET_exact %0, %13, 0, 44, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
+# GFX9: TBUFFER_STORE_FORMAT_X_OFFSET_exact %8, %13, 0, 4, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
+# GFX9: TBUFFER_STORE_FORMAT_X_OFFSET_exact %7, %13, 0, 8, 114, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
+# GFX9: TBUFFER_STORE_FORMAT_X_OFFSET_exact %6, %13, 0, 16, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
+# GFX9: TBUFFER_STORE_FORMAT_X_OFFSET_exact %5, %13, 0, 20, 114, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
+# GFX9: TBUFFER_STORE_FORMAT_X_OFFSET_exact %4, %13, 0, 24, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
+# GFX9: TBUFFER_STORE_FORMAT_X_OFFSET_exact %3, %13, 0, 28, 114, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
+# GFX9: TBUFFER_STORE_FORMAT_X_OFFSET_exact %2, %13, 0, 36, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
+# GFX9: TBUFFER_STORE_FORMAT_X_OFFSET_exact %1, %13, 0, 40, 114, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
+# GFX9: TBUFFER_STORE_FORMAT_X_OFFSET_exact %0, %13, 0, 44, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
 name: gfx9_tbuffer_store_not_merged_num_format_mismatch
 body:             |
   bb.0.entry:
@@ -732,22 +732,22 @@ body:             |
     %1:sgpr_32 = COPY $sgpr1
     %0:sgpr_32 = COPY $sgpr0
     %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %4:vgpr_32, %13:sgpr_128, 0, 4, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %5:vgpr_32, %13:sgpr_128, 0, 8, 114, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %6:vgpr_32, %13:sgpr_128, 0, 16, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 20, 114, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %8:vgpr_32, %13:sgpr_128, 0, 24, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %9:vgpr_32, %13:sgpr_128, 0, 28, 114, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %10:vgpr_32, %13:sgpr_128, 0, 36, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %11:vgpr_32, %13:sgpr_128, 0, 40, 114, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %12:vgpr_32, %13:sgpr_128, 0, 44, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %4:vgpr_32, %13:sgpr_128, 0, 4, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %5:vgpr_32, %13:sgpr_128, 0, 8, 114, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %6:vgpr_32, %13:sgpr_128, 0, 16, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 20, 114, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %8:vgpr_32, %13:sgpr_128, 0, 24, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %9:vgpr_32, %13:sgpr_128, 0, 28, 114, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %10:vgpr_32, %13:sgpr_128, 0, 36, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %11:vgpr_32, %13:sgpr_128, 0, 40, 114, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %12:vgpr_32, %13:sgpr_128, 0, 44, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
 ...
 ---
 
 
 # GFX9-LABEL: name: gfx9_tbuffer_load_not_merged_swizzled_0
-# GFX9: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 4, 116, 0, 0, 0, 0, 1, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
-# GFX9: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 8, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+# GFX9: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 4, 116, 0, 0, 0, 0, 1, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+# GFX9: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 8, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
 name: gfx9_tbuffer_load_not_merged_swizzled_0
 body:             |
   bb.0.entry:
@@ -756,15 +756,15 @@ body:             |
     %2:sgpr_32 = COPY $sgpr2
     %3:sgpr_32 = COPY $sgpr3
     %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 116, 0, 0, 0, 0, 1, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
-    %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 8, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+    %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 116, 0, 0, 0, 0, 1, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+    %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 8, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
 ...
 ---
 
 
 # GFX9-LABEL: name: gfx9_tbuffer_load_not_merged_swizzled_1
-# GFX9: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 4, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
-# GFX9: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 8, 116, 0, 0, 0, 0, 1, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+# GFX9: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 4, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+# GFX9: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 8, 116, 0, 0, 0, 0, 1, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
 name: gfx9_tbuffer_load_not_merged_swizzled_1
 body:             |
   bb.0.entry:
@@ -773,8 +773,8 @@ body:             |
     %2:sgpr_32 = COPY $sgpr2
     %3:sgpr_32 = COPY $sgpr3
     %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
-    %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 8, 116, 0, 0, 0, 0, 1, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+    %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+    %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 8, 116, 0, 0, 0, 0, 1, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
 ...
 ---
 
@@ -784,7 +784,7 @@ body:             |
 #
 
 # GFX10-LABEL: name: gfx10_tbuffer_load_x_xyz
-# GFX10: %{{[0-9]+}}:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFSET %4, 0, 4, 77, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16, align 1, addrspace 4)
+# GFX10: %{{[0-9]+}}:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFSET %4, 0, 4, 77, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16, align 1, addrspace 4)
 # GFX10: %{{[0-9]+}}:vgpr_32 = COPY %7.sub0
 # GFX10: %{{[0-9]+}}:vreg_96 = COPY killed %7.sub1_sub2_sub3
 name: gfx10_tbuffer_load_x_xyz
@@ -795,13 +795,13 @@ body:             |
     %2:sgpr_32 = COPY $sgpr2
     %3:sgpr_32 = COPY $sgpr3
     %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
-    %8:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET %5:sgpr_128, 0, 8, 74, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 1, addrspace 4)
+    %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 22, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+    %8:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET %5:sgpr_128, 0, 8, 74, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 1, addrspace 4)
 ...
 ---
 
 # GFX10-LABEL: name: gfx10_tbuffer_load_xyz_x
-# GFX10: %{{[0-9]+}}:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFSET %4, 0, 4, 77, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16, align 1, addrspace 4)
+# GFX10: %{{[0-9]+}}:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFSET %4, 0, 4, 77, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16, align 1, addrspace 4)
 # GFX10: %{{[0-9]+}}:vreg_96 = COPY %7.sub0_sub1_sub2
 # GFX10: %{{[0-9]+}}:vgpr_32 = COPY killed %7.sub3
 name: gfx10_tbuffer_load_xyz_x
@@ -812,13 +812,13 @@ body:             |
     %2:sgpr_32 = COPY $sgpr2
     %3:sgpr_32 = COPY $sgpr3
     %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    %7:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET %5:sgpr_128, 0, 4, 74, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
-    %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 16, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 1, addrspace 4)
+    %7:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET %5:sgpr_128, 0, 4, 74, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+    %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 16, 22, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 1, addrspace 4)
 ...
 ---
 
 # GFX10-LABEL: name: gfx10_tbuffer_load_xy_xy
-# GFX10: %{{[0-9]+}}:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFSET %4, 0, 4, 77, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16, align 1, addrspace 4)
+# GFX10: %{{[0-9]+}}:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFSET %4, 0, 4, 77, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16, align 1, addrspace 4)
 # GFX10: %{{[0-9]+}}:vreg_64 = COPY %7.sub0_sub1
 # GFX10: %{{[0-9]+}}:vreg_64 = COPY killed %7.sub2_sub3
 name: gfx10_tbuffer_load_xy_xy
@@ -829,13 +829,13 @@ body:             |
     %2:sgpr_32 = COPY $sgpr2
     %3:sgpr_32 = COPY $sgpr3
     %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    %7:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET %5:sgpr_128, 0, 4, 64, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8, align 1, addrspace 4)
-    %8:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET %5:sgpr_128, 0, 12, 64, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8, align 1, addrspace 4)
+    %7:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET %5:sgpr_128, 0, 4, 64, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8, align 1, addrspace 4)
+    %8:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET %5:sgpr_128, 0, 12, 64, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8, align 1, addrspace 4)
 ...
 ---
 
 # GFX10-LABEL: name: gfx10_tbuffer_load_x_xy
-# GFX10: %{{[0-9]+}}:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET %4, 0, 4, 74, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 1, addrspace 4)
+# GFX10: %{{[0-9]+}}:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET %4, 0, 4, 74, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 1, addrspace 4)
 # GFX10: %{{[0-9]+}}:vgpr_32 = COPY %7.sub0
 # GFX10: %{{[0-9]+}}:vreg_64 = COPY killed %7.sub1_sub2
 name: gfx10_tbuffer_load_x_xy
@@ -846,13 +846,13 @@ body:             |
     %2:sgpr_32 = COPY $sgpr2
     %3:sgpr_32 = COPY $sgpr3
     %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
-    %8:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET %5:sgpr_128, 0, 8, 64, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8, align 1, addrspace 4)
+    %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 22, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+    %8:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET %5:sgpr_128, 0, 8, 64, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8, align 1, addrspace 4)
 ...
 ---
 
 # GFX10-LABEL: name: gfx10_tbuffer_load_xy_x
-# GFX10: %{{[0-9]+}}:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET %4, 0, 4, 74, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 1, addrspace 4)
+# GFX10: %{{[0-9]+}}:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET %4, 0, 4, 74, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 1, addrspace 4)
 # GFX10: %{{[0-9]+}}:vreg_64 = COPY %7.sub0_sub1
 # GFX10: %{{[0-9]+}}:vgpr_32 = COPY killed %7.sub2
 name: gfx10_tbuffer_load_xy_x
@@ -863,14 +863,14 @@ body:             |
     %2:sgpr_32 = COPY $sgpr2
     %3:sgpr_32 = COPY $sgpr3
     %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    %7:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET %5:sgpr_128, 0, 4, 64, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8, align 1, addrspace 4)
-    %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 12, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+    %7:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET %5:sgpr_128, 0, 4, 64, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8, align 1, addrspace 4)
+    %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 12, 22, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
 ...
 ---
 
 
 # GFX10-LABEL: name: gfx10_tbuffer_load_x_x
-# GFX10: %{{[0-9]+}}:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET %4, 0, 4, 64, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8, align 1, addrspace 4)
+# GFX10: %{{[0-9]+}}:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET %4, 0, 4, 64, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8, align 1, addrspace 4)
 # GFX10: %{{[0-9]+}}:vgpr_32 = COPY %7.sub0
 # GFX10: %{{[0-9]+}}:vgpr_32 = COPY killed %7.sub1
 
@@ -882,13 +882,13 @@ body:             |
     %2:sgpr_32 = COPY $sgpr2
     %3:sgpr_32 = COPY $sgpr3
     %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
-    %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 8, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+    %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 22, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+    %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 8, 22, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
 ...
 ---
 
 # GFX10-LABEL: name: gfx10_tbuffer_load_x_x_format_32_32_32_32
-# GFX10: %{{[0-9]+}}:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET %4, 0, 4, 64, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8, align 1, addrspace 4)
+# GFX10: %{{[0-9]+}}:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET %4, 0, 4, 64, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8, align 1, addrspace 4)
 # GFX10: %{{[0-9]+}}:vgpr_32 = COPY %7.sub0
 # GFX10: %{{[0-9]+}}:vgpr_32 = COPY killed %7.sub1
 
@@ -900,24 +900,24 @@ body:             |
     %2:sgpr_32 = COPY $sgpr2
     %3:sgpr_32 = COPY $sgpr3
     %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 77, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
-    %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 8, 77, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+    %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 77, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+    %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 8, 77, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
 ...
 ---
 
 
 # GFX10-LABEL: name: gfx10_tbuffer_load_float_32
-# GFX10: %{{[0-9]+}}:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET %4, 0, 4, 64, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8, align 1, addrspace 4)
+# GFX10: %{{[0-9]+}}:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET %4, 0, 4, 64, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8, align 1, addrspace 4)
 # GFX10: %{{[0-9]+}}:vgpr_32 = COPY %14.sub0
 # GFX10: %{{[0-9]+}}:vgpr_32 = COPY killed %14.sub1
-# GFX10: %{{[0-9]+}}:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFSET %4, 0, 16, 77, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16, align 1, addrspace 4)
+# GFX10: %{{[0-9]+}}:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFSET %4, 0, 16, 77, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16, align 1, addrspace 4)
 # GFX10: %{{[0-9]+}}:vreg_96 = COPY %17.sub0_sub1_sub2
 # GFX10: %{{[0-9]+}}:vgpr_32 = COPY killed %17.sub3
 # GFX10: %{{[0-9]+}}:vreg_64 = COPY %16.sub0_sub1
 # GFX10: %{{[0-9]+}}:vgpr_32 = COPY killed %16.sub2
 # GFX10: %{{[0-9]+}}:vgpr_32 = COPY %15.sub0
 # GFX10: %{{[0-9]+}}:vgpr_32 = COPY killed %15.sub1
-# GFX10: %{{[0-9]+}}:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET %4, 0, 36, 74, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 1, addrspace 4)
+# GFX10: %{{[0-9]+}}:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET %4, 0, 36, 74, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 1, addrspace 4)
 # GFX10: %{{[0-9]+}}:vreg_64 = COPY %19.sub0_sub1
 # GFX10: %{{[0-9]+}}:vgpr_32 = COPY killed %19.sub2
 # GFX10: %{{[0-9]+}}:vgpr_32 = COPY %18.sub0
@@ -931,30 +931,30 @@ body:             |
     %2:sgpr_32 = COPY $sgpr2
     %3:sgpr_32 = COPY $sgpr3
     %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
-    %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 8, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
-    %9:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 16, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
-    %10:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 20, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
-    %11:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 24, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
-    %12:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 28, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
-    %13:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 36, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
-    %14:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 40, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
-    %15:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 44, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+    %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 22, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+    %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 8, 22, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+    %9:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 16, 22, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+    %10:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 20, 22, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+    %11:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 24, 22, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+    %12:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 28, 22, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+    %13:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 36, 22, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+    %14:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 40, 22, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+    %15:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 44, 22, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
 ...
 ---
 
 # GFX10-LABEL: name: gfx10_tbuffer_load_sint_32
-# GFX10: %{{[0-9]+}}:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET %4, 0, 4, 63, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8, align 1, addrspace 4)
+# GFX10: %{{[0-9]+}}:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET %4, 0, 4, 63, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8, align 1, addrspace 4)
 # GFX10: %{{[0-9]+}}:vgpr_32 = COPY %14.sub0
 # GFX10: %{{[0-9]+}}:vgpr_32 = COPY killed %14.sub1
-# GFX10: %{{[0-9]+}}:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFSET %4, 0, 16, 76, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16, align 1, addrspace 4)
+# GFX10: %{{[0-9]+}}:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFSET %4, 0, 16, 76, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16, align 1, addrspace 4)
 # GFX10: %{{[0-9]+}}:vreg_96 = COPY %17.sub0_sub1_sub2
 # GFX10: %{{[0-9]+}}:vgpr_32 = COPY killed %17.sub3
 # GFX10: %{{[0-9]+}}:vreg_64 = COPY %16.sub0_sub1
 # GFX10: %{{[0-9]+}}:vgpr_32 = COPY killed %16.sub2
 # GFX10: %{{[0-9]+}}:vgpr_32 = COPY %15.sub0
 # GFX10: %{{[0-9]+}}:vgpr_32 = COPY killed %15.sub1
-# GFX10: %{{[0-9]+}}:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET %4, 0, 36, 73, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 1, addrspace 4)
+# GFX10: %{{[0-9]+}}:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET %4, 0, 36, 73, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 1, addrspace 4)
 # GFX10: %{{[0-9]+}}:vreg_64 = COPY %19.sub0_sub1
 # GFX10: %{{[0-9]+}}:vgpr_32 = COPY killed %19.sub2
 # GFX10: %{{[0-9]+}}:vgpr_32 = COPY %18.sub0
@@ -968,30 +968,30 @@ body:             |
     %2:sgpr_32 = COPY $sgpr2
     %3:sgpr_32 = COPY $sgpr3
     %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 21, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
-    %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 8, 21, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
-    %9:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 16, 21, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
-    %10:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 20, 21, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
-    %11:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 24, 21, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
-    %12:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 28, 21, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
-    %13:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 36, 21, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
-    %14:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 40, 21, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
-    %15:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 44, 21, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+    %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 21, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+    %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 8, 21, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+    %9:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 16, 21, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+    %10:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 20, 21, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+    %11:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 24, 21, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+    %12:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 28, 21, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+    %13:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 36, 21, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+    %14:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 40, 21, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+    %15:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 44, 21, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
 ...
 ---
 
 # GFX10-LABEL: name: gfx10_tbuffer_load_uint_32
-# GFX10: %{{[0-9]+}}:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET %4, 0, 4, 62, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8, align 1, addrspace 4)
+# GFX10: %{{[0-9]+}}:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET %4, 0, 4, 62, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8, align 1, addrspace 4)
 # GFX10: %{{[0-9]+}}:vgpr_32 = COPY %14.sub0
 # GFX10: %{{[0-9]+}}:vgpr_32 = COPY killed %14.sub1
-# GFX10: %{{[0-9]+}}:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFSET %4, 0, 16, 75, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16, align 1, addrspace 4)
+# GFX10: %{{[0-9]+}}:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFSET %4, 0, 16, 75, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16, align 1, addrspace 4)
 # GFX10: %{{[0-9]+}}:vreg_96 = COPY %17.sub0_sub1_sub2
 # GFX10: %{{[0-9]+}}:vgpr_32 = COPY killed %17.sub3
 # GFX10: %{{[0-9]+}}:vreg_64 = COPY %16.sub0_sub1
 # GFX10: %{{[0-9]+}}:vgpr_32 = COPY killed %16.sub2
 # GFX10: %{{[0-9]+}}:vgpr_32 = COPY %15.sub0
 # GFX10: %{{[0-9]+}}:vgpr_32 = COPY killed %15.sub1
-# GFX10: %{{[0-9]+}}:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET %4, 0, 36, 72, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 1, addrspace 4)
+# GFX10: %{{[0-9]+}}:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET %4, 0, 36, 72, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 1, addrspace 4)
 # GFX10: %{{[0-9]+}}:vreg_64 = COPY %19.sub0_sub1
 # GFX10: %{{[0-9]+}}:vgpr_32 = COPY killed %19.sub2
 # GFX10: %{{[0-9]+}}:vgpr_32 = COPY %18.sub0
@@ -1005,15 +1005,15 @@ body:             |
     %2:sgpr_32 = COPY $sgpr2
     %3:sgpr_32 = COPY $sgpr3
     %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 20, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
-    %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 8, 20, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
-    %9:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 16, 20, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
-    %10:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 20, 20, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
-    %11:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 24, 20, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
-    %12:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 28, 20, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
-    %13:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 36, 20, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
-    %14:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 40, 20, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
-    %15:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 44, 20, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+    %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 20, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+    %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 8, 20, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+    %9:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 16, 20, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+    %10:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 20, 20, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+    %11:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 24, 20, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+    %12:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 28, 20, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+    %13:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 36, 20, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+    %14:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 40, 20, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+    %15:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 44, 20, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
 ...
 ---
 
@@ -1023,15 +1023,15 @@ body:             |
 # GFX10: %{{[0-9]+}}:sgpr_32 = COPY $sgpr2
 # GFX10: %{{[0-9]+}}:sgpr_32 = COPY $sgpr3
 # GFX10: %{{[0-9]+}}:sgpr_128 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1, %2, %subreg.sub2, %3, %subreg.sub3
-# GFX10: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 4, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
-# GFX10: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 8, 13, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
-# GFX10: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 16, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
-# GFX10: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 20, 13, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
-# GFX10: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 24, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
-# GFX10: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 28, 13, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
-# GFX10: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 36, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
-# GFX10: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 40, 13, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
-# GFX10: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 44, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+# GFX10: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 4, 22, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+# GFX10: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 8, 13, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+# GFX10: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 16, 22, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+# GFX10: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 20, 13, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+# GFX10: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 24, 22, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+# GFX10: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 28, 13, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+# GFX10: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 36, 22, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+# GFX10: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 40, 13, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+# GFX10: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 44, 22, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
 
 
 name: gfx10_tbuffer_load_not_merged_data_format_mismatch
@@ -1042,15 +1042,15 @@ body:             |
     %2:sgpr_32 = COPY $sgpr2
     %3:sgpr_32 = COPY $sgpr3
     %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
-    %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 8, 13, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
-    %9:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 16, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
-    %10:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 20, 13, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
-    %11:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 24, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
-    %12:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 28, 13, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
-    %13:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 36, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
-    %14:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 40, 13, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
-    %15:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 44, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+    %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 22, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+    %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 8, 13, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+    %9:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 16, 22, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+    %10:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 20, 13, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+    %11:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 24, 22, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+    %12:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 28, 13, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+    %13:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 36, 22, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+    %14:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 40, 13, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+    %15:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 44, 22, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
 ...
 ---
 
@@ -1060,15 +1060,15 @@ body:             |
 # GFX10: %{{[0-9]+}}:sgpr_32 = COPY $sgpr2
 # GFX10: %{{[0-9]+}}:sgpr_32 = COPY $sgpr3
 # GFX10: %{{[0-9]+}}:sgpr_128 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1, %2, %subreg.sub2, %3, %subreg.sub3
-# GFX10: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 4, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
-# GFX10: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 8, 21, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
-# GFX10: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 16, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
-# GFX10: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 20, 21, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
-# GFX10: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 24, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
-# GFX10: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 28, 21, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
-# GFX10: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 36, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
-# GFX10: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 40, 21, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
-# GFX10: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 44, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+# GFX10: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 4, 22, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+# GFX10: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 8, 21, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+# GFX10: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 16, 22, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+# GFX10: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 20, 21, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+# GFX10: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 24, 22, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+# GFX10: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 28, 21, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+# GFX10: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 36, 22, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+# GFX10: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 40, 21, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+# GFX10: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 44, 22, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
 name: gfx10_tbuffer_load_not_merged_num_format_mismatch
 body:             |
   bb.0.entry:
@@ -1077,15 +1077,15 @@ body:             |
     %2:sgpr_32 = COPY $sgpr2
     %3:sgpr_32 = COPY $sgpr3
     %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
-    %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 8, 21, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
-    %9:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 16, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
-    %10:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 20, 21, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
-    %11:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 24, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
-    %12:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 28, 21, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
-    %13:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 36, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
-    %14:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 40, 21, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
-    %15:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 44, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+    %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 22, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+    %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 8, 21, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+    %9:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 16, 22, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+    %10:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 20, 21, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+    %11:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 24, 22, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+    %12:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 28, 21, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+    %13:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 36, 22, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+    %14:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 40, 21, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+    %15:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 44, 22, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
 ...
 ---
 
@@ -1094,7 +1094,7 @@ body:             |
 # GFX10-LABEL: name: gfx10_tbuffer_store_x_xyz
 # GFX10: %{{[0-9]+}}:vreg_96 = REG_SEQUENCE %3, %subreg.sub0, %2, %subreg.sub1, %1, %subreg.sub2
 # GFX10: %{{[0-9]+}}:vreg_128 = REG_SEQUENCE %0, %subreg.sub0, %9, %subreg.sub1_sub2_sub3
-# GFX10: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed %10, %8, 0, 4, 77, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16, align 1, addrspace 4)
+# GFX10: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed %10, %8, 0, 4, 77, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16, align 1, addrspace 4)
 name: gfx10_tbuffer_store_x_xyz
 body:             |
   bb.0.entry:
@@ -1109,8 +1109,8 @@ body:             |
     %0:sgpr_32 = COPY $sgpr0
     %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
     %14:vreg_96 = REG_SEQUENCE %4:vgpr_32, %subreg.sub0, %5:vgpr_32, %subreg.sub1, %6:vgpr_32, %subreg.sub2
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 4, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact %14:vreg_96, %13:sgpr_128, 0, 8, 74, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12, align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 4, 22, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact %14:vreg_96, %13:sgpr_128, 0, 8, 74, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12, align 1, addrspace 4)
 ...
 ---
 
@@ -1118,7 +1118,7 @@ body:             |
 # GFX10-LABEL: name: gfx10_tbuffer_store_xyz_x
 # GFX10: %{{[0-9]+}}:vreg_96 = REG_SEQUENCE %3, %subreg.sub0, %2, %subreg.sub1, %1, %subreg.sub2
 # GFX10: %{{[0-9]+}}:vreg_128 = REG_SEQUENCE %9, %subreg.sub0_sub1_sub2, %0, %subreg.sub3
-# GFX10: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed %10, %8, 0, 4, 77, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16, align 1, addrspace 4)
+# GFX10: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed %10, %8, 0, 4, 77, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16, align 1, addrspace 4)
 name: gfx10_tbuffer_store_xyz_x
 body:             |
   bb.0.entry:
@@ -1133,8 +1133,8 @@ body:             |
     %0:sgpr_32 = COPY $sgpr0
     %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
     %14:vreg_96 = REG_SEQUENCE %4:vgpr_32, %subreg.sub0, %5:vgpr_32, %subreg.sub1, %6:vgpr_32, %subreg.sub2
-    TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact %14:vreg_96, %13:sgpr_128, 0, 4, 74, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12, align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 16, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact %14:vreg_96, %13:sgpr_128, 0, 4, 74, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12, align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 16, 22, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
 ...
 ---
 
@@ -1142,7 +1142,7 @@ body:             |
 # GFX10: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %3, %subreg.sub0, %2, %subreg.sub1
 # GFX10: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %1, %subreg.sub0, %0, %subreg.sub1
 # GFX10: %{{[0-9]+}}:vreg_128 = REG_SEQUENCE %9, %subreg.sub0_sub1, %10, %subreg.sub2_sub3
-# GFX10: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed %11, %8, 0, 4, 77, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16, align 1, addrspace 4)
+# GFX10: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed %11, %8, 0, 4, 77, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16, align 1, addrspace 4)
 name: gfx10_tbuffer_store_xy_xy
 body:             |
   bb.0.entry:
@@ -1158,15 +1158,15 @@ body:             |
     %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
     %14:vreg_64 = REG_SEQUENCE %4:vgpr_32, %subreg.sub0, %5:vgpr_32, %subreg.sub1
     %15:vreg_64 = REG_SEQUENCE %6:vgpr_32, %subreg.sub0, %7:vgpr_32, %subreg.sub1
-    TBUFFER_STORE_FORMAT_XY_OFFSET_exact %14:vreg_64, %13:sgpr_128, 0, 4, 64, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8, align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_XY_OFFSET_exact %15:vreg_64, %13:sgpr_128, 0, 12, 64, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8, align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_XY_OFFSET_exact %14:vreg_64, %13:sgpr_128, 0, 4, 64, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8, align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_XY_OFFSET_exact %15:vreg_64, %13:sgpr_128, 0, 12, 64, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8, align 1, addrspace 4)
 ...
 ---
 
 # GFX10-LABEL: name: gfx10_tbuffer_store_x_xy
 # GFX10: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %3, %subreg.sub0, %2, %subreg.sub1
 # GFX10: %{{[0-9]+}}:vreg_64, %subreg.sub1_sub2
-# GFX10: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact killed %11, %8, 0, 4, 74, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12, align 1, addrspace 4)
+# GFX10: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact killed %11, %8, 0, 4, 74, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12, align 1, addrspace 4)
 name: gfx10_tbuffer_store_x_xy
 body:             |
   bb.0.entry:
@@ -1181,15 +1181,15 @@ body:             |
     %0:sgpr_32 = COPY $sgpr0
     %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
     %14:vreg_64 = REG_SEQUENCE %4:vgpr_32, %subreg.sub0, %5:vgpr_32, %subreg.sub1
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 4, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_XY_OFFSET_exact %15:vreg_64, %13:sgpr_128, 0, 8, 64, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8, align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 4, 22, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_XY_OFFSET_exact %15:vreg_64, %13:sgpr_128, 0, 8, 64, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8, align 1, addrspace 4)
 ...
 ---
 
 # GFX10-LABEL: name: gfx10_tbuffer_store_xy_x
 # GFX10: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %3, %subreg.sub0, %2, %subreg.sub1
 # GFX10: %{{[0-9]+}}:vreg_96 = REG_SEQUENCE %9, %subreg.sub0_sub1, %0, %subreg.sub2
-# GFX10: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact killed %10, %8, 0, 4, 74, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12, align 1, addrspace 4)
+# GFX10: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact killed %10, %8, 0, 4, 74, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12, align 1, addrspace 4)
 name: gfx10_tbuffer_store_xy_x
 body:             |
   bb.0.entry:
@@ -1205,15 +1205,15 @@ body:             |
     %0:sgpr_32 = COPY $sgpr0
     %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
     %14:vreg_64 = REG_SEQUENCE %4:vgpr_32, %subreg.sub0, %5:vgpr_32, %subreg.sub1
-    TBUFFER_STORE_FORMAT_XY_OFFSET_exact %14:vreg_64, %13:sgpr_128, 0, 4, 64, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8, align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 12, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_XY_OFFSET_exact %14:vreg_64, %13:sgpr_128, 0, 4, 64, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8, align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 12, 22, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
 ...
 ---
 
 
 # GFX10-LABEL: name: gfx10_tbuffer_store_x_x
 # GFX10: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %1, %subreg.sub0, %0, %subreg.sub1
-# GFX10: TBUFFER_STORE_FORMAT_XY_OFFSET_exact killed %9, %8, 0, 4, 64, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8, align 1, addrspace 4)
+# GFX10: TBUFFER_STORE_FORMAT_XY_OFFSET_exact killed %9, %8, 0, 4, 64, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8, align 1, addrspace 4)
 name: gfx10_tbuffer_store_x_x
 body:             |
   bb.0.entry:
@@ -1227,14 +1227,14 @@ body:             |
     %1:sgpr_32 = COPY $sgpr1
     %0:sgpr_32 = COPY $sgpr0
     %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %6:vgpr_32, %13:sgpr_128, 0, 4, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 8, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %6:vgpr_32, %13:sgpr_128, 0, 4, 22, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 8, 22, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
 ...
 ---
 
 # GFX10-LABEL: name: gfx10_tbuffer_store_x_x_format_32_32_32_32
 # GFX10: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %1, %subreg.sub0, %0, %subreg.sub1
-# GFX10: TBUFFER_STORE_FORMAT_XY_OFFSET_exact killed %9, %8, 0, 4, 64, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8, align 1, addrspace 4)
+# GFX10: TBUFFER_STORE_FORMAT_XY_OFFSET_exact killed %9, %8, 0, 4, 64, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8, align 1, addrspace 4)
 name: gfx10_tbuffer_store_x_x_format_32_32_32_32
 body:             |
   bb.0.entry:
@@ -1248,8 +1248,8 @@ body:             |
     %1:sgpr_32 = COPY $sgpr1
     %0:sgpr_32 = COPY $sgpr0
     %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %6:vgpr_32, %13:sgpr_128, 0, 4, 77, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 8, 77, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %6:vgpr_32, %13:sgpr_128, 0, 4, 77, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 8, 77, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
 ...
 ---
 
@@ -1269,14 +1269,14 @@ body:             |
 # GFX10: %{{[0-9]+}}:sgpr_32 = COPY $sgpr0
 # GFX10: %{{[0-9]+}}:sgpr_128 = REG_SEQUENCE %12, %subreg.sub0, %11, %subreg.sub1, %10, %subreg.sub2, %9, %subreg.sub3
 # GFX10: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %8, %subreg.sub0, %7, %subreg.sub1
-# GFX10: TBUFFER_STORE_FORMAT_XY_OFFSET_exact killed %14, %13, 0, 4, 64, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8, align 1, addrspace 4)
+# GFX10: TBUFFER_STORE_FORMAT_XY_OFFSET_exact killed %14, %13, 0, 4, 64, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8, align 1, addrspace 4)
 # GFX10: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %6, %subreg.sub0, %5, %subreg.sub1
 # GFX10: %{{[0-9]+}}:vreg_96 = REG_SEQUENCE killed %15, %subreg.sub0_sub1, %4, %subreg.sub2
 # GFX10: %{{[0-9]+}}:vreg_128 = REG_SEQUENCE killed %16, %subreg.sub0_sub1_sub2, %3, %subreg.sub3
-# GFX10: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed %17, %13, 0, 16, 77, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16, align 1, addrspace 4)
+# GFX10: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed %17, %13, 0, 16, 77, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16, align 1, addrspace 4)
 # GFX10: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %2, %subreg.sub0, %1, %subreg.sub1
 # GFX10: %{{[0-9]+}}:vreg_96 = REG_SEQUENCE killed %18, %subreg.sub0_sub1, %0, %subreg.sub2
-# GFX10: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact killed %19, %13, 0, 36, 74, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12, align 1, addrspace 4)
+# GFX10: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact killed %19, %13, 0, 36, 74, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12, align 1, addrspace 4)
 name: gfx10_tbuffer_store_float32
 body:             |
   bb.0.entry:
@@ -1295,15 +1295,15 @@ body:             |
     %1:sgpr_32 = COPY $sgpr1
     %0:sgpr_32 = COPY $sgpr0
     %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %4:vgpr_32, %13:sgpr_128, 0, 4, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %5:vgpr_32, %13:sgpr_128, 0, 8, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %6:vgpr_32, %13:sgpr_128, 0, 16, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 20, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %8:vgpr_32, %13:sgpr_128, 0, 24, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %9:vgpr_32, %13:sgpr_128, 0, 28, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %10:vgpr_32, %13:sgpr_128, 0, 36, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %11:vgpr_32, %13:sgpr_128, 0, 40, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %12:vgpr_32, %13:sgpr_128, 0, 44, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %4:vgpr_32, %13:sgpr_128, 0, 4, 22, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %5:vgpr_32, %13:sgpr_128, 0, 8, 22, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %6:vgpr_32, %13:sgpr_128, 0, 16, 22, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 20, 22, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %8:vgpr_32, %13:sgpr_128, 0, 24, 22, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %9:vgpr_32, %13:sgpr_128, 0, 28, 22, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %10:vgpr_32, %13:sgpr_128, 0, 36, 22, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %11:vgpr_32, %13:sgpr_128, 0, 40, 22, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %12:vgpr_32, %13:sgpr_128, 0, 44, 22, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
 ...
 ---
 
@@ -1323,14 +1323,14 @@ body:             |
 # GFX10: %{{[0-9]+}}:sgpr_32 = COPY $sgpr0
 # GFX10: %{{[0-9]+}}:sgpr_128 = REG_SEQUENCE %12, %subreg.sub0, %11, %subreg.sub1, %10, %subreg.sub2, %9, %subreg.sub3
 # GFX10: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %8, %subreg.sub0, %7, %subreg.sub1
-# GFX10: TBUFFER_STORE_FORMAT_XY_OFFSET_exact killed %14, %13, 0, 4, 63, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8, align 1, addrspace 4)
+# GFX10: TBUFFER_STORE_FORMAT_XY_OFFSET_exact killed %14, %13, 0, 4, 63, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8, align 1, addrspace 4)
 # GFX10: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %6, %subreg.sub0, %5, %subreg.sub1
 # GFX10: %{{[0-9]+}}:vreg_96 = REG_SEQUENCE killed %15, %subreg.sub0_sub1, %4, %subreg.sub2
 # GFX10: %{{[0-9]+}}:vreg_128 = REG_SEQUENCE killed %16, %subreg.sub0_sub1_sub2, %3, %subreg.sub3
-# GFX10: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed %17, %13, 0, 16, 76, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16, align 1, addrspace 4)
+# GFX10: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed %17, %13, 0, 16, 76, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16, align 1, addrspace 4)
 # GFX10: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %2, %subreg.sub0, %1, %subreg.sub1
 # GFX10: %{{[0-9]+}}:vreg_96 = REG_SEQUENCE killed %18, %subreg.sub0_sub1, %0, %subreg.sub2
-# GFX10: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact killed %19, %13, 0, 36, 73, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12, align 1, addrspace 4)
+# GFX10: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact killed %19, %13, 0, 36, 73, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12, align 1, addrspace 4)
 name: gfx10_tbuffer_store_sint32
 body:             |
   bb.0.entry:
@@ -1349,15 +1349,15 @@ body:             |
     %1:sgpr_32 = COPY $sgpr1
     %0:sgpr_32 = COPY $sgpr0
     %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %4:vgpr_32, %13:sgpr_128, 0, 4, 21, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %5:vgpr_32, %13:sgpr_128, 0, 8, 21, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %6:vgpr_32, %13:sgpr_128, 0, 16, 21, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 20, 21, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %8:vgpr_32, %13:sgpr_128, 0, 24, 21, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %9:vgpr_32, %13:sgpr_128, 0, 28, 21, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %10:vgpr_32, %13:sgpr_128, 0, 36, 21, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %11:vgpr_32, %13:sgpr_128, 0, 40, 21, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %12:vgpr_32, %13:sgpr_128, 0, 44, 21, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %4:vgpr_32, %13:sgpr_128, 0, 4, 21, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %5:vgpr_32, %13:sgpr_128, 0, 8, 21, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %6:vgpr_32, %13:sgpr_128, 0, 16, 21, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 20, 21, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %8:vgpr_32, %13:sgpr_128, 0, 24, 21, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %9:vgpr_32, %13:sgpr_128, 0, 28, 21, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %10:vgpr_32, %13:sgpr_128, 0, 36, 21, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %11:vgpr_32, %13:sgpr_128, 0, 40, 21, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %12:vgpr_32, %13:sgpr_128, 0, 44, 21, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
 ...
 ---
 
@@ -1377,14 +1377,14 @@ body:             |
 # GFX10: %{{[0-9]+}}:sgpr_32 = COPY $sgpr0
 # GFX10: %{{[0-9]+}}:sgpr_128 = REG_SEQUENCE %12, %subreg.sub0, %11, %subreg.sub1, %10, %subreg.sub2, %9, %subreg.sub3
 # GFX10: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %8, %subreg.sub0, %7, %subreg.sub1
-# GFX10: TBUFFER_STORE_FORMAT_XY_OFFSET_exact killed %14, %13, 0, 4, 62, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8, align 1, addrspace 4)
+# GFX10: TBUFFER_STORE_FORMAT_XY_OFFSET_exact killed %14, %13, 0, 4, 62, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8, align 1, addrspace 4)
 # GFX10: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %6, %subreg.sub0, %5, %subreg.sub1
 # GFX10: %{{[0-9]+}}:vreg_96 = REG_SEQUENCE killed %15, %subreg.sub0_sub1, %4, %subreg.sub2
 # GFX10: %{{[0-9]+}}:vreg_128 = REG_SEQUENCE killed %16, %subreg.sub0_sub1_sub2, %3, %subreg.sub3
-# GFX10: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed %17, %13, 0, 16, 75, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16, align 1, addrspace 4)
+# GFX10: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed %17, %13, 0, 16, 75, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16, align 1, addrspace 4)
 # GFX10: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %2, %subreg.sub0, %1, %subreg.sub1
 # GFX10: %{{[0-9]+}}:vreg_96 = REG_SEQUENCE killed %18, %subreg.sub0_sub1, %0, %subreg.sub2
-# GFX10: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact killed %19, %13, 0, 36, 72, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12, align 1, addrspace 4)
+# GFX10: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact killed %19, %13, 0, 36, 72, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12, align 1, addrspace 4)
 name: gfx10_tbuffer_store_uint32
 body:             |
   bb.0.entry:
@@ -1403,15 +1403,15 @@ body:             |
     %1:sgpr_32 = COPY $sgpr1
     %0:sgpr_32 = COPY $sgpr0
     %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %4:vgpr_32, %13:sgpr_128, 0, 4, 20, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %5:vgpr_32, %13:sgpr_128, 0, 8, 20, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %6:vgpr_32, %13:sgpr_128, 0, 16, 20, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 20, 20, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %8:vgpr_32, %13:sgpr_128, 0, 24, 20, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %9:vgpr_32, %13:sgpr_128, 0, 28, 20, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %10:vgpr_32, %13:sgpr_128, 0, 36, 20, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %11:vgpr_32, %13:sgpr_128, 0, 40, 20, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %12:vgpr_32, %13:sgpr_128, 0, 44, 20, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %4:vgpr_32, %13:sgpr_128, 0, 4, 20, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %5:vgpr_32, %13:sgpr_128, 0, 8, 20, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %6:vgpr_32, %13:sgpr_128, 0, 16, 20, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 20, 20, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %8:vgpr_32, %13:sgpr_128, 0, 24, 20, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %9:vgpr_32, %13:sgpr_128, 0, 28, 20, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %10:vgpr_32, %13:sgpr_128, 0, 36, 20, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %11:vgpr_32, %13:sgpr_128, 0, 40, 20, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %12:vgpr_32, %13:sgpr_128, 0, 44, 20, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
 ...
 ---
 
@@ -1430,15 +1430,15 @@ body:             |
 # GFX10: %{{[0-9]+}}:sgpr_32 = COPY $sgpr1
 # GFX10: %{{[0-9]+}}:sgpr_32 = COPY $sgpr0
 # GFX10: %{{[0-9]+}}:sgpr_128 = REG_SEQUENCE %12, %subreg.sub0, %11, %subreg.sub1, %10, %subreg.sub2, %9, %subreg.sub3
-# GFX10: TBUFFER_STORE_FORMAT_X_OFFSET_exact %8, %13, 0, 4, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-# GFX10: TBUFFER_STORE_FORMAT_X_OFFSET_exact %7, %13, 0, 8, 21, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-# GFX10: TBUFFER_STORE_FORMAT_X_OFFSET_exact %6, %13, 0, 16, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-# GFX10: TBUFFER_STORE_FORMAT_X_OFFSET_exact %5, %13, 0, 20, 21, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-# GFX10: TBUFFER_STORE_FORMAT_X_OFFSET_exact %4, %13, 0, 24, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-# GFX10: TBUFFER_STORE_FORMAT_X_OFFSET_exact %3, %13, 0, 28, 21, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-# GFX10: TBUFFER_STORE_FORMAT_X_OFFSET_exact %2, %13, 0, 36, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-# GFX10: TBUFFER_STORE_FORMAT_X_OFFSET_exact %1, %13, 0, 40, 21, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-# GFX10: TBUFFER_STORE_FORMAT_X_OFFSET_exact %0, %13, 0, 44, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
+# GFX10: TBUFFER_STORE_FORMAT_X_OFFSET_exact %8, %13, 0, 4, 22, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
+# GFX10: TBUFFER_STORE_FORMAT_X_OFFSET_exact %7, %13, 0, 8, 21, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
+# GFX10: TBUFFER_STORE_FORMAT_X_OFFSET_exact %6, %13, 0, 16, 22, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
+# GFX10: TBUFFER_STORE_FORMAT_X_OFFSET_exact %5, %13, 0, 20, 21, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
+# GFX10: TBUFFER_STORE_FORMAT_X_OFFSET_exact %4, %13, 0, 24, 22, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
+# GFX10: TBUFFER_STORE_FORMAT_X_OFFSET_exact %3, %13, 0, 28, 21, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
+# GFX10: TBUFFER_STORE_FORMAT_X_OFFSET_exact %2, %13, 0, 36, 22, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
+# GFX10: TBUFFER_STORE_FORMAT_X_OFFSET_exact %1, %13, 0, 40, 21, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
+# GFX10: TBUFFER_STORE_FORMAT_X_OFFSET_exact %0, %13, 0, 44, 22, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
 name: gfx10_tbuffer_store_not_merged_data_format_mismatch
 body:             |
   bb.0.entry:
@@ -1457,15 +1457,15 @@ body:             |
     %1:sgpr_32 = COPY $sgpr1
     %0:sgpr_32 = COPY $sgpr0
     %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %4:vgpr_32, %13:sgpr_128, 0, 4, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %5:vgpr_32, %13:sgpr_128, 0, 8, 21, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %6:vgpr_32, %13:sgpr_128, 0, 16, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 20, 21, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %8:vgpr_32, %13:sgpr_128, 0, 24, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %9:vgpr_32, %13:sgpr_128, 0, 28, 21, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %10:vgpr_32, %13:sgpr_128, 0, 36, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %11:vgpr_32, %13:sgpr_128, 0, 40, 21, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %12:vgpr_32, %13:sgpr_128, 0, 44, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %4:vgpr_32, %13:sgpr_128, 0, 4, 22, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %5:vgpr_32, %13:sgpr_128, 0, 8, 21, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %6:vgpr_32, %13:sgpr_128, 0, 16, 22, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 20, 21, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %8:vgpr_32, %13:sgpr_128, 0, 24, 22, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %9:vgpr_32, %13:sgpr_128, 0, 28, 21, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %10:vgpr_32, %13:sgpr_128, 0, 36, 22, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %11:vgpr_32, %13:sgpr_128, 0, 40, 21, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %12:vgpr_32, %13:sgpr_128, 0, 44, 22, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
 ...
 ---
 
@@ -1484,15 +1484,15 @@ body:             |
 # GFX10: %{{[0-9]+}}:sgpr_32 = COPY $sgpr1
 # GFX10: %{{[0-9]+}}:sgpr_32 = COPY $sgpr0
 # GFX10: %{{[0-9]+}}:sgpr_128 = REG_SEQUENCE %12, %subreg.sub0, %11, %subreg.sub1, %10, %subreg.sub2, %9, %subreg.sub3
-# GFX10: TBUFFER_STORE_FORMAT_X_OFFSET_exact %8, %13, 0, 4, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-# GFX10: TBUFFER_STORE_FORMAT_X_OFFSET_exact %7, %13, 0, 8, 13, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-# GFX10: TBUFFER_STORE_FORMAT_X_OFFSET_exact %6, %13, 0, 16, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-# GFX10: TBUFFER_STORE_FORMAT_X_OFFSET_exact %5, %13, 0, 20, 13, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-# GFX10: TBUFFER_STORE_FORMAT_X_OFFSET_exact %4, %13, 0, 24, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-# GFX10: TBUFFER_STORE_FORMAT_X_OFFSET_exact %3, %13, 0, 28, 13, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-# GFX10: TBUFFER_STORE_FORMAT_X_OFFSET_exact %2, %13, 0, 36, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-# GFX10: TBUFFER_STORE_FORMAT_X_OFFSET_exact %1, %13, 0, 40, 13, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-# GFX10: TBUFFER_STORE_FORMAT_X_OFFSET_exact %0, %13, 0, 44, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
+# GFX10: TBUFFER_STORE_FORMAT_X_OFFSET_exact %8, %13, 0, 4, 22, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
+# GFX10: TBUFFER_STORE_FORMAT_X_OFFSET_exact %7, %13, 0, 8, 13, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
+# GFX10: TBUFFER_STORE_FORMAT_X_OFFSET_exact %6, %13, 0, 16, 22, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
+# GFX10: TBUFFER_STORE_FORMAT_X_OFFSET_exact %5, %13, 0, 20, 13, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
+# GFX10: TBUFFER_STORE_FORMAT_X_OFFSET_exact %4, %13, 0, 24, 22, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
+# GFX10: TBUFFER_STORE_FORMAT_X_OFFSET_exact %3, %13, 0, 28, 13, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
+# GFX10: TBUFFER_STORE_FORMAT_X_OFFSET_exact %2, %13, 0, 36, 22, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
+# GFX10: TBUFFER_STORE_FORMAT_X_OFFSET_exact %1, %13, 0, 40, 13, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
+# GFX10: TBUFFER_STORE_FORMAT_X_OFFSET_exact %0, %13, 0, 44, 22, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
 name: gfx10_tbuffer_store_not_merged_num_format_mismatch
 body:             |
   bb.0.entry:
@@ -1511,22 +1511,22 @@ body:             |
     %1:sgpr_32 = COPY $sgpr1
     %0:sgpr_32 = COPY $sgpr0
     %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %4:vgpr_32, %13:sgpr_128, 0, 4, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %5:vgpr_32, %13:sgpr_128, 0, 8, 13, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %6:vgpr_32, %13:sgpr_128, 0, 16, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 20, 13, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %8:vgpr_32, %13:sgpr_128, 0, 24, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %9:vgpr_32, %13:sgpr_128, 0, 28, 13, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %10:vgpr_32, %13:sgpr_128, 0, 36, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %11:vgpr_32, %13:sgpr_128, 0, 40, 13, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %12:vgpr_32, %13:sgpr_128, 0, 44, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %4:vgpr_32, %13:sgpr_128, 0, 4, 22, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %5:vgpr_32, %13:sgpr_128, 0, 8, 13, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %6:vgpr_32, %13:sgpr_128, 0, 16, 22, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 20, 13, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %8:vgpr_32, %13:sgpr_128, 0, 24, 22, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %9:vgpr_32, %13:sgpr_128, 0, 28, 13, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %10:vgpr_32, %13:sgpr_128, 0, 36, 22, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %11:vgpr_32, %13:sgpr_128, 0, 40, 13, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %12:vgpr_32, %13:sgpr_128, 0, 44, 22, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
 ...
 ---
 
 
 # GFX10-LABEL: name: gfx10_tbuffer_load_not_merged_swizzled_0
-# GFX10: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 4, 22, 0, 0, 0, 0, 1, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
-# GFX10: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 8, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+# GFX10: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 4, 22, 0, 0, 0, 0, 1, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+# GFX10: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 8, 22, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
 name: gfx10_tbuffer_load_not_merged_swizzled_0
 body:             |
   bb.0.entry:
@@ -1535,15 +1535,15 @@ body:             |
     %2:sgpr_32 = COPY $sgpr2
     %3:sgpr_32 = COPY $sgpr3
     %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 22, 0, 0, 0, 0, 1, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
-    %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 8, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+    %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 22, 0, 0, 0, 0, 1, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+    %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 8, 22, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
 ...
 ---
 
 
 # GFX10-LABEL: name: gfx10_tbuffer_load_not_merged_swizzled_1
-# GFX10: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 4, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
-# GFX10: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 8, 22, 0, 0, 0, 0, 1, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+# GFX10: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 4, 22, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+# GFX10: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 8, 22, 0, 0, 0, 0, 1, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
 name: gfx10_tbuffer_load_not_merged_swizzled_1
 body:             |
   bb.0.entry:
@@ -1552,8 +1552,8 @@ body:             |
     %2:sgpr_32 = COPY $sgpr2
     %3:sgpr_32 = COPY $sgpr3
     %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
-    %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 8, 22, 0, 0, 0, 0, 1, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+    %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 22, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+    %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 8, 22, 0, 0, 0, 0, 1, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
 ...
 ---
 

diff  --git a/llvm/test/CodeGen/AMDGPU/mfma-loop.ll b/llvm/test/CodeGen/AMDGPU/mfma-loop.ll
index 63c4e5c49656..405d56f12249 100644
--- a/llvm/test/CodeGen/AMDGPU/mfma-loop.ll
+++ b/llvm/test/CodeGen/AMDGPU/mfma-loop.ll
@@ -1,20 +1,26 @@
-; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX908,GFX908_A %s
+; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX90A,GFX908_A %s
 
 ; GCN-LABEL: {{^}}test_mfma_loop_zeroinit:
 
-; GCN-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
+; GFX908-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
+; GFX90A:          v_accvgpr_write_b32 [[LEAD:a[0-9]+]], 0
+; GFX90A-COUNT-31: v_accvgpr_mov_b32 a{{[0-9]+}}, [[LEAD]]
 
 ; Check that we do not copy agprs to vgprs and back inside the loop.
 
 ; GCN: [[LOOP:BB[0-9_]+]]:
-; GCN-NOT: v_accvgpr
-; GCN: v_mfma_f32_32x32x1f32
-; GCN-NOT: v_accvgpr
-; GCN: s_cbranch_scc1 [[LOOP]]
+; GCN-NOT:  v_accvgpr
+; GFX908_A: v_mfma_f32_32x32x1f32
+; GCN-NOT:  v_accvgpr
+; GCN:      s_cbranch_scc1 [[LOOP]]
 
 ; Final result should be read only once after the loop.
 
-; GCN-COUNT-32: v_accvgpr_read_b32
+; GFX908-COUNT-32: v_accvgpr_read_b32
+; GFX90A-NOT:      v_accvgpr_read_b32
+; GFX908-COUNT-8:  global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}]
+; GFX90A-COUNT-8:  global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
 
 define amdgpu_kernel void @test_mfma_loop_zeroinit(<32 x float> addrspace(1)* %arg) {
 entry:
@@ -39,16 +45,21 @@ exit:
 ; 3 vgprs are needed to avoid wait states between writes.
 ; Check that we do not use 32 temp sgprs as well.
 
-; GCN:         v_mov_b32_e32 [[TMP:v[0-9]+]], 0x42f60000
-; GCN-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
+; GFX908_A:        v_mov_b32_e32 [[TMP:v[0-9]+]], 0x42f60000
+; GFX908-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
+; GFX90A:          v_accvgpr_write_b32 [[LEAD:a[0-9]+]], [[TMP]]
+; GFX90A-COUNT-31: v_accvgpr_mov_b32 a{{[0-9]+}}, [[LEAD]]
 
 ; GCN: [[LOOP:BB[0-9_]+]]:
-; GCN-NOT: v_accvgpr
-; GCN: v_mfma_f32_32x32x1f32
-; GCN-NOT: v_accvgpr
-; GCN: s_cbranch_scc1 [[LOOP]]
+; GCN-NOT:  v_accvgpr
+; GFX908_A: v_mfma_f32_32x32x1f32
+; GCN-NOT:  v_accvgpr
+; GCN:      s_cbranch_scc1 [[LOOP]]
 
-; GCN-COUNT-32: v_accvgpr_read_b32
+; GFX908-COUNT-32: v_accvgpr_read_b32
+; GFX90A-NOT:      v_accvgpr_read_b32
+; GFX908-COUNT-8:  global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}]
+; GFX90A-COUNT-8:  global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
 
 define amdgpu_kernel void @test_mfma_loop_unfoldable_splat(<32 x float> addrspace(1)* %arg) {
 entry:
@@ -69,17 +80,21 @@ exit:
 
 ; GCN-LABEL: {{^}}test_mfma_loop_non_splat:
 
-; GCN:         v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
-; GCN:         v_accvgpr_write_b32 a{{[0-9]+}}, 1.0{{$}}
-; GCN-COUNT-30: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
+; GCN:             v_accvgpr_write_b32 [[LEAD:a[0-9]+]], 0{{$}}
+; GCN:             v_accvgpr_write_b32 a{{[0-9]+}}, 1.0{{$}}
+; GFX908-COUNT-30: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
+; GFX90A-COUNT-30: v_accvgpr_mov_b32 a{{[0-9]+}}, [[LEAD]]{{$}}
 
 ; GCN: [[LOOP:BB[0-9_]+]]:
-; GCN-NOT: v_accvgpr
-; GCN: v_mfma_f32_32x32x1f32
-; GCN-NOT: v_accvgpr
-; GCN: s_cbranch_scc1 [[LOOP]]
+; GCN-NOT:  v_accvgpr
+; GFX908_A: v_mfma_f32_32x32x1f32
+; GCN-NOT:  v_accvgpr
+; GCN:      s_cbranch_scc1 [[LOOP]]
 
-; GCN-COUNT-32: v_accvgpr_read_b32
+; GFX908-COUNT-32: v_accvgpr_read_b32
+; GFX90A-NOT:      v_accvgpr_read_b32
+; GFX908-COUNT-8:  global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}]
+; GFX90A-COUNT-8:  global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
 
 define amdgpu_kernel void @test_mfma_loop_non_splat(<32 x float> addrspace(1)* %arg) {
 entry:
@@ -103,78 +118,147 @@ exit:
 ; Check that we do not use 32 temp vgprs, but rotate 3 vgprs only.
 ; 3 vgprs are needed to avoid wait states between writes.
 
-; GCN: v_mov_b32_e32 [[TMP1:v[0-9]+]], 0x42f60000
-; GCN: v_mov_b32_e32 [[TMP2:v[0-9]+]], 0x42f80000
-; GCN: v_mov_b32_e32 [[TMP3:v[0-9]+]], 0x42fe0000
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
-; GCN: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}}
-; GCN: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
-; GCN: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}}
-; GCN: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}}
-; GCN: v_mov_b32_e32 [[TMP3]], 0x4{{[0-9a-f]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
-; GCN: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}}
-; GCN: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}}
-; GCN: v_mov_b32_e32 [[TMP3]], 0x4{{[0-9a-f]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
-; GCN: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}}
-; GCN: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}}
-; GCN: v_mov_b32_e32 [[TMP3]], 0x4{{[0-9a-f]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
-; GCN: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}}
-; GCN: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}}
-; GCN: v_mov_b32_e32 [[TMP3]], 0x4{{[0-9a-f]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
-; GCN: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}}
-; GCN: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}}
-; GCN: v_mov_b32_e32 [[TMP3]], 0x4{{[0-9a-f]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
-; GCN: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}}
-; GCN: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}}
-; GCN: v_mov_b32_e32 [[TMP3]], 0x4{{[0-9a-f]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
-; GCN: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}}
-; GCN: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}}
-; GCN: v_mov_b32_e32 [[TMP3]], 0x4{{[0-9a-f]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
-; GCN: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}}
-; GCN: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}}
-; GCN: v_mov_b32_e32 [[TMP3]], 0x4{{[0-9a-f]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
-; GCN: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}}
-; GCN: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}}
-; GCN: v_mov_b32_e32 [[TMP3]], 0x4{{[0-9a-f]+}}
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
+; GFX908: v_mov_b32_e32 [[TMP1:v[0-9]+]], 0x42f60000
+; GFX908: v_mov_b32_e32 [[TMP2:v[0-9]+]], 0x42f80000
+; GFX908: v_mov_b32_e32 [[TMP3:v[0-9]+]], 0x42fe0000
+; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
+; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
+; GFX908: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}}
+; GFX908: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}}
+; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
+; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
+; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
+; GFX908: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}}
+; GFX908: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}}
+; GFX908: v_mov_b32_e32 [[TMP3]], 0x4{{[0-9a-f]+}}
+; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
+; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
+; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
+; GFX908: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}}
+; GFX908: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}}
+; GFX908: v_mov_b32_e32 [[TMP3]], 0x4{{[0-9a-f]+}}
+; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
+; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
+; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
+; GFX908: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}}
+; GFX908: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}}
+; GFX908: v_mov_b32_e32 [[TMP3]], 0x4{{[0-9a-f]+}}
+; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
+; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
+; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
+; GFX908: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}}
+; GFX908: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}}
+; GFX908: v_mov_b32_e32 [[TMP3]], 0x4{{[0-9a-f]+}}
+; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
+; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
+; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
+; GFX908: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}}
+; GFX908: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}}
+; GFX908: v_mov_b32_e32 [[TMP3]], 0x4{{[0-9a-f]+}}
+; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
+; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
+; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
+; GFX908: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}}
+; GFX908: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}}
+; GFX908: v_mov_b32_e32 [[TMP3]], 0x4{{[0-9a-f]+}}
+; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
+; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
+; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
+; GFX908: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}}
+; GFX908: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}}
+; GFX908: v_mov_b32_e32 [[TMP3]], 0x4{{[0-9a-f]+}}
+; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
+; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
+; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
+; GFX908: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}}
+; GFX908: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}}
+; GFX908: v_mov_b32_e32 [[TMP3]], 0x4{{[0-9a-f]+}}
+; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
+; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
+; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
+; GFX908: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}}
+; GFX908: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}}
+; GFX908: v_mov_b32_e32 [[TMP3]], 0x4{{[0-9a-f]+}}
+; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
+; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
+; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
+
+; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
+; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
+; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
+; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
+; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
+; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
+; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
+; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
+; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
+; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
+; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
+; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
+; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
+; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
+; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
+; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
+; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
+; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
+; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
+; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
+; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
+; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
+; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
+; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
+; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
+; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
+; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
+; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
+; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
+; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
+; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
+; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
+; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
+; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
+; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
+; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
+; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
+; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
+; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
+; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
+; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
+; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
+; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
+; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
+; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
+; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
+; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
+; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
+; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
+; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
+; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
+; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
+; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
+; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
+; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
+; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
+; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
+; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
+; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
+; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
+; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
+; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
+; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
+; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
+
 
 ; GCN: [[LOOP:BB[0-9_]+]]:
-; GCN-NOT: v_accvgpr
-; GCN: v_mfma_f32_32x32x1f32
-; GCN-NOT: v_accvgpr
-; GCN: s_cbranch_scc1 [[LOOP]]
+; GCN-NOT:  v_accvgpr
+; GFX908_A: v_mfma_f32_32x32x1f32
+; GCN-NOT:  v_accvgpr
+; GCN:      s_cbranch_scc1 [[LOOP]]
 
-; GCN-COUNT-32: v_accvgpr_read_b32
+; GFX908-COUNT-32: v_accvgpr_read_b32
+; GFX90A-NOT:      v_accvgpr_read_b32
+; GFX908-COUNT-8:  global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}]
+; GFX90A-COUNT-8:  global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
 
 define amdgpu_kernel void @test_mfma_loop_unfoldable_seq(<32 x float> addrspace(1)* %arg) {
 entry:
@@ -198,12 +282,15 @@ exit:
 ; GCN-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, v0{{$}}
 
 ; GCN: [[LOOP:BB[0-9_]+]]:
-; GCN-NOT: v_accvgpr
-; GCN: v_mfma_f32_32x32x1f32
-; GCN-NOT: v_accvgpr
-; GCN: s_cbranch_scc1 [[LOOP]]
+; GCN-NOT:  v_accvgpr
+; GFX908_A: v_mfma_f32_32x32x1f32
+; GCN-NOT:  v_accvgpr
+; GCN:      s_cbranch_scc1 [[LOOP]]
 
-; GCN-COUNT-32: v_accvgpr_read_b32
+; GFX908-COUNT-32: v_accvgpr_read_b32
+; GFX90A-NOT:      v_accvgpr_read_b32
+; GFX908-COUNT-8:  global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}]
+; GFX90A-COUNT-8:  global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
 
 define amdgpu_kernel void @test_mfma_loop_vgpr_init(<32 x float> addrspace(1)* %arg) {
 entry:
@@ -259,16 +346,21 @@ exit:
 
 ; GCN-LABEL: {{^}}test_mfma_loop_sgpr_init:
 
-; GCN:         v_mov_b32_e32 [[TMP:v[0-9]+]], s{{[0-9]+}}
-; GCN-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
+; GFX908_A:        v_mov_b32_e32 [[TMP:v[0-9]+]], s{{[0-9]+}}
+; GFX908-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
+; GFX90A:          v_accvgpr_write_b32 [[LEAD:a[0-9]+]], [[TMP]]
+; GFX90A-COUNT-31: v_accvgpr_mov_b32 a{{[0-9]+}}, [[LEAD]]
 
 ; GCN: [[LOOP:BB[0-9_]+]]:
-; GCN-NOT: v_accvgpr
-; GCN: v_mfma_f32_32x32x1f32
-; GCN-NOT: v_accvgpr
-; GCN: s_cbranch_scc1 [[LOOP]]
+; GCN-NOT:  v_accvgpr
+; GFX908_A: v_mfma_f32_32x32x1f32
+; GCN-NOT:  v_accvgpr
+; GCN:      s_cbranch_scc1 [[LOOP]]
 
-; GCN-COUNT-32: v_accvgpr_read_b32
+; GFX908-COUNT-32: v_accvgpr_read_b32
+; GFX90A-NOT:      v_accvgpr_read_b32
+; GFX908-COUNT-8:  global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}]
+; GFX90A-COUNT-8:  global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
 
 define amdgpu_kernel void @test_mfma_loop_sgpr_init(<32 x float> addrspace(1)* %arg, float %init) {
 entry:
@@ -322,47 +414,53 @@ exit:
 
 ; GCN-LABEL: {{^}}test_mfma_loop_mixed_init:
 
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v0
-; GCN-DAG: v_mov_b32_e32 [[TMP:v[0-9]+]], s{{[0-9]+}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
+; GCN-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, v0
+; GFX908_A-DAG: v_mov_b32_e32 [[TMP:v[0-9]+]], s{{[0-9]+}}
+; GCN-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
+; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
+; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
+; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
+; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
+; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
+; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
+; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
+; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
+; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
+; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
+; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
+; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
+; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
+; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
+; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
+; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
+; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
+; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
+; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
+; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
+; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
+; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
+; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
+; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
+; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
+; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
+; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
+; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
+; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
+; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
+
+; GFX90A-DAG:      v_accvgpr_write_b32 [[LEAD:a[0-9]+]], 0
+; GFX90A-COUNT-28: v_accvgpr_mov_b32 a{{[0-9]+}}, [[LEAD]]
 
 ; GCN: [[LOOP:BB[0-9_]+]]:
-; GCN-NOT: v_accvgpr
-; GCN: v_mfma_f32_32x32x1f32
-; GCN-NOT: v_accvgpr
-; GCN: s_cbranch_scc1 [[LOOP]]
+; GCN-NOT:  v_accvgpr
+; GFX908_A: v_mfma_f32_32x32x1f32
+; GCN-NOT:  v_accvgpr
+; GCN:      s_cbranch_scc1 [[LOOP]]
 
-; GCN-COUNT-32: v_accvgpr_read_b32
+; GFX908-COUNT-32: v_accvgpr_read_b32
+; GFX90A-NOT:      v_accvgpr_read_b32
+; GFX908-COUNT-8:  global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}]
+; GFX90A-COUNT-8:  global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
 
 define amdgpu_kernel void @test_mfma_loop_mixed_init(<32 x float> addrspace(1)* %arg, float %x) {
 entry:
@@ -388,17 +486,24 @@ exit:
 
 ; GCN-LABEL: {{^}}test_mfma_loop_mfma_forward_init:
 
-; GCN-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, 0
-; GCN: v_mfma_f32_32x32x1f32
-; GCN-NOT: v_accvgpr
+; GFX908-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, 0
+; GFX908:          v_mfma_f32_32x32x1f32 a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9]+}}, a[{{[0-9:]+}}]
+; GFX90A-NOT:      v_accvgpr
+; GFX90A:          v_mfma_f32_32x32x1f32 a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9]+}}, 0{{$}}
+; GFX90A-NOT:      v_accvgpr
+; GCN-NOT:         v_accvgpr
 
 ; GCN: [[LOOP:BB[0-9_]+]]:
-; GCN-NOT: v_accvgpr
-; GCN: v_mfma_f32_32x32x1f32
-; GCN-NOT: v_accvgpr
-; GCN: s_cbranch_scc1 [[LOOP]]
+; GCN-NOT:  v_accvgpr
+; GFX908_A: v_mfma_f32_32x32x1f32
+; GCN-NOT:  v_accvgpr
+; GCN:      s_cbranch_scc1 [[LOOP]]
+
+; GFX908-COUNT-32: v_accvgpr_read_b32
+; GFX90A-NOT:      v_accvgpr_read_b32
+; GFX908-COUNT-8:  global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}]
+; GFX90A-COUNT-8:  global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
 
-; GCN-COUNT-32: v_accvgpr_read_b32
 define amdgpu_kernel void @test_mfma_loop_mfma_forward_init(<32 x float> addrspace(1)* %arg) {
 entry:
   %mai.0 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> zeroinitializer, i32 0, i32 0, i32 0)
@@ -420,21 +525,30 @@ exit:
 
 ; GCN-LABEL: {{^}}test_mfma_loop_agpr_init:
 
-; GCN-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, 0
-; GCN: v_mfma_f32_32x32x1f32
+; GFX908-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, 0
+; GFX908:          v_mfma_f32_32x32x1f32 a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9]+}}, a[{{[0-9:]+}}]
+; GFX90A-NOT:      v_accvgpr
+; GFX90A:          v_mfma_f32_32x32x1f32 a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9]+}}, 0{{$}}
+; GFX90A-NOT:      v_accvgpr
 
 ; Check that we are using only one tmp VGPR.
 
-; GCN: v_accvgpr_read_b32 [[TMP:v[0-9]+]], a{{[0-9]+}}
-; GCN-COUNT-31: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]{{$}}
+; GCN:             v_accvgpr_read_b32 [[TMP:v[0-9]+]], a{{[0-9]+}}
+; GFX908-COUNT-31: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]{{$}}
+; GFX90A:          v_accvgpr_write_b32 [[LEAD:a[0-9]+]], [[TMP]]{{$}}
+; GFX90A-COUNT-29: v_accvgpr_mov_b32 a{{[0-9]+}}, [[LEAD]]
 
 ; GCN: [[LOOP:BB[0-9_]+]]:
-; GCN-NOT: v_accvgpr
-; GCN: v_mfma_f32_32x32x1f32
-; GCN-NOT: v_accvgpr
-; GCN: s_cbranch_scc1 [[LOOP]]
+; GCN-NOT:  v_accvgpr
+; GFX908_A: v_mfma_f32_32x32x1f32
+; GCN-NOT:  v_accvgpr
+; GCN:      s_cbranch_scc1 [[LOOP]]
+
+; GFX908-COUNT-32: v_accvgpr_read_b32
+; GFX90A-NOT:      v_accvgpr_read_b32
+; GFX908-COUNT-8:  global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}]
+; GFX90A-COUNT-8:  global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
 
-; GCN-COUNT-32: v_accvgpr_read_b32
 define amdgpu_kernel void @test_mfma_loop_agpr_init(<32 x float> addrspace(1)* %arg) {
 entry:
   %mai.0 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> zeroinitializer, i32 0, i32 0, i32 0)
@@ -489,23 +603,28 @@ exit:
 
 ; GCN-LABEL: {{^}}test_mfma_nested_loop_zeroinit:
 
-; GCN-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
+; GFX908-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
+; GFX90A:          v_accvgpr_write_b32 [[LEAD:a[0-9]+]], 0
+; GFX90A-COUNT-31: v_accvgpr_mov_b32 a{{[0-9]+}}, [[LEAD]]
 
 ; Check that we do not copy agprs to vgprs and back in an outer loop.
 
 ; GCN: [[OUTER_LOOP:BB[0-9_]+]]:
-; GCN-NOT: v_accvgpr
+; GCN-NOT:  v_accvgpr
 ; GCN: [[INNER_LOOP:BB[0-9_]+]]:
-; GCN-NOT: v_accvgpr
-; GCN: v_mfma_f32_32x32x1f32
-; GCN-NOT: v_accvgpr
-; GCN: s_cbranch_scc1 [[INNER_LOOP]]
-; GCN-NOT: v_accvgpr
-; GCN: s_cbranch_scc1 [[OUTER_LOOP]]
+; GCN-NOT:  v_accvgpr
+; GFX908_A: v_mfma_f32_32x32x1f32
+; GCN-NOT:  v_accvgpr
+; GCN:      s_cbranch_scc1 [[INNER_LOOP]]
+; GCN-NOT:  v_accvgpr
+; GCN:      s_cbranch_scc1 [[OUTER_LOOP]]
 
 ; Final result should be read only once after the loop.
 
-; GCN-COUNT-32: v_accvgpr_read_b32
+; GFX908-COUNT-32: v_accvgpr_read_b32
+; GFX90A-NOT:      v_accvgpr_read_b32
+; GFX908-COUNT-8:  global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}]
+; GFX90A-COUNT-8:  global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
 
 define amdgpu_kernel void @test_mfma_nested_loop_zeroinit(<32 x float> addrspace(1)* %arg) {
 entry:

diff  --git a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.mir b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.mir
index c59f89436fb8..cd0e6231e4b1 100644
--- a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.mir
+++ b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.mir
@@ -28,7 +28,7 @@
 # W64: [[CMP:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[CMP0]], [[CMP1]], implicit-def $scc
 # W64: [[SRSRC:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[SRSRC0]], %subreg.sub0, [[SRSRC1]], %subreg.sub1, [[SRSRC2]], %subreg.sub2, [[SRSRC3]], %subreg.sub3
 # W64: [[TMPEXEC:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[CMP]], implicit-def $exec, implicit-def $scc, implicit $exec
-# W64: {{[0-9]+}}:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN %4, killed [[SRSRC]], 0, 0, 0, 0, 0, 0, 0, implicit $exec
+# W64: {{[0-9]+}}:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN %4, killed [[SRSRC]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
 # W64: $exec = S_XOR_B64_term $exec, [[TMPEXEC]], implicit-def $scc
 # W64: S_CBRANCH_EXECNZ %bb.1, implicit $exec
 # W64-LABEL:  bb.2:
@@ -52,7 +52,7 @@
 # W32: [[CMP:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[CMP0]], [[CMP1]], implicit-def $scc
 # W32: [[SRSRC:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[SRSRC0]], %subreg.sub0, [[SRSRC1]], %subreg.sub1, [[SRSRC2]], %subreg.sub2, [[SRSRC3]], %subreg.sub3
 # W32: [[TMPEXEC:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[CMP]], implicit-def $exec, implicit-def $scc, implicit $exec
-# W32: {{[0-9]+}}:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN %4, killed [[SRSRC]], 0, 0, 0, 0, 0, 0, 0, implicit $exec
+# W32: {{[0-9]+}}:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN %4, killed [[SRSRC]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
 # TODO: S_XOR_B32_term should be `implicit-def $scc`
 # W32: $exec_lo = S_XOR_B32_term $exec_lo, [[TMPEXEC]]
 # W32: S_CBRANCH_EXECNZ %bb.1, implicit $exec
@@ -77,7 +77,7 @@ body:             |
     %1:vgpr_32 = COPY $vgpr1
     %0:vgpr_32 = COPY $vgpr0
     %6:sgpr_128 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1, %2, %subreg.sub2, %3, %subreg.sub3
-    %7:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN %4, killed %6, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    %7:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN %4, killed %6, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     $sgpr30_sgpr31 = COPY %5
     $vgpr0 = COPY %7
     S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
@@ -101,7 +101,7 @@ body:             |
 # W64: [[CMP:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[CMP0]], [[CMP1]], implicit-def $scc
 # W64: [[SRSRC:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[SRSRC0]], %subreg.sub0, [[SRSRC1]], %subreg.sub1, [[SRSRC2]], %subreg.sub2, [[SRSRC3]], %subreg.sub3
 # W64: [[TMPEXEC:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[CMP]], implicit-def $exec, implicit-def $scc, implicit $exec
-# W64: {{[0-9]+}}:vgpr_32 = BUFFER_LOAD_FORMAT_X_OFFEN %4, killed [[SRSRC]], 0, 0, 0, 0, 0, 0, 0, implicit $exec
+# W64: {{[0-9]+}}:vgpr_32 = BUFFER_LOAD_FORMAT_X_OFFEN %4, killed [[SRSRC]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
 # W64: $exec = S_XOR_B64_term $exec, [[TMPEXEC]], implicit-def $scc
 # W64: S_CBRANCH_EXECNZ %bb.1, implicit $exec
 # W64-LABEL:  bb.2:
@@ -125,7 +125,7 @@ body:             |
 # W32: [[CMP:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[CMP0]], [[CMP1]], implicit-def $scc
 # W32: [[SRSRC:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[SRSRC0]], %subreg.sub0, [[SRSRC1]], %subreg.sub1, [[SRSRC2]], %subreg.sub2, [[SRSRC3]], %subreg.sub3
 # W32: [[TMPEXEC:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[CMP]], implicit-def $exec, implicit-def $scc, implicit $exec
-# W32: {{[0-9]+}}:vgpr_32 = BUFFER_LOAD_FORMAT_X_OFFEN %4, killed [[SRSRC]], 0, 0, 0, 0, 0, 0, 0, implicit $exec
+# W32: {{[0-9]+}}:vgpr_32 = BUFFER_LOAD_FORMAT_X_OFFEN %4, killed [[SRSRC]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
 # TODO: S_XOR_B32_term should be `implicit-def $scc`
 # W32: $exec_lo = S_XOR_B32_term $exec_lo, [[TMPEXEC]]
 # W32: S_CBRANCH_EXECNZ %bb.1, implicit $exec
@@ -150,7 +150,7 @@ body:             |
     %1:vgpr_32 = COPY $vgpr1
     %0:vgpr_32 = COPY $vgpr0
     %6:sgpr_128 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1, %2, %subreg.sub2, %3, %subreg.sub3
-    %7:vgpr_32 = BUFFER_LOAD_FORMAT_X_OFFEN %4, killed %6, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    %7:vgpr_32 = BUFFER_LOAD_FORMAT_X_OFFEN %4, killed %6, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     $sgpr30_sgpr31 = COPY %5
     $vgpr0 = COPY %7
     S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
@@ -174,7 +174,7 @@ body:             |
 # W64: [[CMP:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[CMP0]], [[CMP1]], implicit-def $scc
 # W64: [[SRSRC:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[SRSRC0]], %subreg.sub0, [[SRSRC1]], %subreg.sub1, [[SRSRC2]], %subreg.sub2, [[SRSRC3]], %subreg.sub3
 # W64: [[TMPEXEC:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[CMP]], implicit-def $exec, implicit-def $scc, implicit $exec
-# W64: {{[0-9]+}}:vgpr_32 = BUFFER_LOAD_FORMAT_X_BOTHEN %4, killed [[SRSRC]], 0, 0, 0, 0, 0, 0, 0, implicit $exec
+# W64: {{[0-9]+}}:vgpr_32 = BUFFER_LOAD_FORMAT_X_BOTHEN %4, killed [[SRSRC]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
 # W64: $exec = S_XOR_B64_term $exec, [[TMPEXEC]], implicit-def $scc
 # W64: S_CBRANCH_EXECNZ %bb.1, implicit $exec
 # W64-LABEL:  bb.2:
@@ -198,7 +198,7 @@ body:             |
 # W32: [[CMP:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[CMP0]], [[CMP1]], implicit-def $scc
 # W32: [[SRSRC:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[SRSRC0]], %subreg.sub0, [[SRSRC1]], %subreg.sub1, [[SRSRC2]], %subreg.sub2, [[SRSRC3]], %subreg.sub3
 # W32: [[TMPEXEC:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[CMP]], implicit-def $exec, implicit-def $scc, implicit $exec
-# W32: {{[0-9]+}}:vgpr_32 = BUFFER_LOAD_FORMAT_X_BOTHEN %4, killed [[SRSRC]], 0, 0, 0, 0, 0, 0, 0, implicit $exec
+# W32: {{[0-9]+}}:vgpr_32 = BUFFER_LOAD_FORMAT_X_BOTHEN %4, killed [[SRSRC]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
 # TODO: S_XOR_B32_term should be `implicit-def $scc`
 # W32: $exec_lo = S_XOR_B32_term $exec_lo, [[TMPEXEC]]
 # W32: S_CBRANCH_EXECNZ %bb.1, implicit $exec
@@ -223,7 +223,7 @@ body:             |
     %1:vgpr_32 = COPY $vgpr1
     %0:vgpr_32 = COPY $vgpr0
     %6:sgpr_128 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1, %2, %subreg.sub2, %3, %subreg.sub3
-    %7:vgpr_32 = BUFFER_LOAD_FORMAT_X_BOTHEN %4, killed %6, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    %7:vgpr_32 = BUFFER_LOAD_FORMAT_X_BOTHEN %4, killed %6, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     $sgpr30_sgpr31 = COPY %5
     $vgpr0 = COPY %7
     S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
@@ -239,7 +239,7 @@ body:             |
 # ADDR64: %9:vgpr_32, %12:sreg_64_xexec = V_ADD_CO_U32_e64 %14.sub0, %4.sub0, 0, implicit $exec
 # ADDR64: %10:vgpr_32, dead %13:sreg_64_xexec = V_ADDC_U32_e64 %14.sub1, %4.sub1, killed %12, 0, implicit $exec
 # ADDR64: %11:vreg_64 = REG_SEQUENCE %9, %subreg.sub0, %10, %subreg.sub1
-# ADDR64: {{[0-9]+}}:vgpr_32 = BUFFER_LOAD_FORMAT_X_ADDR64 %11, killed %18, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+# ADDR64: {{[0-9]+}}:vgpr_32 = BUFFER_LOAD_FORMAT_X_ADDR64 %11, killed %18, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
 ---
 name:            addr64
 liveins:
@@ -259,7 +259,7 @@ body:             |
     %1:vgpr_32 = COPY $vgpr1
     %0:vgpr_32 = COPY $vgpr0
     %6:sgpr_128 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1, %2, %subreg.sub2, %3, %subreg.sub3
-    %7:vgpr_32 = BUFFER_LOAD_FORMAT_X_ADDR64 %4, killed %6, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    %7:vgpr_32 = BUFFER_LOAD_FORMAT_X_ADDR64 %4, killed %6, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     $sgpr30_sgpr31 = COPY %5
     $vgpr0 = COPY %7
     S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
@@ -284,7 +284,7 @@ body:             |
 # W64-NO-ADDR64: [[CMP:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[CMP0]], [[CMP1]], implicit-def $scc
 # W64-NO-ADDR64: [[SRSRC:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[SRSRC0]], %subreg.sub0, [[SRSRC1]], %subreg.sub1, [[SRSRC2]], %subreg.sub2, [[SRSRC3]], %subreg.sub3
 # W64-NO-ADDR64: [[TMPEXEC:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[CMP]], implicit-def $exec, implicit-def $scc, implicit $exec
-# W64-NO-ADDR64: {{[0-9]+}}:vgpr_32 = BUFFER_LOAD_FORMAT_X_OFFSET killed [[SRSRC]], 0, 0, 0, 0, 0, 0, 0, implicit $exec
+# W64-NO-ADDR64: {{[0-9]+}}:vgpr_32 = BUFFER_LOAD_FORMAT_X_OFFSET killed [[SRSRC]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
 # W64-NO-ADDR64: $exec = S_XOR_B64_term $exec, [[TMPEXEC]], implicit-def $scc
 # W64-NO-ADDR64: S_CBRANCH_EXECNZ %bb.1, implicit $exec
 # W64-NO-ADDR64-LABEL:  bb.2:
@@ -306,7 +306,7 @@ body:             |
 # W32: [[CMP:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[CMP0]], [[CMP1]], implicit-def $scc
 # W32: [[SRSRC:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[SRSRC0]], %subreg.sub0, [[SRSRC1]], %subreg.sub1, [[SRSRC2]], %subreg.sub2, [[SRSRC3]], %subreg.sub3
 # W32: [[TMPEXEC:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[CMP]], implicit-def $exec, implicit-def $scc, implicit $exec
-# W32: {{[0-9]+}}:vgpr_32 = BUFFER_LOAD_FORMAT_X_OFFSET killed [[SRSRC]], 0, 0, 0, 0, 0, 0, 0, implicit $exec
+# W32: {{[0-9]+}}:vgpr_32 = BUFFER_LOAD_FORMAT_X_OFFSET killed [[SRSRC]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
 # TODO: S_XOR_B32_term should be `implicit-def $scc`
 # W32: $exec_lo = S_XOR_B32_term $exec_lo, [[TMPEXEC]]
 # W32: S_CBRANCH_EXECNZ %bb.1, implicit $exec
@@ -320,7 +320,7 @@ body:             |
 # ADDR64: [[RSRCFMTHI:%[0-9]+]]:sgpr_32 = S_MOV_B32 61440
 # ADDR64: [[ZERORSRC:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[ZERO64]], %subreg.sub0_sub1, [[RSRCFMTLO]], %subreg.sub2, [[RSRCFMTHI]], %subreg.sub3
 # ADDR64: [[VADDR64:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[RSRCPTR]].sub0, %subreg.sub0, [[RSRCPTR]].sub1, %subreg.sub1
-# ADDR64: {{[0-9]+}}:vgpr_32 = BUFFER_LOAD_FORMAT_X_ADDR64 [[VADDR64]], [[ZERORSRC]], 0, 0, 0, 0, 0, 0, 0, implicit $exec
+# ADDR64: {{[0-9]+}}:vgpr_32 = BUFFER_LOAD_FORMAT_X_ADDR64 [[VADDR64]], [[ZERORSRC]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
 
 ---
 name:            offset
@@ -341,7 +341,7 @@ body:             |
     %1:vgpr_32 = COPY $vgpr1
     %0:vgpr_32 = COPY $vgpr0
     %6:sgpr_128 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1, %2, %subreg.sub2, %3, %subreg.sub3
-    %7:vgpr_32 = BUFFER_LOAD_FORMAT_X_OFFSET killed %6, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    %7:vgpr_32 = BUFFER_LOAD_FORMAT_X_OFFSET killed %6, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     $sgpr30_sgpr31 = COPY %5
     $vgpr0 = COPY %7
     S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0

diff  --git a/llvm/test/CodeGen/AMDGPU/nsa-vmem-hazard.mir b/llvm/test/CodeGen/AMDGPU/nsa-vmem-hazard.mir
index 93873a173569..b3aaffbfed10 100644
--- a/llvm/test/CodeGen/AMDGPU/nsa-vmem-hazard.mir
+++ b/llvm/test/CodeGen/AMDGPU/nsa-vmem-hazard.mir
@@ -9,7 +9,7 @@ name:            hazard_image_sample_d_buf_off6
 body:            |
   bb.0:
     $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_SAMPLE_D_V4_V9_nsa_gfx10 undef $vgpr3, undef $vgpr8, undef $vgpr7, undef $vgpr5, undef $vgpr4, undef $vgpr6, undef $vgpr0, undef $vgpr2, undef $vgpr2, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, undef $sgpr8_sgpr9_sgpr10_sgpr11, 15, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec  :: (load 16)
-    $vgpr1 = BUFFER_LOAD_DWORD_OFFSET undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 6, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr1 = BUFFER_LOAD_DWORD_OFFSET undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 6, 0, 0, 0, 0, 0, 0, implicit $exec
 ...
 
 # GCN-LABEL: name: no_hazard_image_sample_d_buf_off1
@@ -20,7 +20,7 @@ name:            no_hazard_image_sample_d_buf_off1
 body:            |
   bb.0:
     $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_SAMPLE_D_V4_V9_nsa_gfx10 undef $vgpr3, undef $vgpr8, undef $vgpr7, undef $vgpr5, undef $vgpr4, undef $vgpr6, undef $vgpr0, undef $vgpr2, undef $vgpr2, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, undef $sgpr8_sgpr9_sgpr10_sgpr11, 15, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec  :: (load 16)
-    $vgpr1 = BUFFER_LOAD_DWORD_OFFSET undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 1, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr1 = BUFFER_LOAD_DWORD_OFFSET undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 1, 0, 0, 0, 0, 0, 0, implicit $exec
 ...
 
 # GCN-LABEL: name: no_hazard_image_sample_d_buf_far
@@ -33,7 +33,7 @@ body:            |
   bb.0:
     $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_SAMPLE_D_V4_V9_nsa_gfx10 undef $vgpr3, undef $vgpr8, undef $vgpr7, undef $vgpr5, undef $vgpr4, undef $vgpr6, undef $vgpr0, undef $vgpr2, undef $vgpr2, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, undef $sgpr8_sgpr9_sgpr10_sgpr11, 15, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec  :: (load 16)
     V_NOP_e32 implicit $exec
-    $vgpr1 = BUFFER_LOAD_DWORD_OFFSET undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 6, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr1 = BUFFER_LOAD_DWORD_OFFSET undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 6, 0, 0, 0, 0, 0, 0, implicit $exec
 ...
 
 # Non-NSA
@@ -45,7 +45,7 @@ name:            no_hazard_image_sample_v4_v2_buf_off6
 body:            |
   bb.0:
     $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_SAMPLE_V4_V2_gfx10 undef $vgpr1_vgpr2, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, undef $sgpr8_sgpr9_sgpr10_sgpr11, 15, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec  :: (load 16)
-    $vgpr1 = BUFFER_LOAD_DWORD_OFFSET undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 6, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr1 = BUFFER_LOAD_DWORD_OFFSET undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 6, 0, 0, 0, 0, 0, 0, implicit $exec
 ...
 
 # Less than 4 dwords
@@ -57,5 +57,5 @@ name:            no_hazard_image_sample_v4_v3_buf_off6
 body:            |
   bb.0:
     $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_SAMPLE_V4_V3_nsa_gfx10 undef $vgpr1, undef $vgpr2, undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, undef $sgpr8_sgpr9_sgpr10_sgpr11, 15, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 16)
-    $vgpr1 = BUFFER_LOAD_DWORD_OFFSET undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 6, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr1 = BUFFER_LOAD_DWORD_OFFSET undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 6, 0, 0, 0, 0, 0, 0, implicit $exec
 ...

diff  --git a/llvm/test/CodeGen/AMDGPU/opt-sgpr-to-vgpr-copy.mir b/llvm/test/CodeGen/AMDGPU/opt-sgpr-to-vgpr-copy.mir
index 0e7708210a90..44901a1ed9bb 100644
--- a/llvm/test/CodeGen/AMDGPU/opt-sgpr-to-vgpr-copy.mir
+++ b/llvm/test/CodeGen/AMDGPU/opt-sgpr-to-vgpr-copy.mir
@@ -137,7 +137,7 @@ body:             |
     %28 = REG_SEQUENCE %6, 17, killed %27, 18
     %29 = V_MOV_B32_e32 0, implicit $exec
     %30 = COPY %24
-    BUFFER_STORE_DWORD_ADDR64 killed %29, killed %30, killed %28, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    BUFFER_STORE_DWORD_ADDR64 killed %29, killed %30, killed %28, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
 
   bb.2.bb2:
     SI_END_CF %1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
@@ -243,7 +243,7 @@ body:             |
     %37 = REG_SEQUENCE %6, 17, killed %36, 18
     %38 = V_MOV_B32_e32 0, implicit $exec
     %39 = COPY %33
-    BUFFER_STORE_DWORD_ADDR64 killed %38, killed %39, killed %37, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    BUFFER_STORE_DWORD_ADDR64 killed %38, killed %39, killed %37, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
 
   bb.2.bb2:
     SI_END_CF %1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
@@ -332,7 +332,7 @@ body:             |
     %28 = REG_SEQUENCE %6, 17, killed %27, 18
     %29 = V_MOV_B32_e32 0, implicit $exec
     %30 = COPY %24
-    BUFFER_STORE_DWORD_ADDR64 killed %29, killed %30, killed %28, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    BUFFER_STORE_DWORD_ADDR64 killed %29, killed %30, killed %28, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
 
   bb.2.bb2:
     SI_END_CF %1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec

diff  --git a/llvm/test/CodeGen/AMDGPU/optimize-if-exec-masking.mir b/llvm/test/CodeGen/AMDGPU/optimize-if-exec-masking.mir
index 53ca546969b9..9cbbabc361ab 100644
--- a/llvm/test/CodeGen/AMDGPU/optimize-if-exec-masking.mir
+++ b/llvm/test/CodeGen/AMDGPU/optimize-if-exec-masking.mir
@@ -151,7 +151,7 @@ body:             |
 
     $sgpr7 = S_MOV_B32 61440
     $sgpr6 = S_MOV_B32 -1
-    $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
 
   bb.2.end:
     liveins: $vgpr0, $sgpr0_sgpr1
@@ -159,7 +159,7 @@ body:             |
     $exec = S_OR_B64 $exec, killed $sgpr0_sgpr1, implicit-def $scc
     $sgpr3 = S_MOV_B32 61440
     $sgpr2 = S_MOV_B32 -1
-    BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     S_ENDPGM 0
 
 ...
@@ -188,7 +188,7 @@ body:             |
 
     $sgpr7 = S_MOV_B32 61440
     $sgpr6 = S_MOV_B32 -1
-    $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
 
   bb.2.end:
     liveins: $vgpr0, $sgpr0_sgpr1
@@ -196,7 +196,7 @@ body:             |
     $exec = S_OR_B64 $exec, killed $sgpr0_sgpr1, implicit-def $scc
     $sgpr3 = S_MOV_B32 61440
     $sgpr2 = S_MOV_B32 -1
-    BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     S_ENDPGM 0
 
 ...
@@ -225,7 +225,7 @@ body:             |
 
     $sgpr7 = S_MOV_B32 61440
     $sgpr6 = S_MOV_B32 -1
-    $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
 
   bb.2.end:
     liveins: $vgpr0, $sgpr0_sgpr1
@@ -233,14 +233,14 @@ body:             |
     $exec = S_OR_B64 $exec, killed $sgpr0_sgpr1, implicit-def $scc
     $sgpr3 = S_MOV_B32 61440
     $sgpr2 = S_MOV_B32 -1
-    BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     S_ENDPGM 0
 
 ...
 ---
 # CHECK-LABEL: name: optimize_if_and_saveexec_xor_valu_middle
 # CHECK: $sgpr2_sgpr3 = S_AND_B64 $sgpr0_sgpr1, killed $vcc, implicit-def $scc
-# CHECK-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+# CHECK-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
 # CHECK-NEXT: $sgpr0_sgpr1 = S_XOR_B64 $sgpr2_sgpr3, killed $sgpr0_sgpr1, implicit-def $scc
 # CHECK-NEXT: $exec = COPY killed $sgpr2_sgpr3
 # CHECK-NEXT: SI_MASK_BRANCH
@@ -255,7 +255,7 @@ body:             |
     $vcc = V_CMP_EQ_I32_e64 0, killed $vgpr0, implicit $exec
     $vgpr0 = V_MOV_B32_e32 4, implicit $exec
     $sgpr2_sgpr3 = S_AND_B64 $sgpr0_sgpr1, killed $vcc, implicit-def $scc
-    BUFFER_STORE_DWORD_OFFSET $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    BUFFER_STORE_DWORD_OFFSET $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     $sgpr0_sgpr1 = S_XOR_B64 $sgpr2_sgpr3, killed $sgpr0_sgpr1, implicit-def $scc
     $exec = S_MOV_B64_term killed $sgpr2_sgpr3
     SI_MASK_BRANCH %bb.2, implicit $exec
@@ -266,7 +266,7 @@ body:             |
 
     $sgpr7 = S_MOV_B32 61440
     $sgpr6 = S_MOV_B32 -1
-    $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
 
   bb.2.end:
     liveins: $vgpr0, $sgpr0_sgpr1
@@ -274,7 +274,7 @@ body:             |
     $exec = S_OR_B64 $exec, killed $sgpr0_sgpr1, implicit-def $scc
     $sgpr3 = S_MOV_B32 61440
     $sgpr2 = S_MOV_B32 -1
-    BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     S_ENDPGM 0
 
 ...
@@ -304,7 +304,7 @@ body:             |
 
   bb.1.if:
     liveins: $sgpr0_sgpr1 , $sgpr4_sgpr5_sgpr6_sgpr7
-    $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
 
   bb.2.end:
     liveins: $vgpr0, $sgpr0_sgpr1, $sgpr4_sgpr5_sgpr6_sgpr7
@@ -312,7 +312,7 @@ body:             |
     $exec = S_OR_B64 $exec, killed $sgpr0_sgpr1, implicit-def $scc
     $sgpr3 = S_MOV_B32 61440
     $sgpr2 = S_MOV_B32 -1
-    BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     S_ENDPGM 0
 
 ...
@@ -346,7 +346,7 @@ body:             |
 
     $sgpr7 = S_MOV_B32 61440
     $sgpr6 = S_MOV_B32 -1
-    $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
 
   bb.2.end:
     liveins: $vgpr0, $sgpr0_sgpr1
@@ -356,7 +356,7 @@ body:             |
     $sgpr1 = S_MOV_B32 1
     $sgpr2 = S_MOV_B32 -1
     $sgpr3 = S_MOV_B32 61440
-    BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     S_ENDPGM 0
 
 ...
@@ -387,7 +387,7 @@ body:             |
     S_SLEEP 0, implicit $sgpr2_sgpr3
     $sgpr7 = S_MOV_B32 61440
     $sgpr6 = S_MOV_B32 -1
-    $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
 
   bb.2.end:
     liveins: $vgpr0, $sgpr0_sgpr1
@@ -395,7 +395,7 @@ body:             |
     $exec = S_OR_B64 $exec, killed $sgpr0_sgpr1, implicit-def $scc
     $sgpr3 = S_MOV_B32 61440
     $sgpr2 = S_MOV_B32 -1
-    BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     S_ENDPGM 0
 
 ...
@@ -426,7 +426,7 @@ body:             |
 
     $sgpr7 = S_MOV_B32 61440
     $sgpr6 = S_MOV_B32 -1
-    $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
 
   bb.2.end:
     liveins: $vgpr0, $sgpr0_sgpr1
@@ -434,7 +434,7 @@ body:             |
     $exec = S_OR_B64 $exec, killed $sgpr0_sgpr1, implicit-def $scc
     $sgpr3 = S_MOV_B32 61440
     $sgpr2 = S_MOV_B32 -1
-    BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     S_ENDPGM 0
 
 ...
@@ -463,7 +463,7 @@ body:             |
 
     $sgpr7 = S_MOV_B32 61440
     $sgpr6 = S_MOV_B32 -1
-    $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
 
   bb.2.end:
     liveins: $vgpr0, $sgpr0_sgpr1
@@ -471,7 +471,7 @@ body:             |
     $exec = S_OR_B64 $exec, killed $sgpr0_sgpr1, implicit-def $scc
     $sgpr3 = S_MOV_B32 61440
     $sgpr2 = S_MOV_B32 -1
-    BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     S_ENDPGM 0
 
 ...
@@ -500,7 +500,7 @@ body:             |
 
     $sgpr7 = S_MOV_B32 61440
     $sgpr6 = S_MOV_B32 -1
-    $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
 
   bb.2.end:
     liveins: $vgpr0, $sgpr0_sgpr1
@@ -508,7 +508,7 @@ body:             |
     $exec = S_OR_B64 $exec, killed $sgpr0_sgpr1, implicit-def $scc
     $sgpr3 = S_MOV_B32 61440
     $sgpr2 = S_MOV_B32 -1
-    BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     S_ENDPGM 0
 
 ...
@@ -539,7 +539,7 @@ body:             |
 
     $sgpr7 = S_MOV_B32 61440
     $sgpr6 = S_MOV_B32 -1
-    $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
 
   bb.2.end:
     liveins: $vgpr0, $sgpr0_sgpr1
@@ -547,6 +547,6 @@ body:             |
     $exec = S_OR_B64 $exec, killed $sgpr0_sgpr1, implicit-def $scc
     $sgpr3 = S_MOV_B32 61440
     $sgpr2 = S_MOV_B32 -1
-    BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     S_ENDPGM 0
 ...

diff  --git a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
new file mode 100644
index 000000000000..14d2feccea61
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
@@ -0,0 +1,580 @@
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX900 %s
+; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX90A %s
+
+; GCN-LABEL: {{^}}fadd_v2_vv:
+; GFX900-COUNT-2: v_add_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; GFX90A:         v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}]
+define amdgpu_kernel void @fadd_v2_vv(<2 x float> addrspace(1)* %a) {
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i32 %id
+  %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8
+  %add = fadd <2 x float> %load, %load
+  store <2 x float> %add, <2 x float> addrspace(1)* %gep, align 8
+  ret void
+}
+
+; GCN-LABEL: {{^}}fadd_v2_vs:
+; GFX900-COUNT-2: v_add_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
+; GFX90A:         v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}}
+define amdgpu_kernel void @fadd_v2_vs(<2 x float> addrspace(1)* %a, <2 x float> %x) {
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i32 %id
+  %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8
+  %add = fadd <2 x float> %load, %x
+  store <2 x float> %add, <2 x float> addrspace(1)* %gep, align 8
+  ret void
+}
+
+; GCN-LABEL: {{^}}fadd_v4_vs:
+; GFX900-COUNT-4: v_add_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
+; GFX90A-COUNT-2: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}}
+define amdgpu_kernel void @fadd_v4_vs(<4 x float> addrspace(1)* %a, <4 x float> %x) {
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %a, i32 %id
+  %load = load <4 x float>, <4 x float> addrspace(1)* %gep, align 16
+  %add = fadd <4 x float> %load, %x
+  store <4 x float> %add, <4 x float> addrspace(1)* %gep, align 16
+  ret void
+}
+
+; GCN-LABEL: {{^}}fadd_v32_vs:
+; GFX900-COUNT-32: v_add_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
+; GFX90A-COUNT-16: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}}
+define amdgpu_kernel void @fadd_v32_vs(<32 x float> addrspace(1)* %a, <32 x float> %x) {
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %a, i32 %id
+  %load = load <32 x float>, <32 x float> addrspace(1)* %gep, align 128
+  %add = fadd <32 x float> %load, %x
+  store <32 x float> %add, <32 x float> addrspace(1)* %gep, align 128
+  ret void
+}
+
+; GCN-LABEL: {{^}}fadd_v2_v_imm:
+; GCN:            s_mov_b32 s[[K:[0-9]+]], 0x42c80000
+; GFX900-COUNT-2: v_add_f32_e32 v{{[0-9]+}}, s[[K]], v{{[0-9]+}}
+; GFX90A:         v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s{{\[}}[[K]]:{{[0-9:]+}}] op_sel_hi:[1,0]{{$}}
+define amdgpu_kernel void @fadd_v2_v_imm(<2 x float> addrspace(1)* %a) {
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i32 %id
+  %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8
+  %add = fadd <2 x float> %load, <float 100.0, float 100.0>
+  store <2 x float> %add, <2 x float> addrspace(1)* %gep, align 8
+  ret void
+}
+
+; GCN-LABEL: {{^}}fadd_v2_v_v_splat:
+; GFX900-COUNT-2: v_add_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v0
+; GFX90A:         v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[0:1] op_sel_hi:[1,0]{{$}}
+define amdgpu_kernel void @fadd_v2_v_v_splat(<2 x float> addrspace(1)* %a) {
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i32 %id
+  %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8
+  %fid = bitcast i32 %id to float
+  %tmp1 = insertelement <2 x float> undef, float %fid, i64 0
+  %k = insertelement <2 x float> %tmp1, float %fid, i64 1
+  %add = fadd <2 x float> %load, %k
+  store <2 x float> %add, <2 x float> addrspace(1)* %gep, align 8
+  ret void
+}
+
+; GCN-LABEL: {{^}}fadd_v2_v_lit_splat:
+; GFX900-COUNT-2: v_add_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}
+; GFX90A:         v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 1.0 op_sel_hi:[1,0]{{$}}
+define amdgpu_kernel void @fadd_v2_v_lit_splat(<2 x float> addrspace(1)* %a) {
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i32 %id
+  %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8
+  %add = fadd <2 x float> %load, <float 1.0, float 1.0>
+  store <2 x float> %add, <2 x float> addrspace(1)* %gep, align 8
+  ret void
+}
+
+; GCN-LABEL: {{^}}fadd_v2_v_lit_hi0:
+; GFX900-DAG: v_add_f32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
+; GFX900-DAG: v_add_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}
+; GFX90A-DAG: s_mov_b32 s[[HI:[0-9]+]], 0
+; GFX90A-DAG: s_mov_b32 s[[LO:[0-9]+]], 1.0
+; GFX90A:     v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s{{\[}}[[LO]]:[[HI]]]{{$}}
+define amdgpu_kernel void @fadd_v2_v_lit_hi0(<2 x float> addrspace(1)* %a) {
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i32 %id
+  %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8
+  %add = fadd <2 x float> %load, <float 1.0, float 0.0>
+  store <2 x float> %add, <2 x float> addrspace(1)* %gep, align 8
+  ret void
+}
+
+; GCN-LABEL: {{^}}fadd_v2_v_lit_lo0:
+; GFX900-DAG: v_add_f32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
+; GFX900-DAG: v_add_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}
+; GFX90A-DAG: s_mov_b32 s[[LO:[0-9]+]], 0
+; GFX90A-DAG: s_mov_b32 s[[HI:[0-9]+]], 1.0
+; GFX90A:     v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s{{\[}}[[LO]]:[[HI]]]{{$}}
+define amdgpu_kernel void @fadd_v2_v_lit_lo0(<2 x float> addrspace(1)* %a) {
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i32 %id
+  %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8
+  %add = fadd <2 x float> %load, <float 0.0, float 1.0>
+  store <2 x float> %add, <2 x float> addrspace(1)* %gep, align 8
+  ret void
+}
+
+; GCN-LABEL: {{^}}fadd_v2_v_unfoldable_lit:
+; GFX900-DAG: v_add_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}
+; GFX900-DAG: v_add_f32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}}
+; GFX90A-DAG: s_mov_b32 s{{[0-9]+}}, 1.0
+; GFX90A-DAG: s_mov_b32 s{{[0-9]+}}, 2.0
+; GFX90A:     v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}}
+define amdgpu_kernel void @fadd_v2_v_unfoldable_lit(<2 x float> addrspace(1)* %a) {
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i32 %id
+  %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8
+  %add = fadd <2 x float> %load, <float 1.0, float 2.0>
+  store <2 x float> %add, <2 x float> addrspace(1)* %gep, align 8
+  ret void
+}
+
+; GCN-LABEL: {{^}}fadd_v2_v_fneg:
+; GFX900-COUNT-2: v_subrev_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
+; GFX90A:         v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}] op_sel_hi:[1,0] neg_lo:[0,1] neg_hi:[0,1]{{$}}
+define amdgpu_kernel void @fadd_v2_v_fneg(<2 x float> addrspace(1)* %a, float %x) {
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i32 %id
+  %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8
+  %fneg = fsub float -0.0, %x
+  %tmp1 = insertelement <2 x float> undef, float %fneg, i64 0
+  %k = insertelement <2 x float> %tmp1, float %fneg, i64 1
+  %add = fadd <2 x float> %load, %k
+  store <2 x float> %add, <2 x float> addrspace(1)* %gep, align 8
+  ret void
+}
+
+; GCN-LABEL: {{^}}fadd_v2_v_fneg_lo:
+; GFX900-DAG: v_add_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
+; GFX900-DAG: v_subrev_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
+; GFX90A:     v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}] op_sel_hi:[1,0] neg_lo:[0,1]{{$}}
+define amdgpu_kernel void @fadd_v2_v_fneg_lo(<2 x float> addrspace(1)* %a, float %x) {
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i32 %id
+  %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8
+  %fneg = fsub float -0.0, %x
+  %tmp1 = insertelement <2 x float> undef, float %fneg, i64 0
+  %k = insertelement <2 x float> %tmp1, float %x, i64 1
+  %add = fadd <2 x float> %load, %k
+  store <2 x float> %add, <2 x float> addrspace(1)* %gep, align 8
+  ret void
+}
+
+; GCN-LABEL: {{^}}fadd_v2_v_fneg_hi:
+; GFX900-DAG: v_add_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
+; GFX900-DAG: v_subrev_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
+; GFX90A:     v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}] op_sel_hi:[1,0] neg_hi:[0,1]{{$}}
+define amdgpu_kernel void @fadd_v2_v_fneg_hi(<2 x float> addrspace(1)* %a, float %x) {
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i32 %id
+  %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8
+  %fneg = fsub float -0.0, %x
+  %tmp1 = insertelement <2 x float> undef, float %x, i64 0
+  %k = insertelement <2 x float> %tmp1, float %fneg, i64 1
+  %add = fadd <2 x float> %load, %k
+  store <2 x float> %add, <2 x float> addrspace(1)* %gep, align 8
+  ret void
+}
+
+; GCN-LABEL: {{^}}fadd_v2_v_fneg_lo2:
+; GFX900-DAG: v_add_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
+; GFX900-DAG: v_subrev_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
+; GFX90A:     v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}] neg_lo:[0,1]{{$}}
+define amdgpu_kernel void @fadd_v2_v_fneg_lo2(<2 x float> addrspace(1)* %a, float %x, float %y) {
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i32 %id
+  %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8
+  %fneg = fsub float -0.0, %x
+  %tmp1 = insertelement <2 x float> undef, float %fneg, i64 0
+  %k = insertelement <2 x float> %tmp1, float %y, i64 1
+  %add = fadd <2 x float> %load, %k
+  store <2 x float> %add, <2 x float> addrspace(1)* %gep, align 8
+  ret void
+}
+
+; GCN-LABEL: {{^}}fadd_v2_v_fneg_hi2:
+; GFX900-DAG: v_add_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
+; GFX900-DAG: v_subrev_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
+; GFX90A:     v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}] op_sel:[0,1] op_sel_hi:[1,0] neg_hi:[0,1]{{$}}
+define amdgpu_kernel void @fadd_v2_v_fneg_hi2(<2 x float> addrspace(1)* %a, float %x, float %y) {
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i32 %id
+  %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8
+  %fneg = fsub float -0.0, %x
+  %tmp1 = insertelement <2 x float> undef, float %y, i64 0
+  %k = insertelement <2 x float> %tmp1, float %fneg, i64 1
+  %add = fadd <2 x float> %load, %k
+  store <2 x float> %add, <2 x float> addrspace(1)* %gep, align 8
+  ret void
+}
+
+; GCN-LABEL: {{^}}fmul_v2_vv:
+; GFX900-COUNT-2: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; GFX90A:         v_pk_mul_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}]
+define amdgpu_kernel void @fmul_v2_vv(<2 x float> addrspace(1)* %a) {
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i32 %id
+  %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8
+  %mul = fmul <2 x float> %load, %load
+  store <2 x float> %mul, <2 x float> addrspace(1)* %gep, align 8
+  ret void
+}
+
+; GCN-LABEL: {{^}}fmul_v2_vs:
+; GFX900-COUNT-2: v_mul_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
+; GFX90A:         v_pk_mul_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}}
+define amdgpu_kernel void @fmul_v2_vs(<2 x float> addrspace(1)* %a, <2 x float> %x) {
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i32 %id
+  %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8
+  %mul = fmul <2 x float> %load, %x
+  store <2 x float> %mul, <2 x float> addrspace(1)* %gep, align 8
+  ret void
+}
+
+; GCN-LABEL: {{^}}fmul_v4_vs:
+; GFX900-COUNT-4: v_mul_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
+; GFX90A-COUNT-2: v_pk_mul_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}}
+define amdgpu_kernel void @fmul_v4_vs(<4 x float> addrspace(1)* %a, <4 x float> %x) {
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %a, i32 %id
+  %load = load <4 x float>, <4 x float> addrspace(1)* %gep, align 16
+  %mul = fmul <4 x float> %load, %x
+  store <4 x float> %mul, <4 x float> addrspace(1)* %gep, align 16
+  ret void
+}
+
+; GCN-LABEL: {{^}}fmul_v32_vs:
+; GFX900-COUNT-32: v_mul_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
+; GFX90A-COUNT-16: v_pk_mul_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}}
+define amdgpu_kernel void @fmul_v32_vs(<32 x float> addrspace(1)* %a, <32 x float> %x) {
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %a, i32 %id
+  %load = load <32 x float>, <32 x float> addrspace(1)* %gep, align 128
+  %mul = fmul <32 x float> %load, %x
+  store <32 x float> %mul, <32 x float> addrspace(1)* %gep, align 128
+  ret void
+}
+
+; GCN-LABEL: {{^}}fmul_v2_v_imm:
+; GCN:            s_mov_b32 s[[K:[0-9]+]], 0x42c80000
+; GFX900-COUNT-2: v_mul_f32_e32 v{{[0-9]+}}, s[[K]], v{{[0-9]+}}
+; GFX90A:         v_pk_mul_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s{{\[}}[[K]]:{{[0-9:]+}}] op_sel_hi:[1,0]{{$}}
+define amdgpu_kernel void @fmul_v2_v_imm(<2 x float> addrspace(1)* %a) {
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i32 %id
+  %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8
+  %mul = fmul <2 x float> %load, <float 100.0, float 100.0>
+  store <2 x float> %mul, <2 x float> addrspace(1)* %gep, align 8
+  ret void
+}
+
+; GCN-LABEL: {{^}}fmul_v2_v_v_splat:
+; GFX900-COUNT-2: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v0
+; GFX90A:         v_pk_mul_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[0:1] op_sel_hi:[1,0]{{$}}
+define amdgpu_kernel void @fmul_v2_v_v_splat(<2 x float> addrspace(1)* %a) {
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i32 %id
+  %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8
+  %fid = bitcast i32 %id to float
+  %tmp1 = insertelement <2 x float> undef, float %fid, i64 0
+  %k = insertelement <2 x float> %tmp1, float %fid, i64 1
+  %mul = fmul <2 x float> %load, %k
+  store <2 x float> %mul, <2 x float> addrspace(1)* %gep, align 8
+  ret void
+}
+
+; GCN-LABEL: {{^}}fmul_v2_v_lit_splat:
+; GFX900-COUNT-2: v_mul_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}
+; GFX90A:         v_pk_mul_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 4.0 op_sel_hi:[1,0]{{$}}
+define amdgpu_kernel void @fmul_v2_v_lit_splat(<2 x float> addrspace(1)* %a) {
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i32 %id
+  %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8
+  %mul = fmul <2 x float> %load, <float 4.0, float 4.0>
+  store <2 x float> %mul, <2 x float> addrspace(1)* %gep, align 8
+  ret void
+}
+
+; GCN-LABEL: {{^}}fmul_v2_v_unfoldable_lit:
+; GFX900-DAG: v_mul_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}
+; GFX900-DAG: v_mul_f32_e32 v{{[0-9]+}}, 0x40400000, v{{[0-9]+}}
+; GFX90A-DAG: s_mov_b32 s{{[0-9]+}}, 4.0
+; GFX90A-DAG: s_mov_b32 s{{[0-9]+}}, 0x40400000
+; GFX90A:     v_pk_mul_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}}
+define amdgpu_kernel void @fmul_v2_v_unfoldable_lit(<2 x float> addrspace(1)* %a) {
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i32 %id
+  %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8
+  %mul = fmul <2 x float> %load, <float 4.0, float 3.0>
+  store <2 x float> %mul, <2 x float> addrspace(1)* %gep, align 8
+  ret void
+}
+
+; GCN-LABEL: {{^}}fmul_v2_v_fneg:
+; GFX900-COUNT-2: v_mul_f32_e64 v{{[0-9]+}}, v{{[0-9]+}}, -s{{[0-9]+}}
+; GFX90A:         v_pk_mul_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}] op_sel_hi:[1,0] neg_lo:[0,1] neg_hi:[0,1]{{$}}
+define amdgpu_kernel void @fmul_v2_v_fneg(<2 x float> addrspace(1)* %a, float %x) {
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i32 %id
+  %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8
+  %fneg = fsub float -0.0, %x
+  %tmp1 = insertelement <2 x float> undef, float %fneg, i64 0
+  %k = insertelement <2 x float> %tmp1, float %fneg, i64 1
+  %mul = fmul <2 x float> %load, %k
+  store <2 x float> %mul, <2 x float> addrspace(1)* %gep, align 8
+  ret void
+}
+
+; GCN-LABEL: {{^}}fma_v2_vv:
+; GFX900-COUNT-2: v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; GFX90A:         v_pk_fma_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}]
+define amdgpu_kernel void @fma_v2_vv(<2 x float> addrspace(1)* %a) {
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i32 %id
+  %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8
+  %fma = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %load, <2 x float> %load, <2 x float> %load)
+  store <2 x float> %fma, <2 x float> addrspace(1)* %gep, align 8
+  ret void
+}
+
+; GCN-LABEL: {{^}}fma_v2_vs:
+; GFX900-COUNT-2: v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
+; GFX90A:         v_pk_fma_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}}
+define amdgpu_kernel void @fma_v2_vs(<2 x float> addrspace(1)* %a, <2 x float> %x) {
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i32 %id
+  %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8
+  %fma = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %load, <2 x float> %x, <2 x float> %x)
+  store <2 x float> %fma, <2 x float> addrspace(1)* %gep, align 8
+  ret void
+}
+
+; GCN-LABEL: {{^}}fma_v4_vs:
+; GFX900-COUNT-4: v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
+; GFX90A-COUNT-2: v_pk_fma_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}}
+define amdgpu_kernel void @fma_v4_vs(<4 x float> addrspace(1)* %a, <4 x float> %x) {
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %a, i32 %id
+  %load = load <4 x float>, <4 x float> addrspace(1)* %gep, align 16
+  %fma = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %load, <4 x float> %x, <4 x float> %x)
+  store <4 x float> %fma, <4 x float> addrspace(1)* %gep, align 16
+  ret void
+}
+
+; GCN-LABEL: {{^}}fma_v32_vs:
+; GFX900-COUNT-32: v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
+; GFX90A-COUNT-16: v_pk_fma_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}}
+define amdgpu_kernel void @fma_v32_vs(<32 x float> addrspace(1)* %a, <32 x float> %x) {
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %a, i32 %id
+  %load = load <32 x float>, <32 x float> addrspace(1)* %gep, align 128
+  %fma = tail call <32 x float> @llvm.fma.v32f32(<32 x float> %load, <32 x float> %x, <32 x float> %x)
+  store <32 x float> %fma, <32 x float> addrspace(1)* %gep, align 128
+  ret void
+}
+
+; GCN-LABEL: {{^}}fma_v2_v_imm:
+; GCN-DAG:        s_mov_b32 s[[K1:[0-9]+]], 0x42c80000
+; GCN-DAG:        v_mov_b32_e32 v[[K2:[0-9]+]], 0x43480000
+; GFX900-COUNT-2: v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, s[[K1]], v[[K2]]
+; GFX90A:         v_pk_fma_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s{{\[}}[[K1]]:{{[0-9:]+}}], v{{\[}}[[K2]]:{{[0-9:]+}}] op_sel_hi:[1,0,0]{{$}}
+define amdgpu_kernel void @fma_v2_v_imm(<2 x float> addrspace(1)* %a) {
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i32 %id
+  %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8
+  %fma = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %load, <2 x float> <float 100.0, float 100.0>, <2 x float> <float 200.0, float 200.0>)
+  store <2 x float> %fma, <2 x float> addrspace(1)* %gep, align 8
+  ret void
+}
+
+; GCN-LABEL: {{^}}fma_v2_v_v_splat:
+; GFX900-COUNT-2: v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, v0, v0
+; GFX90A:         v_pk_fma_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[0:1], v[0:1] op_sel_hi:[1,0,0]{{$}}
+define amdgpu_kernel void @fma_v2_v_v_splat(<2 x float> addrspace(1)* %a) {
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i32 %id
+  %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8
+  %fid = bitcast i32 %id to float
+  %tmp1 = insertelement <2 x float> undef, float %fid, i64 0
+  %k = insertelement <2 x float> %tmp1, float %fid, i64 1
+  %fma = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %load, <2 x float> %k, <2 x float> %k)
+  store <2 x float> %fma, <2 x float> addrspace(1)* %gep, align 8
+  ret void
+}
+
+; GCN-LABEL: {{^}}fma_v2_v_lit_splat:
+; GFX900-COUNT-2: v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, 4.0, 1.0
+; GFX90A:         v_pk_fma_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 4.0, 1.0 op_sel_hi:[1,0,0]{{$}}
+define amdgpu_kernel void @fma_v2_v_lit_splat(<2 x float> addrspace(1)* %a) {
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i32 %id
+  %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8
+  %fma = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %load, <2 x float> <float 4.0, float 4.0>, <2 x float> <float 1.0, float 1.0>)
+  store <2 x float> %fma, <2 x float> addrspace(1)* %gep, align 8
+  ret void
+}
+
+; GCN-LABEL: {{^}}fma_v2_v_unfoldable_lit:
+; GCN-DAG:    s_mov_b32 s{{[0-9]+}}, 0x40400000
+; GFX900-DAG: v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, 4.0, 1.0
+; GFX900-DAG: v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}, 2.0
+; GFX90A-DAG: s_mov_b32 s{{[0-9]+}}, 4.0
+; GFX90A-DAG: v_mov_b32_e32 v{{[0-9]+}}, 1.0
+; GFX90A-DAG: v_mov_b32_e32 v{{[0-9]+}}, 2.0
+; GFX90A:     v_pk_fma_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}], v[{{[0-9:]+}}]{{$}}
+define amdgpu_kernel void @fma_v2_v_unfoldable_lit(<2 x float> addrspace(1)* %a) {
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i32 %id
+  %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8
+  %fma = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %load, <2 x float> <float 4.0, float 3.0>, <2 x float> <float 1.0, float 2.0>)
+  store <2 x float> %fma, <2 x float> addrspace(1)* %gep, align 8
+  ret void
+}
+
+; GCN-LABEL: {{^}}fma_v2_v_fneg:
+; GFX900-COUNT-2: v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, -s{{[0-9]+}}, -s{{[0-9]+}}
+; GFX90A:         v_pk_fma_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}] op_sel_hi:[1,0,0] neg_lo:[0,1,1] neg_hi:[0,1,1]{{$}}
+define amdgpu_kernel void @fma_v2_v_fneg(<2 x float> addrspace(1)* %a, float %x) {
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i32 %id
+  %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8
+  %fneg = fsub float -0.0, %x
+  %tmp1 = insertelement <2 x float> undef, float %fneg, i64 0
+  %k = insertelement <2 x float> %tmp1, float %fneg, i64 1
+  %fma = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %load, <2 x float> %k, <2 x float> %k)
+  store <2 x float> %fma, <2 x float> addrspace(1)* %gep, align 8
+  ret void
+}
+
+; GCN-LABEL: {{^}}add_vector_neg_bitcast_scalar_lo:
+; GFX900-COUNT-2: v_sub_f32_e32
+; GFX90A:         v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}] op_sel_hi:[1,0] neg_lo:[0,1] neg_hi:[0,1]
+define amdgpu_kernel void @add_vector_neg_bitcast_scalar_lo(<2 x float> addrspace(1)* %out, <2 x float> addrspace(3)* %lds, float addrspace(3)* %arg2) {
+bb:
+  %vec0 = load volatile <2 x float>, <2 x float> addrspace(3)* %lds, align 4
+  %scalar0 = load volatile float, float addrspace(3)* %arg2, align 4
+  %neg.scalar0 = fsub float -0.0, %scalar0
+
+  %neg.scalar0.vec = insertelement <2 x float> undef, float %neg.scalar0, i32 0
+  %neg.scalar0.broadcast = shufflevector <2 x float> %neg.scalar0.vec, <2 x float> undef, <2 x i32> zeroinitializer
+
+  %result = fadd <2 x float> %vec0, %neg.scalar0.broadcast
+  store <2 x float> %result, <2 x float> addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}fma_vector_vector_neg_scalar_lo_scalar_hi:
+; GFX900-COUNT-2: v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}
+; GFX90A:         v_pk_fma_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}] neg_lo:[0,0,1] neg_hi:[0,0,1]
+define amdgpu_kernel void @fma_vector_vector_neg_scalar_lo_scalar_hi(<2 x float> addrspace(1)* %out, <2 x float> addrspace(3)* %lds, float addrspace(3)* %arg2) {
+bb:
+  %lds.gep1 = getelementptr inbounds <2 x float>, <2 x float> addrspace(3)* %lds, i32 1
+  %arg2.gep = getelementptr inbounds float, float addrspace(3)* %arg2, i32 2
+
+  %vec0 = load volatile <2 x float>, <2 x float> addrspace(3)* %lds, align 4
+  %vec1 = load volatile <2 x float>, <2 x float> addrspace(3)* %lds.gep1, align 4
+
+  %scalar0 = load volatile float, float addrspace(3)* %arg2, align 4
+  %scalar1 = load volatile float, float addrspace(3)* %arg2.gep, align 4
+
+  %vec.ins0 = insertelement <2 x float> undef, float %scalar0, i32 0
+  %vec2 = insertelement <2 x float> %vec.ins0, float %scalar1, i32 1
+  %neg.vec2 = fsub <2 x float> <float -0.0, float -0.0>, %vec2
+
+  %result = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %vec0, <2 x float> %vec1, <2 x float> %neg.vec2)
+  store <2 x float> %result, <2 x float> addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}shuffle_add_f32:
+; GFX900-COUNT-2: v_add_f32_e32
+; GFX90A:         v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}] op_sel:[0,1] op_sel_hi:[1,0]{{$}}
+define amdgpu_kernel void @shuffle_add_f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(3)* %lds) #0 {
+bb:
+  %vec0 = load volatile <2 x float>, <2 x float> addrspace(3)* %lds, align 8
+  %lds.gep1 = getelementptr inbounds <2 x float>, <2 x float> addrspace(3)* %lds, i32 1
+  %vec1 = load volatile <2 x float>, <2 x float> addrspace(3)* %lds.gep1, align 8
+  %vec1.swap = shufflevector <2 x float> %vec1, <2 x float> undef, <2 x i32> <i32 1, i32 0>
+  %result = fadd <2 x float> %vec0, %vec1.swap
+  store <2 x float> %result, <2 x float> addrspace(1)* %out, align 8
+  ret void
+}
+
+; GCN-LABEL: {{^}}shuffle_neg_add_f32:
+; GFX900-COUNT-2: v_sub_f32_e32
+; GFX90A: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}] op_sel:[0,1] op_sel_hi:[1,0] neg_lo:[0,1] neg_hi:[0,1]{{$}}
+define amdgpu_kernel void @shuffle_neg_add_f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(3)* %lds) #0 {
+bb:
+  %vec0 = load volatile <2 x float>, <2 x float> addrspace(3)* %lds, align 8
+  %lds.gep1 = getelementptr inbounds <2 x float>, <2 x float> addrspace(3)* %lds, i32 1
+  %f32 = load volatile float, float addrspace(3)* undef, align 8
+  %vec1 = load volatile <2 x float>, <2 x float> addrspace(3)* %lds.gep1, align 8
+  %vec1.neg = fsub <2 x float> <float -0.0, float -0.0>, %vec1
+  %vec1.neg.swap = shufflevector <2 x float> %vec1.neg, <2 x float> undef, <2 x i32> <i32 1, i32 0>
+  %result = fadd <2 x float> %vec0, %vec1.neg.swap
+  store <2 x float> %result, <2 x float> addrspace(1)* %out, align 8
+  ret void
+}
+
+; GCN-LABEL: {{^}}fadd_fadd_fsub:
+; GFX900: v_add_f32_e64 v{{[0-9]+}}, s{{[0-9]+}}, 0
+; GFX900: v_add_f32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
+; GFX90A: v_pk_add_f32 v[{{[0-9:]+}}], s[{{[0-9:]+}}], 0 op_sel_hi:[1,0]
+; GFX90A: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 0 op_sel_hi:[1,0]
+define amdgpu_kernel void @fadd_fadd_fsub(<2 x float> %arg) {
+bb:
+  %i12 = fadd <2 x float> zeroinitializer, %arg
+  %shift8 = shufflevector <2 x float> %i12, <2 x float> undef, <2 x i32> <i32 1, i32 undef>
+  %i13 = fadd <2 x float> zeroinitializer, %shift8
+  %i14 = shufflevector <2 x float> %arg, <2 x float> %i13, <2 x i32> <i32 0, i32 2>
+  %i15 = fsub <2 x float> %i14, zeroinitializer
+  store <2 x float> %i15, <2 x float>* undef
+  ret void
+}
+
+; GCN-LABEL: {{^}}fadd_shuffle_v4:
+; GFX900-COUNT-4: v_add_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; GFX90A-COUNT-2: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}] op_sel_hi:[1,0]
+define amdgpu_kernel void @fadd_shuffle_v4(<4 x float> addrspace(1)* %arg) {
+bb:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %arg, i32 %tid
+  %in.1 = load <4 x float>, <4 x float> addrspace(1)* %gep
+  %shuf = shufflevector <4 x float> %in.1, <4 x float> undef, <4 x i32> zeroinitializer
+  %add.1 = fadd <4 x float> %in.1, %shuf
+  store <4 x float> %add.1, <4 x float> addrspace(1)* %gep
+  ret void
+}
+
+; GCN-LABEL: {{^}}fneg_v2f32_vec:
+; GFX900:         s_brev_b32 [[SIGN:s[0-9]+]], 1
+; GFX900-COUNT-2: v_xor_b32_e32 v{{[0-9]+}}, [[SIGN]], v{{[0-9]+}}
+; GFX90A:         v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 0 neg_lo:[1,1] neg_hi:[1,1]{{$}}
+define amdgpu_kernel void @fneg_v2f32_vec(<2 x float> addrspace(1)* %a) {
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i32 %id
+  %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8
+  %fneg = fsub <2 x float> <float -0.0, float -0.0>, %load
+  store <2 x float> %fneg, <2 x float> addrspace(1)* %gep, align 8
+  ret void
+}
+
+; GCN-LABEL: {{^}}fneg_v2f32_scalar:
+; GCN:         s_brev_b32 [[SIGN:s[0-9]+]], 1
+; GCN-COUNT-2: s_xor_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[SIGN]]
+define amdgpu_kernel void @fneg_v2f32_scalar(<2 x float> addrspace(1)* %a, <2 x float> %x) {
+  %fneg = fsub <2 x float> <float -0.0, float -0.0>, %x
+  store <2 x float> %fneg, <2 x float> addrspace(1)* %a, align 8
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x()
+declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x float>)
+declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>)
+declare <32 x float> @llvm.fma.v32f32(<32 x float>, <32 x float>, <32 x float>)

diff  --git a/llvm/test/CodeGen/AMDGPU/pei-build-spill-partial-agpr.mir b/llvm/test/CodeGen/AMDGPU/pei-build-spill-partial-agpr.mir
index 7ccba04c9d1a..5b5ab2c106dd 100644
--- a/llvm/test/CodeGen/AMDGPU/pei-build-spill-partial-agpr.mir
+++ b/llvm/test/CodeGen/AMDGPU/pei-build-spill-partial-agpr.mir
@@ -59,17 +59,17 @@ body:             |
     ; MUBUF-V2A: liveins: $agpr0
     ; MUBUF-V2A: $vgpr0_vgpr1 = IMPLICIT_DEF
     ; MUBUF-V2A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1
-    ; MUBUF-V2A: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, implicit $exec, implicit killed $vgpr0_vgpr1 :: (store 4 into %stack.0 + 4, addrspace 5)
+    ; MUBUF-V2A: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec, implicit killed $vgpr0_vgpr1 :: (store 4 into %stack.0 + 4, addrspace 5)
     ; MUBUF-V2A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1
-    ; MUBUF-V2A: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1 :: (load 4 from %stack.0 + 4, addrspace 5)
+    ; MUBUF-V2A: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1 :: (load 4 from %stack.0 + 4, addrspace 5)
     ; MUBUF-V2A: S_ENDPGM 0
     ; FLATSCR-V2A-LABEL: name: test_spill_v2_partial_agpr
     ; FLATSCR-V2A: liveins: $agpr0
     ; FLATSCR-V2A: $vgpr0_vgpr1 = IMPLICIT_DEF
     ; FLATSCR-V2A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1
-    ; FLATSCR-V2A: SCRATCH_STORE_DWORD_SADDR killed $vgpr1, $sgpr32, 4, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit killed $vgpr0_vgpr1 :: (store 4 into %stack.0 + 4, addrspace 5)
+    ; FLATSCR-V2A: SCRATCH_STORE_DWORD_SADDR killed $vgpr1, $sgpr32, 4, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit killed $vgpr0_vgpr1 :: (store 4 into %stack.0 + 4, addrspace 5)
     ; FLATSCR-V2A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1
-    ; FLATSCR-V2A: $vgpr1 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 4, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1 :: (load 4 from %stack.0 + 4, addrspace 5)
+    ; FLATSCR-V2A: $vgpr1 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 4, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1 :: (load 4 from %stack.0 + 4, addrspace 5)
     ; FLATSCR-V2A: S_ENDPGM 0
     $vgpr0_vgpr1 = IMPLICIT_DEF
     SI_SPILL_V64_SAVE killed $vgpr0_vgpr1, %stack.0, $sgpr32, 0, implicit $exec :: (store 8 into %stack.0, align 4, addrspace 5)
@@ -93,19 +93,19 @@ body:             |
     ; MUBUF-V2A: liveins: $agpr0
     ; MUBUF-V2A: $vgpr0_vgpr1_vgpr2 = IMPLICIT_DEF
     ; MUBUF-V2A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2, implicit $vgpr0_vgpr1_vgpr2
-    ; MUBUF-V2A: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2 :: (store 4 into %stack.0 + 4, addrspace 5)
-    ; MUBUF-V2A: BUFFER_STORE_DWORD_OFFSET killed $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2 :: (store 4 into %stack.0 + 8, addrspace 5)
+    ; MUBUF-V2A: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2 :: (store 4 into %stack.0 + 4, addrspace 5)
+    ; MUBUF-V2A: BUFFER_STORE_DWORD_OFFSET killed $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, 0, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2 :: (store 4 into %stack.0 + 8, addrspace 5)
     ; MUBUF-V2A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2
-    ; MUBUF-V2A: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2 :: (load 4 from %stack.0 + 4, addrspace 5)
-    ; MUBUF-V2A: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2 :: (load 4 from %stack.0 + 8, addrspace 5)
+    ; MUBUF-V2A: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2 :: (load 4 from %stack.0 + 4, addrspace 5)
+    ; MUBUF-V2A: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2 :: (load 4 from %stack.0 + 8, addrspace 5)
     ; MUBUF-V2A: S_ENDPGM 0
     ; FLATSCR-V2A-LABEL: name: test_spill_v3_partial_agpr
     ; FLATSCR-V2A: liveins: $agpr0
     ; FLATSCR-V2A: $vgpr0_vgpr1_vgpr2 = IMPLICIT_DEF
     ; FLATSCR-V2A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2, implicit $vgpr0_vgpr1_vgpr2
-    ; FLATSCR-V2A: SCRATCH_STORE_DWORDX2_SADDR killed $vgpr1_vgpr2, $sgpr32, 4, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit killed $vgpr0_vgpr1_vgpr2 :: (store 8 into %stack.0 + 4, align 4, addrspace 5)
+    ; FLATSCR-V2A: SCRATCH_STORE_DWORDX2_SADDR killed $vgpr1_vgpr2, $sgpr32, 4, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit killed $vgpr0_vgpr1_vgpr2 :: (store 8 into %stack.0 + 4, align 4, addrspace 5)
     ; FLATSCR-V2A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2
-    ; FLATSCR-V2A: $vgpr1_vgpr2 = SCRATCH_LOAD_DWORDX2_SADDR $sgpr32, 4, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2 :: (load 8 from %stack.0 + 4, align 4, addrspace 5)
+    ; FLATSCR-V2A: $vgpr1_vgpr2 = SCRATCH_LOAD_DWORDX2_SADDR $sgpr32, 4, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2 :: (load 8 from %stack.0 + 4, align 4, addrspace 5)
     ; FLATSCR-V2A: S_ENDPGM 0
     $vgpr0_vgpr1_vgpr2 = IMPLICIT_DEF
     SI_SPILL_V96_SAVE killed $vgpr0_vgpr1_vgpr2, %stack.0, $sgpr32, 0, implicit $exec :: (store 12 into %stack.0, align 4, addrspace 5)
@@ -131,11 +131,11 @@ body:             |
     ; MUBUF-V2A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr0_vgpr1_vgpr2_vgpr3
     ; MUBUF-V2A: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3
     ; MUBUF-V2A: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3
-    ; MUBUF-V2A: BUFFER_STORE_DWORD_OFFSET killed $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3 :: (store 4 into %stack.0 + 12, addrspace 5)
+    ; MUBUF-V2A: BUFFER_STORE_DWORD_OFFSET killed $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, 0, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3 :: (store 4 into %stack.0 + 12, addrspace 5)
     ; MUBUF-V2A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3
     ; MUBUF-V2A: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3
     ; MUBUF-V2A: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3
-    ; MUBUF-V2A: $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3 :: (load 4 from %stack.0 + 12, addrspace 5)
+    ; MUBUF-V2A: $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3 :: (load 4 from %stack.0 + 12, addrspace 5)
     ; MUBUF-V2A: S_ENDPGM 0
     ; FLATSCR-V2A-LABEL: name: test_spill_v4_partial_agpr
     ; FLATSCR-V2A: liveins: $agpr0, $agpr1, $agpr2
@@ -143,11 +143,11 @@ body:             |
     ; FLATSCR-V2A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr0_vgpr1_vgpr2_vgpr3
     ; FLATSCR-V2A: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3
     ; FLATSCR-V2A: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3
-    ; FLATSCR-V2A: SCRATCH_STORE_DWORD_SADDR killed $vgpr3, $sgpr32, 12, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3 :: (store 4 into %stack.0 + 12, addrspace 5)
+    ; FLATSCR-V2A: SCRATCH_STORE_DWORD_SADDR killed $vgpr3, $sgpr32, 12, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3 :: (store 4 into %stack.0 + 12, addrspace 5)
     ; FLATSCR-V2A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3
     ; FLATSCR-V2A: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3
     ; FLATSCR-V2A: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3
-    ; FLATSCR-V2A: $vgpr3 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 12, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3 :: (load 4 from %stack.0 + 12, addrspace 5)
+    ; FLATSCR-V2A: $vgpr3 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 12, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3 :: (load 4 from %stack.0 + 12, addrspace 5)
     ; FLATSCR-V2A: S_ENDPGM 0
     $vgpr0_vgpr1_vgpr2_vgpr3 = IMPLICIT_DEF
     SI_SPILL_V128_SAVE killed $vgpr0_vgpr1_vgpr2_vgpr3, %stack.0, $sgpr32, 0, implicit $exec :: (store 16 into %stack.0, align 4, addrspace 5)
@@ -173,13 +173,13 @@ body:             |
     ; MUBUF-V2A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4
     ; MUBUF-V2A: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4
     ; MUBUF-V2A: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4
-    ; MUBUF-V2A: BUFFER_STORE_DWORD_OFFSET killed $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (store 4 into %stack.0 + 12, addrspace 5)
-    ; MUBUF-V2A: BUFFER_STORE_DWORD_OFFSET killed $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, 0, 0, 0, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (store 4 into %stack.0 + 16, addrspace 5)
+    ; MUBUF-V2A: BUFFER_STORE_DWORD_OFFSET killed $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (store 4 into %stack.0 + 12, addrspace 5)
+    ; MUBUF-V2A: BUFFER_STORE_DWORD_OFFSET killed $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, 0, 0, 0, 0, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (store 4 into %stack.0 + 16, addrspace 5)
     ; MUBUF-V2A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4
     ; MUBUF-V2A: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4
     ; MUBUF-V2A: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4
-    ; MUBUF-V2A: $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (load 4 from %stack.0 + 12, addrspace 5)
-    ; MUBUF-V2A: $vgpr4 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (load 4 from %stack.0 + 16, addrspace 5)
+    ; MUBUF-V2A: $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (load 4 from %stack.0 + 12, addrspace 5)
+    ; MUBUF-V2A: $vgpr4 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (load 4 from %stack.0 + 16, addrspace 5)
     ; MUBUF-V2A: S_ENDPGM 0
     ; FLATSCR-V2A-LABEL: name: test_spill_v5_partial_agpr
     ; FLATSCR-V2A: liveins: $agpr0, $agpr1, $agpr2
@@ -187,13 +187,13 @@ body:             |
     ; FLATSCR-V2A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4
     ; FLATSCR-V2A: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4
     ; FLATSCR-V2A: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4
-    ; FLATSCR-V2A: SCRATCH_STORE_DWORD_SADDR killed $vgpr3, $sgpr32, 12, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (store 4 into %stack.0 + 12, addrspace 5)
-    ; FLATSCR-V2A: SCRATCH_STORE_DWORD_SADDR killed $vgpr4, $sgpr32, 16, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (store 4 into %stack.0 + 16, addrspace 5)
+    ; FLATSCR-V2A: SCRATCH_STORE_DWORD_SADDR killed $vgpr3, $sgpr32, 12, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (store 4 into %stack.0 + 12, addrspace 5)
+    ; FLATSCR-V2A: SCRATCH_STORE_DWORD_SADDR killed $vgpr4, $sgpr32, 16, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (store 4 into %stack.0 + 16, addrspace 5)
     ; FLATSCR-V2A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4
     ; FLATSCR-V2A: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4
     ; FLATSCR-V2A: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4
-    ; FLATSCR-V2A: $vgpr3 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 12, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (load 4 from %stack.0 + 12, addrspace 5)
-    ; FLATSCR-V2A: $vgpr4 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 16, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (load 4 from %stack.0 + 16, addrspace 5)
+    ; FLATSCR-V2A: $vgpr3 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 12, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (load 4 from %stack.0 + 12, addrspace 5)
+    ; FLATSCR-V2A: $vgpr4 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 16, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (load 4 from %stack.0 + 16, addrspace 5)
     ; FLATSCR-V2A: S_ENDPGM 0
     $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 = IMPLICIT_DEF
     SI_SPILL_V160_SAVE killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4, %stack.0, $sgpr32, 0, implicit $exec :: (store 20 into %stack.0, align 4, addrspace 5)
@@ -221,13 +221,13 @@ body:             |
     ; MUBUF-V2A: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5
     ; MUBUF-V2A: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr3, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5
     ; MUBUF-V2A: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr4, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5
-    ; MUBUF-V2A: BUFFER_STORE_DWORD_OFFSET killed $vgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, 0, 0, 0, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (store 4 into %stack.0 + 20, addrspace 5)
+    ; MUBUF-V2A: BUFFER_STORE_DWORD_OFFSET killed $vgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, 0, 0, 0, 0, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (store 4 into %stack.0 + 20, addrspace 5)
     ; MUBUF-V2A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5
     ; MUBUF-V2A: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5
     ; MUBUF-V2A: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5
     ; MUBUF-V2A: $vgpr3 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5
     ; MUBUF-V2A: $vgpr4 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5
-    ; MUBUF-V2A: $vgpr5 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (load 4 from %stack.0 + 20, addrspace 5)
+    ; MUBUF-V2A: $vgpr5 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (load 4 from %stack.0 + 20, addrspace 5)
     ; MUBUF-V2A: S_ENDPGM 0
     ; FLATSCR-V2A-LABEL: name: test_spill_v6_partial_agpr
     ; FLATSCR-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3, $agpr4
@@ -237,13 +237,13 @@ body:             |
     ; FLATSCR-V2A: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5
     ; FLATSCR-V2A: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr3, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5
     ; FLATSCR-V2A: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr4, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5
-    ; FLATSCR-V2A: SCRATCH_STORE_DWORD_SADDR killed $vgpr5, $sgpr32, 20, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (store 4 into %stack.0 + 20, addrspace 5)
+    ; FLATSCR-V2A: SCRATCH_STORE_DWORD_SADDR killed $vgpr5, $sgpr32, 20, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (store 4 into %stack.0 + 20, addrspace 5)
     ; FLATSCR-V2A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5
     ; FLATSCR-V2A: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5
     ; FLATSCR-V2A: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5
     ; FLATSCR-V2A: $vgpr3 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5
     ; FLATSCR-V2A: $vgpr4 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5
-    ; FLATSCR-V2A: $vgpr5 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 20, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (load 4 from %stack.0 + 20, addrspace 5)
+    ; FLATSCR-V2A: $vgpr5 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 20, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (load 4 from %stack.0 + 20, addrspace 5)
     ; FLATSCR-V2A: S_ENDPGM 0
     $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 = IMPLICIT_DEF
     SI_SPILL_V192_SAVE killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5, %stack.0, $sgpr32, 0, implicit $exec :: (store 24 into %stack.0, align 4, addrspace 5)
@@ -270,18 +270,18 @@ body:             |
     ; MUBUF-V2A: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
     ; MUBUF-V2A: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
     ; MUBUF-V2A: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr3, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
-    ; MUBUF-V2A: BUFFER_STORE_DWORD_OFFSET killed $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (store 4 into %stack.0 + 16, addrspace 5)
-    ; MUBUF-V2A: BUFFER_STORE_DWORD_OFFSET killed $vgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (store 4 into %stack.0 + 20, addrspace 5)
-    ; MUBUF-V2A: BUFFER_STORE_DWORD_OFFSET killed $vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (store 4 into %stack.0 + 24, addrspace 5)
-    ; MUBUF-V2A: BUFFER_STORE_DWORD_OFFSET killed $vgpr7, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, 0, 0, 0, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (store 4 into %stack.0 + 28, addrspace 5)
+    ; MUBUF-V2A: BUFFER_STORE_DWORD_OFFSET killed $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (store 4 into %stack.0 + 16, addrspace 5)
+    ; MUBUF-V2A: BUFFER_STORE_DWORD_OFFSET killed $vgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (store 4 into %stack.0 + 20, addrspace 5)
+    ; MUBUF-V2A: BUFFER_STORE_DWORD_OFFSET killed $vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (store 4 into %stack.0 + 24, addrspace 5)
+    ; MUBUF-V2A: BUFFER_STORE_DWORD_OFFSET killed $vgpr7, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, 0, 0, 0, 0, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (store 4 into %stack.0 + 28, addrspace 5)
     ; MUBUF-V2A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
     ; MUBUF-V2A: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
     ; MUBUF-V2A: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
     ; MUBUF-V2A: $vgpr3 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
-    ; MUBUF-V2A: $vgpr4 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (load 4 from %stack.0 + 16, addrspace 5)
-    ; MUBUF-V2A: $vgpr5 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (load 4 from %stack.0 + 20, addrspace 5)
-    ; MUBUF-V2A: $vgpr6 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (load 4 from %stack.0 + 24, addrspace 5)
-    ; MUBUF-V2A: $vgpr7 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (load 4 from %stack.0 + 28, addrspace 5)
+    ; MUBUF-V2A: $vgpr4 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (load 4 from %stack.0 + 16, addrspace 5)
+    ; MUBUF-V2A: $vgpr5 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (load 4 from %stack.0 + 20, addrspace 5)
+    ; MUBUF-V2A: $vgpr6 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (load 4 from %stack.0 + 24, addrspace 5)
+    ; MUBUF-V2A: $vgpr7 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (load 4 from %stack.0 + 28, addrspace 5)
     ; MUBUF-V2A: S_ENDPGM 0
     ; FLATSCR-V2A-LABEL: name: test_spill_v8_partial_agpr
     ; FLATSCR-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3
@@ -290,12 +290,12 @@ body:             |
     ; FLATSCR-V2A: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
     ; FLATSCR-V2A: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
     ; FLATSCR-V2A: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr3, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
-    ; FLATSCR-V2A: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr4_vgpr5_vgpr6_vgpr7, $sgpr32, 16, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (store 16 into %stack.0 + 16, align 4, addrspace 5)
+    ; FLATSCR-V2A: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr4_vgpr5_vgpr6_vgpr7, $sgpr32, 16, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (store 16 into %stack.0 + 16, align 4, addrspace 5)
     ; FLATSCR-V2A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
     ; FLATSCR-V2A: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
     ; FLATSCR-V2A: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
     ; FLATSCR-V2A: $vgpr3 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
-    ; FLATSCR-V2A: $vgpr4_vgpr5_vgpr6_vgpr7 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 16, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (load 16 from %stack.0 + 16, align 4, addrspace 5)
+    ; FLATSCR-V2A: $vgpr4_vgpr5_vgpr6_vgpr7 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 16, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (load 16 from %stack.0 + 16, align 4, addrspace 5)
     ; FLATSCR-V2A: S_ENDPGM 0
     $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = IMPLICIT_DEF
     SI_SPILL_V256_SAVE killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, %stack.0, $sgpr32, 0, implicit $exec :: (store 32 into %stack.0, align 4, addrspace 5)
@@ -323,33 +323,33 @@ body:             |
     ; MUBUF-V2A: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
     ; MUBUF-V2A: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr3, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
     ; MUBUF-V2A: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr4, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-    ; MUBUF-V2A: BUFFER_STORE_DWORD_OFFSET killed $vgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 4 into %stack.0 + 20, addrspace 5)
-    ; MUBUF-V2A: BUFFER_STORE_DWORD_OFFSET killed $vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 4 into %stack.0 + 24, addrspace 5)
-    ; MUBUF-V2A: BUFFER_STORE_DWORD_OFFSET killed $vgpr7, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 4 into %stack.0 + 28, addrspace 5)
-    ; MUBUF-V2A: BUFFER_STORE_DWORD_OFFSET killed $vgpr8, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 32, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 4 into %stack.0 + 32, addrspace 5)
-    ; MUBUF-V2A: BUFFER_STORE_DWORD_OFFSET killed $vgpr9, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 36, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 4 into %stack.0 + 36, addrspace 5)
-    ; MUBUF-V2A: BUFFER_STORE_DWORD_OFFSET killed $vgpr10, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 40, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 4 into %stack.0 + 40, addrspace 5)
-    ; MUBUF-V2A: BUFFER_STORE_DWORD_OFFSET killed $vgpr11, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 44, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 4 into %stack.0 + 44, addrspace 5)
-    ; MUBUF-V2A: BUFFER_STORE_DWORD_OFFSET killed $vgpr12, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 48, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 4 into %stack.0 + 48, addrspace 5)
-    ; MUBUF-V2A: BUFFER_STORE_DWORD_OFFSET killed $vgpr13, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 52, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 4 into %stack.0 + 52, addrspace 5)
-    ; MUBUF-V2A: BUFFER_STORE_DWORD_OFFSET killed $vgpr14, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 56, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 4 into %stack.0 + 56, addrspace 5)
-    ; MUBUF-V2A: BUFFER_STORE_DWORD_OFFSET killed $vgpr15, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 60, 0, 0, 0, 0, 0, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 4 into %stack.0 + 60, addrspace 5)
+    ; MUBUF-V2A: BUFFER_STORE_DWORD_OFFSET killed $vgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 4 into %stack.0 + 20, addrspace 5)
+    ; MUBUF-V2A: BUFFER_STORE_DWORD_OFFSET killed $vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 4 into %stack.0 + 24, addrspace 5)
+    ; MUBUF-V2A: BUFFER_STORE_DWORD_OFFSET killed $vgpr7, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 4 into %stack.0 + 28, addrspace 5)
+    ; MUBUF-V2A: BUFFER_STORE_DWORD_OFFSET killed $vgpr8, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 32, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 4 into %stack.0 + 32, addrspace 5)
+    ; MUBUF-V2A: BUFFER_STORE_DWORD_OFFSET killed $vgpr9, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 36, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 4 into %stack.0 + 36, addrspace 5)
+    ; MUBUF-V2A: BUFFER_STORE_DWORD_OFFSET killed $vgpr10, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 40, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 4 into %stack.0 + 40, addrspace 5)
+    ; MUBUF-V2A: BUFFER_STORE_DWORD_OFFSET killed $vgpr11, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 44, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 4 into %stack.0 + 44, addrspace 5)
+    ; MUBUF-V2A: BUFFER_STORE_DWORD_OFFSET killed $vgpr12, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 48, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 4 into %stack.0 + 48, addrspace 5)
+    ; MUBUF-V2A: BUFFER_STORE_DWORD_OFFSET killed $vgpr13, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 52, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 4 into %stack.0 + 52, addrspace 5)
+    ; MUBUF-V2A: BUFFER_STORE_DWORD_OFFSET killed $vgpr14, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 56, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 4 into %stack.0 + 56, addrspace 5)
+    ; MUBUF-V2A: BUFFER_STORE_DWORD_OFFSET killed $vgpr15, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 60, 0, 0, 0, 0, 0, 0, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 4 into %stack.0 + 60, addrspace 5)
     ; MUBUF-V2A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
     ; MUBUF-V2A: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
     ; MUBUF-V2A: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
     ; MUBUF-V2A: $vgpr3 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
     ; MUBUF-V2A: $vgpr4 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-    ; MUBUF-V2A: $vgpr5 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 4 from %stack.0 + 20, addrspace 5)
-    ; MUBUF-V2A: $vgpr6 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 4 from %stack.0 + 24, addrspace 5)
-    ; MUBUF-V2A: $vgpr7 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 4 from %stack.0 + 28, addrspace 5)
-    ; MUBUF-V2A: $vgpr8 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 32, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 4 from %stack.0 + 32, addrspace 5)
-    ; MUBUF-V2A: $vgpr9 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 36, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 4 from %stack.0 + 36, addrspace 5)
-    ; MUBUF-V2A: $vgpr10 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 40, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 4 from %stack.0 + 40, addrspace 5)
-    ; MUBUF-V2A: $vgpr11 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 44, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 4 from %stack.0 + 44, addrspace 5)
-    ; MUBUF-V2A: $vgpr12 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 48, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 4 from %stack.0 + 48, addrspace 5)
-    ; MUBUF-V2A: $vgpr13 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 52, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 4 from %stack.0 + 52, addrspace 5)
-    ; MUBUF-V2A: $vgpr14 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 56, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 4 from %stack.0 + 56, addrspace 5)
-    ; MUBUF-V2A: $vgpr15 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 60, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 4 from %stack.0 + 60, addrspace 5)
+    ; MUBUF-V2A: $vgpr5 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 4 from %stack.0 + 20, addrspace 5)
+    ; MUBUF-V2A: $vgpr6 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 4 from %stack.0 + 24, addrspace 5)
+    ; MUBUF-V2A: $vgpr7 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 4 from %stack.0 + 28, addrspace 5)
+    ; MUBUF-V2A: $vgpr8 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 32, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 4 from %stack.0 + 32, addrspace 5)
+    ; MUBUF-V2A: $vgpr9 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 36, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 4 from %stack.0 + 36, addrspace 5)
+    ; MUBUF-V2A: $vgpr10 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 40, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 4 from %stack.0 + 40, addrspace 5)
+    ; MUBUF-V2A: $vgpr11 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 44, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 4 from %stack.0 + 44, addrspace 5)
+    ; MUBUF-V2A: $vgpr12 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 48, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 4 from %stack.0 + 48, addrspace 5)
+    ; MUBUF-V2A: $vgpr13 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 52, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 4 from %stack.0 + 52, addrspace 5)
+    ; MUBUF-V2A: $vgpr14 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 56, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 4 from %stack.0 + 56, addrspace 5)
+    ; MUBUF-V2A: $vgpr15 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 60, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 4 from %stack.0 + 60, addrspace 5)
     ; MUBUF-V2A: S_ENDPGM 0
     ; FLATSCR-V2A-LABEL: name: test_spill_v16_partial_agpr
     ; FLATSCR-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3, $agpr4
@@ -359,17 +359,17 @@ body:             |
     ; FLATSCR-V2A: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
     ; FLATSCR-V2A: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr3, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
     ; FLATSCR-V2A: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr4, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-    ; FLATSCR-V2A: SCRATCH_STORE_DWORDX3_SADDR killed $vgpr5_vgpr6_vgpr7, $sgpr32, 20, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 12 into %stack.0 + 20, align 4, addrspace 5)
-    ; FLATSCR-V2A: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr8_vgpr9_vgpr10_vgpr11, $sgpr32, 32, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 16 into %stack.0 + 32, align 4, addrspace 5)
-    ; FLATSCR-V2A: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr12_vgpr13_vgpr14_vgpr15, $sgpr32, 48, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 16 into %stack.0 + 48, align 4, addrspace 5)
+    ; FLATSCR-V2A: SCRATCH_STORE_DWORDX3_SADDR killed $vgpr5_vgpr6_vgpr7, $sgpr32, 20, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 12 into %stack.0 + 20, align 4, addrspace 5)
+    ; FLATSCR-V2A: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr8_vgpr9_vgpr10_vgpr11, $sgpr32, 32, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 16 into %stack.0 + 32, align 4, addrspace 5)
+    ; FLATSCR-V2A: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr12_vgpr13_vgpr14_vgpr15, $sgpr32, 48, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 16 into %stack.0 + 48, align 4, addrspace 5)
     ; FLATSCR-V2A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
     ; FLATSCR-V2A: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
     ; FLATSCR-V2A: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
     ; FLATSCR-V2A: $vgpr3 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
     ; FLATSCR-V2A: $vgpr4 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-    ; FLATSCR-V2A: $vgpr5_vgpr6_vgpr7 = SCRATCH_LOAD_DWORDX3_SADDR $sgpr32, 20, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 12 from %stack.0 + 20, align 4, addrspace 5)
-    ; FLATSCR-V2A: $vgpr8_vgpr9_vgpr10_vgpr11 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 32, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 16 from %stack.0 + 32, align 4, addrspace 5)
-    ; FLATSCR-V2A: $vgpr12_vgpr13_vgpr14_vgpr15 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 48, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 16 from %stack.0 + 48, align 4, addrspace 5)
+    ; FLATSCR-V2A: $vgpr5_vgpr6_vgpr7 = SCRATCH_LOAD_DWORDX3_SADDR $sgpr32, 20, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 12 from %stack.0 + 20, align 4, addrspace 5)
+    ; FLATSCR-V2A: $vgpr8_vgpr9_vgpr10_vgpr11 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 32, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 16 from %stack.0 + 32, align 4, addrspace 5)
+    ; FLATSCR-V2A: $vgpr12_vgpr13_vgpr14_vgpr15 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 48, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 16 from %stack.0 + 48, align 4, addrspace 5)
     ; FLATSCR-V2A: S_ENDPGM 0
     $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = IMPLICIT_DEF
     SI_SPILL_V512_SAVE killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, %stack.0, $sgpr32, 0, implicit $exec :: (store 64 into %stack.0, align 4, addrspace 5)

diff  --git a/llvm/test/CodeGen/AMDGPU/pei-build-spill.mir b/llvm/test/CodeGen/AMDGPU/pei-build-spill.mir
index d34621da9abb..1851aadb719e 100644
--- a/llvm/test/CodeGen/AMDGPU/pei-build-spill.mir
+++ b/llvm/test/CodeGen/AMDGPU/pei-build-spill.mir
@@ -3,6 +3,10 @@
 # RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs -run-pass=si-lower-sgpr-spills,prologepilog -o - %s | FileCheck -check-prefix=MUBUF-V2A %s
 # RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs -amdgpu-enable-flat-scratch -run-pass=prologepilog -o - %s | FileCheck -check-prefix=FLATSCR %s
 # RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs -amdgpu-enable-flat-scratch -run-pass=si-lower-sgpr-spills,prologepilog -o - %s | FileCheck -check-prefix=FLATSCR-V2A %s
+# RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs -run-pass=prologepilog -o - %s | FileCheck -check-prefix=MUBUF-GFX90A %s
+# RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs -run-pass=si-lower-sgpr-spills,prologepilog -o - %s | FileCheck -check-prefix=MUBUF-GFX90A-V2A %s
+# RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs -amdgpu-enable-flat-scratch -run-pass=prologepilog -o - %s | FileCheck -check-prefix=FLATSCR-GFX90A %s
+# RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs -amdgpu-enable-flat-scratch -run-pass=si-lower-sgpr-spills,prologepilog -o - %s | FileCheck -check-prefix=FLATSCR-GFX90A-V2A %s
 
 ---
 name:            test_spill_v1
@@ -17,8 +21,8 @@ body:             |
   bb.0.entry:
     ; MUBUF-LABEL: name: test_spill_v1
     ; MUBUF: $vgpr0 = IMPLICIT_DEF
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %stack.0, addrspace 5)
-    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %stack.0, addrspace 5)
+    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0, addrspace 5)
     ; MUBUF: S_ENDPGM 0
     ; MUBUF-V2A-LABEL: name: test_spill_v1
     ; MUBUF-V2A: liveins: $agpr0
@@ -28,8 +32,8 @@ body:             |
     ; MUBUF-V2A: S_ENDPGM 0
     ; FLATSCR-LABEL: name: test_spill_v1
     ; FLATSCR: $vgpr0 = IMPLICIT_DEF
-    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into %stack.0, addrspace 5)
-    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0, addrspace 5)
+    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into %stack.0, addrspace 5)
+    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0, addrspace 5)
     ; FLATSCR: S_ENDPGM 0
     ; FLATSCR-V2A-LABEL: name: test_spill_v1
     ; FLATSCR-V2A: liveins: $agpr0
@@ -37,6 +41,28 @@ body:             |
     ; FLATSCR-V2A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec
     ; FLATSCR-V2A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec
     ; FLATSCR-V2A: S_ENDPGM 0
+    ; MUBUF-GFX90A-LABEL: name: test_spill_v1
+    ; MUBUF-GFX90A: $vgpr0 = IMPLICIT_DEF
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %stack.0, addrspace 5)
+    ; MUBUF-GFX90A: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0, addrspace 5)
+    ; MUBUF-GFX90A: S_ENDPGM 0
+    ; MUBUF-GFX90A-V2A-LABEL: name: test_spill_v1
+    ; MUBUF-GFX90A-V2A: liveins: $agpr0
+    ; MUBUF-GFX90A-V2A: $vgpr0 = IMPLICIT_DEF
+    ; MUBUF-GFX90A-V2A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec
+    ; MUBUF-GFX90A-V2A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec
+    ; MUBUF-GFX90A-V2A: S_ENDPGM 0
+    ; FLATSCR-GFX90A-LABEL: name: test_spill_v1
+    ; FLATSCR-GFX90A: $vgpr0 = IMPLICIT_DEF
+    ; FLATSCR-GFX90A: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into %stack.0, addrspace 5)
+    ; FLATSCR-GFX90A: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0, addrspace 5)
+    ; FLATSCR-GFX90A: S_ENDPGM 0
+    ; FLATSCR-GFX90A-V2A-LABEL: name: test_spill_v1
+    ; FLATSCR-GFX90A-V2A: liveins: $agpr0
+    ; FLATSCR-GFX90A-V2A: $vgpr0 = IMPLICIT_DEF
+    ; FLATSCR-GFX90A-V2A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec
+    ; FLATSCR-GFX90A-V2A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec
+    ; FLATSCR-GFX90A-V2A: S_ENDPGM 0
     $vgpr0 = IMPLICIT_DEF
     SI_SPILL_V32_SAVE killed $vgpr0, %stack.0, $sgpr32, 0, implicit $exec :: (store 4 into %stack.0, align 4, addrspace 5)
     $vgpr0 = SI_SPILL_V32_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load 4 from %stack.0, align 4, addrspace 5)
@@ -57,10 +83,10 @@ body:             |
   bb.0.entry:
     ; MUBUF-LABEL: name: test_spill_v2
     ; MUBUF: $vgpr0_vgpr1 = IMPLICIT_DEF
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1 :: (store 4 into %stack.0, addrspace 5)
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, implicit $exec, implicit killed $vgpr0_vgpr1 :: (store 4 into %stack.0 + 4, addrspace 5)
-    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1 :: (load 4 from %stack.0, addrspace 5)
-    ; MUBUF: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1 :: (load 4 from %stack.0 + 4, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1 :: (store 4 into %stack.0, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec, implicit killed $vgpr0_vgpr1 :: (store 4 into %stack.0 + 4, addrspace 5)
+    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1 :: (load 4 from %stack.0, addrspace 5)
+    ; MUBUF: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1 :: (load 4 from %stack.0 + 4, addrspace 5)
     ; MUBUF: S_ENDPGM 0
     ; MUBUF-V2A-LABEL: name: test_spill_v2
     ; MUBUF-V2A: liveins: $agpr0, $agpr1
@@ -72,8 +98,8 @@ body:             |
     ; MUBUF-V2A: S_ENDPGM 0
     ; FLATSCR-LABEL: name: test_spill_v2
     ; FLATSCR: $vgpr0_vgpr1 = IMPLICIT_DEF
-    ; FLATSCR: SCRATCH_STORE_DWORDX2_SADDR killed $vgpr0_vgpr1, $sgpr32, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 8 into %stack.0, align 4, addrspace 5)
-    ; FLATSCR: $vgpr0_vgpr1 = SCRATCH_LOAD_DWORDX2_SADDR $sgpr32, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 8 from %stack.0, align 4, addrspace 5)
+    ; FLATSCR: SCRATCH_STORE_DWORDX2_SADDR killed $vgpr0_vgpr1, $sgpr32, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 8 into %stack.0, align 4, addrspace 5)
+    ; FLATSCR: $vgpr0_vgpr1 = SCRATCH_LOAD_DWORDX2_SADDR $sgpr32, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 8 from %stack.0, align 4, addrspace 5)
     ; FLATSCR: S_ENDPGM 0
     ; FLATSCR-V2A-LABEL: name: test_spill_v2
     ; FLATSCR-V2A: liveins: $agpr0, $agpr1
@@ -83,6 +109,34 @@ body:             |
     ; FLATSCR-V2A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1
     ; FLATSCR-V2A: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit-def $vgpr0_vgpr1
     ; FLATSCR-V2A: S_ENDPGM 0
+    ; MUBUF-GFX90A-LABEL: name: test_spill_v2
+    ; MUBUF-GFX90A: $vgpr0_vgpr1 = IMPLICIT_DEF
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1 :: (store 4 into %stack.0, addrspace 5)
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec, implicit killed $vgpr0_vgpr1 :: (store 4 into %stack.0 + 4, addrspace 5)
+    ; MUBUF-GFX90A: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1 :: (load 4 from %stack.0, addrspace 5)
+    ; MUBUF-GFX90A: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1 :: (load 4 from %stack.0 + 4, addrspace 5)
+    ; MUBUF-GFX90A: S_ENDPGM 0
+    ; MUBUF-GFX90A-V2A-LABEL: name: test_spill_v2
+    ; MUBUF-GFX90A-V2A: liveins: $agpr0, $agpr1
+    ; MUBUF-GFX90A-V2A: $vgpr0_vgpr1 = IMPLICIT_DEF
+    ; MUBUF-GFX90A-V2A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1
+    ; MUBUF-GFX90A-V2A: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit killed $vgpr0_vgpr1
+    ; MUBUF-GFX90A-V2A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1
+    ; MUBUF-GFX90A-V2A: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit-def $vgpr0_vgpr1
+    ; MUBUF-GFX90A-V2A: S_ENDPGM 0
+    ; FLATSCR-GFX90A-LABEL: name: test_spill_v2
+    ; FLATSCR-GFX90A: $vgpr0_vgpr1 = IMPLICIT_DEF
+    ; FLATSCR-GFX90A: SCRATCH_STORE_DWORDX2_SADDR killed $vgpr0_vgpr1, $sgpr32, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 8 into %stack.0, align 4, addrspace 5)
+    ; FLATSCR-GFX90A: $vgpr0_vgpr1 = SCRATCH_LOAD_DWORDX2_SADDR $sgpr32, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 8 from %stack.0, align 4, addrspace 5)
+    ; FLATSCR-GFX90A: S_ENDPGM 0
+    ; FLATSCR-GFX90A-V2A-LABEL: name: test_spill_v2
+    ; FLATSCR-GFX90A-V2A: liveins: $agpr0, $agpr1
+    ; FLATSCR-GFX90A-V2A: $vgpr0_vgpr1 = IMPLICIT_DEF
+    ; FLATSCR-GFX90A-V2A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1
+    ; FLATSCR-GFX90A-V2A: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit killed $vgpr0_vgpr1
+    ; FLATSCR-GFX90A-V2A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1
+    ; FLATSCR-GFX90A-V2A: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit-def $vgpr0_vgpr1
+    ; FLATSCR-GFX90A-V2A: S_ENDPGM 0
     $vgpr0_vgpr1 = IMPLICIT_DEF
     SI_SPILL_V64_SAVE killed $vgpr0_vgpr1, %stack.0, $sgpr32, 0, implicit $exec :: (store 8 into %stack.0, align 4, addrspace 5)
     $vgpr0_vgpr1 = SI_SPILL_V64_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load 8 from %stack.0, align 4, addrspace 5)
@@ -103,12 +157,12 @@ body:             |
   bb.0.entry:
     ; MUBUF-LABEL: name: test_spill_v3
     ; MUBUF: $vgpr0_vgpr1_vgpr2 = IMPLICIT_DEF
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2, implicit $vgpr0_vgpr1_vgpr2 :: (store 4 into %stack.0, addrspace 5)
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2 :: (store 4 into %stack.0 + 4, addrspace 5)
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2 :: (store 4 into %stack.0 + 8, addrspace 5)
-    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2 :: (load 4 from %stack.0, addrspace 5)
-    ; MUBUF: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2 :: (load 4 from %stack.0 + 4, addrspace 5)
-    ; MUBUF: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2 :: (load 4 from %stack.0 + 8, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2, implicit $vgpr0_vgpr1_vgpr2 :: (store 4 into %stack.0, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2 :: (store 4 into %stack.0 + 4, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, 0, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2 :: (store 4 into %stack.0 + 8, addrspace 5)
+    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2 :: (load 4 from %stack.0, addrspace 5)
+    ; MUBUF: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2 :: (load 4 from %stack.0 + 4, addrspace 5)
+    ; MUBUF: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2 :: (load 4 from %stack.0 + 8, addrspace 5)
     ; MUBUF: S_ENDPGM 0
     ; MUBUF-V2A-LABEL: name: test_spill_v3
     ; MUBUF-V2A: liveins: $agpr0, $agpr1, $agpr2
@@ -122,8 +176,8 @@ body:             |
     ; MUBUF-V2A: S_ENDPGM 0
     ; FLATSCR-LABEL: name: test_spill_v3
     ; FLATSCR: $vgpr0_vgpr1_vgpr2 = IMPLICIT_DEF
-    ; FLATSCR: SCRATCH_STORE_DWORDX3_SADDR killed $vgpr0_vgpr1_vgpr2, $sgpr32, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 12 into %stack.0, align 4, addrspace 5)
-    ; FLATSCR: $vgpr0_vgpr1_vgpr2 = SCRATCH_LOAD_DWORDX3_SADDR $sgpr32, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 12 from %stack.0, align 4, addrspace 5)
+    ; FLATSCR: SCRATCH_STORE_DWORDX3_SADDR killed $vgpr0_vgpr1_vgpr2, $sgpr32, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 12 into %stack.0, align 4, addrspace 5)
+    ; FLATSCR: $vgpr0_vgpr1_vgpr2 = SCRATCH_LOAD_DWORDX3_SADDR $sgpr32, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 12 from %stack.0, align 4, addrspace 5)
     ; FLATSCR: S_ENDPGM 0
     ; FLATSCR-V2A-LABEL: name: test_spill_v3
     ; FLATSCR-V2A: liveins: $agpr0, $agpr1, $agpr2
@@ -135,6 +189,40 @@ body:             |
     ; FLATSCR-V2A: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2
     ; FLATSCR-V2A: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2
     ; FLATSCR-V2A: S_ENDPGM 0
+    ; MUBUF-GFX90A-LABEL: name: test_spill_v3
+    ; MUBUF-GFX90A: $vgpr0_vgpr1_vgpr2 = IMPLICIT_DEF
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2, implicit $vgpr0_vgpr1_vgpr2 :: (store 4 into %stack.0, addrspace 5)
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2 :: (store 4 into %stack.0 + 4, addrspace 5)
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, 0, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2 :: (store 4 into %stack.0 + 8, addrspace 5)
+    ; MUBUF-GFX90A: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2 :: (load 4 from %stack.0, addrspace 5)
+    ; MUBUF-GFX90A: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2 :: (load 4 from %stack.0 + 4, addrspace 5)
+    ; MUBUF-GFX90A: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2 :: (load 4 from %stack.0 + 8, addrspace 5)
+    ; MUBUF-GFX90A: S_ENDPGM 0
+    ; MUBUF-GFX90A-V2A-LABEL: name: test_spill_v3
+    ; MUBUF-GFX90A-V2A: liveins: $agpr0, $agpr1, $agpr2
+    ; MUBUF-GFX90A-V2A: $vgpr0_vgpr1_vgpr2 = IMPLICIT_DEF
+    ; MUBUF-GFX90A-V2A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2, implicit $vgpr0_vgpr1_vgpr2
+    ; MUBUF-GFX90A-V2A: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2
+    ; MUBUF-GFX90A-V2A: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2
+    ; MUBUF-GFX90A-V2A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2
+    ; MUBUF-GFX90A-V2A: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2
+    ; MUBUF-GFX90A-V2A: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2
+    ; MUBUF-GFX90A-V2A: S_ENDPGM 0
+    ; FLATSCR-GFX90A-LABEL: name: test_spill_v3
+    ; FLATSCR-GFX90A: $vgpr0_vgpr1_vgpr2 = IMPLICIT_DEF
+    ; FLATSCR-GFX90A: SCRATCH_STORE_DWORDX3_SADDR killed $vgpr0_vgpr1_vgpr2, $sgpr32, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 12 into %stack.0, align 4, addrspace 5)
+    ; FLATSCR-GFX90A: $vgpr0_vgpr1_vgpr2 = SCRATCH_LOAD_DWORDX3_SADDR $sgpr32, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 12 from %stack.0, align 4, addrspace 5)
+    ; FLATSCR-GFX90A: S_ENDPGM 0
+    ; FLATSCR-GFX90A-V2A-LABEL: name: test_spill_v3
+    ; FLATSCR-GFX90A-V2A: liveins: $agpr0, $agpr1, $agpr2
+    ; FLATSCR-GFX90A-V2A: $vgpr0_vgpr1_vgpr2 = IMPLICIT_DEF
+    ; FLATSCR-GFX90A-V2A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2, implicit $vgpr0_vgpr1_vgpr2
+    ; FLATSCR-GFX90A-V2A: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2
+    ; FLATSCR-GFX90A-V2A: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2
+    ; FLATSCR-GFX90A-V2A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2
+    ; FLATSCR-GFX90A-V2A: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2
+    ; FLATSCR-GFX90A-V2A: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2
+    ; FLATSCR-GFX90A-V2A: S_ENDPGM 0
     $vgpr0_vgpr1_vgpr2 = IMPLICIT_DEF
     SI_SPILL_V96_SAVE killed $vgpr0_vgpr1_vgpr2, %stack.0, $sgpr32, 0, implicit $exec :: (store 12 into %stack.0, align 4, addrspace 5)
     $vgpr0_vgpr1_vgpr2 = SI_SPILL_V96_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load 12 from %stack.0, align 4, addrspace 5)
@@ -155,14 +243,14 @@ body:             |
   bb.0.entry:
     ; MUBUF-LABEL: name: test_spill_v4
     ; MUBUF: $vgpr0_vgpr1_vgpr2_vgpr3 = IMPLICIT_DEF
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr0_vgpr1_vgpr2_vgpr3 :: (store 4 into %stack.0, addrspace 5)
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 :: (store 4 into %stack.0 + 4, addrspace 5)
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 :: (store 4 into %stack.0 + 8, addrspace 5)
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3 :: (store 4 into %stack.0 + 12, addrspace 5)
-    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3 :: (load 4 from %stack.0, addrspace 5)
-    ; MUBUF: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3 :: (load 4 from %stack.0 + 4, addrspace 5)
-    ; MUBUF: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3 :: (load 4 from %stack.0 + 8, addrspace 5)
-    ; MUBUF: $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3 :: (load 4 from %stack.0 + 12, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr0_vgpr1_vgpr2_vgpr3 :: (store 4 into %stack.0, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 :: (store 4 into %stack.0 + 4, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 :: (store 4 into %stack.0 + 8, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, 0, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3 :: (store 4 into %stack.0 + 12, addrspace 5)
+    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3 :: (load 4 from %stack.0, addrspace 5)
+    ; MUBUF: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3 :: (load 4 from %stack.0 + 4, addrspace 5)
+    ; MUBUF: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3 :: (load 4 from %stack.0 + 8, addrspace 5)
+    ; MUBUF: $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3 :: (load 4 from %stack.0 + 12, addrspace 5)
     ; MUBUF: S_ENDPGM 0
     ; MUBUF-V2A-LABEL: name: test_spill_v4
     ; MUBUF-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3
@@ -178,8 +266,8 @@ body:             |
     ; MUBUF-V2A: S_ENDPGM 0
     ; FLATSCR-LABEL: name: test_spill_v4
     ; FLATSCR: $vgpr0_vgpr1_vgpr2_vgpr3 = IMPLICIT_DEF
-    ; FLATSCR: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr32, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 16 into %stack.0, align 4, addrspace 5)
-    ; FLATSCR: $vgpr0_vgpr1_vgpr2_vgpr3 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 16 from %stack.0, align 4, addrspace 5)
+    ; FLATSCR: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr32, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 16 into %stack.0, align 4, addrspace 5)
+    ; FLATSCR: $vgpr0_vgpr1_vgpr2_vgpr3 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 16 from %stack.0, align 4, addrspace 5)
     ; FLATSCR: S_ENDPGM 0
     ; FLATSCR-V2A-LABEL: name: test_spill_v4
     ; FLATSCR-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3
@@ -193,6 +281,46 @@ body:             |
     ; FLATSCR-V2A: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3
     ; FLATSCR-V2A: $vgpr3 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3
     ; FLATSCR-V2A: S_ENDPGM 0
+    ; MUBUF-GFX90A-LABEL: name: test_spill_v4
+    ; MUBUF-GFX90A: $vgpr0_vgpr1_vgpr2_vgpr3 = IMPLICIT_DEF
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr0_vgpr1_vgpr2_vgpr3 :: (store 4 into %stack.0, addrspace 5)
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 :: (store 4 into %stack.0 + 4, addrspace 5)
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 :: (store 4 into %stack.0 + 8, addrspace 5)
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, 0, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3 :: (store 4 into %stack.0 + 12, addrspace 5)
+    ; MUBUF-GFX90A: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3 :: (load 4 from %stack.0, addrspace 5)
+    ; MUBUF-GFX90A: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3 :: (load 4 from %stack.0 + 4, addrspace 5)
+    ; MUBUF-GFX90A: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3 :: (load 4 from %stack.0 + 8, addrspace 5)
+    ; MUBUF-GFX90A: $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3 :: (load 4 from %stack.0 + 12, addrspace 5)
+    ; MUBUF-GFX90A: S_ENDPGM 0
+    ; MUBUF-GFX90A-V2A-LABEL: name: test_spill_v4
+    ; MUBUF-GFX90A-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3
+    ; MUBUF-GFX90A-V2A: $vgpr0_vgpr1_vgpr2_vgpr3 = IMPLICIT_DEF
+    ; MUBUF-GFX90A-V2A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr0_vgpr1_vgpr2_vgpr3
+    ; MUBUF-GFX90A-V2A: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3
+    ; MUBUF-GFX90A-V2A: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3
+    ; MUBUF-GFX90A-V2A: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr3, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3
+    ; MUBUF-GFX90A-V2A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3
+    ; MUBUF-GFX90A-V2A: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3
+    ; MUBUF-GFX90A-V2A: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3
+    ; MUBUF-GFX90A-V2A: $vgpr3 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3
+    ; MUBUF-GFX90A-V2A: S_ENDPGM 0
+    ; FLATSCR-GFX90A-LABEL: name: test_spill_v4
+    ; FLATSCR-GFX90A: $vgpr0_vgpr1_vgpr2_vgpr3 = IMPLICIT_DEF
+    ; FLATSCR-GFX90A: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr32, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 16 into %stack.0, align 4, addrspace 5)
+    ; FLATSCR-GFX90A: $vgpr0_vgpr1_vgpr2_vgpr3 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 16 from %stack.0, align 4, addrspace 5)
+    ; FLATSCR-GFX90A: S_ENDPGM 0
+    ; FLATSCR-GFX90A-V2A-LABEL: name: test_spill_v4
+    ; FLATSCR-GFX90A-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3
+    ; FLATSCR-GFX90A-V2A: $vgpr0_vgpr1_vgpr2_vgpr3 = IMPLICIT_DEF
+    ; FLATSCR-GFX90A-V2A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr0_vgpr1_vgpr2_vgpr3
+    ; FLATSCR-GFX90A-V2A: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3
+    ; FLATSCR-GFX90A-V2A: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3
+    ; FLATSCR-GFX90A-V2A: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr3, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3
+    ; FLATSCR-GFX90A-V2A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3
+    ; FLATSCR-GFX90A-V2A: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3
+    ; FLATSCR-GFX90A-V2A: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3
+    ; FLATSCR-GFX90A-V2A: $vgpr3 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3
+    ; FLATSCR-GFX90A-V2A: S_ENDPGM 0
     $vgpr0_vgpr1_vgpr2_vgpr3 = IMPLICIT_DEF
     SI_SPILL_V128_SAVE killed $vgpr0_vgpr1_vgpr2_vgpr3, %stack.0, $sgpr32, 0, implicit $exec :: (store 16 into %stack.0, align 4, addrspace 5)
     $vgpr0_vgpr1_vgpr2_vgpr3 = SI_SPILL_V128_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load 16 from %stack.0, align 4, addrspace 5)
@@ -213,16 +341,16 @@ body:             |
   bb.0.entry:
     ; MUBUF-LABEL: name: test_spill_v5
     ; MUBUF: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 = IMPLICIT_DEF
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (store 4 into %stack.0, addrspace 5)
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (store 4 into %stack.0 + 4, addrspace 5)
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (store 4 into %stack.0 + 8, addrspace 5)
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (store 4 into %stack.0 + 12, addrspace 5)
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, 0, 0, 0, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (store 4 into %stack.0 + 16, addrspace 5)
-    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (load 4 from %stack.0, addrspace 5)
-    ; MUBUF: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (load 4 from %stack.0 + 4, addrspace 5)
-    ; MUBUF: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (load 4 from %stack.0 + 8, addrspace 5)
-    ; MUBUF: $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (load 4 from %stack.0 + 12, addrspace 5)
-    ; MUBUF: $vgpr4 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (load 4 from %stack.0 + 16, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (store 4 into %stack.0, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (store 4 into %stack.0 + 4, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (store 4 into %stack.0 + 8, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (store 4 into %stack.0 + 12, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, 0, 0, 0, 0, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (store 4 into %stack.0 + 16, addrspace 5)
+    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (load 4 from %stack.0, addrspace 5)
+    ; MUBUF: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (load 4 from %stack.0 + 4, addrspace 5)
+    ; MUBUF: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (load 4 from %stack.0 + 8, addrspace 5)
+    ; MUBUF: $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (load 4 from %stack.0 + 12, addrspace 5)
+    ; MUBUF: $vgpr4 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (load 4 from %stack.0 + 16, addrspace 5)
     ; MUBUF: S_ENDPGM 0
     ; MUBUF-V2A-LABEL: name: test_spill_v5
     ; MUBUF-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3, $agpr4
@@ -240,10 +368,10 @@ body:             |
     ; MUBUF-V2A: S_ENDPGM 0
     ; FLATSCR-LABEL: name: test_spill_v5
     ; FLATSCR: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 = IMPLICIT_DEF
-    ; FLATSCR: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr32, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (store 16 into %stack.0, align 4, addrspace 5)
-    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr4, $sgpr32, 16, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (store 4 into %stack.0 + 16, addrspace 5)
-    ; FLATSCR: $vgpr0_vgpr1_vgpr2_vgpr3 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (load 16 from %stack.0, align 4, addrspace 5)
-    ; FLATSCR: $vgpr4 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 16, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (load 4 from %stack.0 + 16, addrspace 5)
+    ; FLATSCR: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr32, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (store 16 into %stack.0, align 4, addrspace 5)
+    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr4, $sgpr32, 16, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (store 4 into %stack.0 + 16, addrspace 5)
+    ; FLATSCR: $vgpr0_vgpr1_vgpr2_vgpr3 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (load 16 from %stack.0, align 4, addrspace 5)
+    ; FLATSCR: $vgpr4 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 16, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (load 4 from %stack.0 + 16, addrspace 5)
     ; FLATSCR: S_ENDPGM 0
     ; FLATSCR-V2A-LABEL: name: test_spill_v5
     ; FLATSCR-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3, $agpr4
@@ -259,6 +387,54 @@ body:             |
     ; FLATSCR-V2A: $vgpr3 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4
     ; FLATSCR-V2A: $vgpr4 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4
     ; FLATSCR-V2A: S_ENDPGM 0
+    ; MUBUF-GFX90A-LABEL: name: test_spill_v5
+    ; MUBUF-GFX90A: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 = IMPLICIT_DEF
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (store 4 into %stack.0, addrspace 5)
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (store 4 into %stack.0 + 4, addrspace 5)
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (store 4 into %stack.0 + 8, addrspace 5)
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (store 4 into %stack.0 + 12, addrspace 5)
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, 0, 0, 0, 0, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (store 4 into %stack.0 + 16, addrspace 5)
+    ; MUBUF-GFX90A: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (load 4 from %stack.0, addrspace 5)
+    ; MUBUF-GFX90A: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (load 4 from %stack.0 + 4, addrspace 5)
+    ; MUBUF-GFX90A: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (load 4 from %stack.0 + 8, addrspace 5)
+    ; MUBUF-GFX90A: $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (load 4 from %stack.0 + 12, addrspace 5)
+    ; MUBUF-GFX90A: $vgpr4 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (load 4 from %stack.0 + 16, addrspace 5)
+    ; MUBUF-GFX90A: S_ENDPGM 0
+    ; MUBUF-GFX90A-V2A-LABEL: name: test_spill_v5
+    ; MUBUF-GFX90A-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3, $agpr4
+    ; MUBUF-GFX90A-V2A: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 = IMPLICIT_DEF
+    ; MUBUF-GFX90A-V2A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4
+    ; MUBUF-GFX90A-V2A: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4
+    ; MUBUF-GFX90A-V2A: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4
+    ; MUBUF-GFX90A-V2A: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr3, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4
+    ; MUBUF-GFX90A-V2A: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr4, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4
+    ; MUBUF-GFX90A-V2A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4
+    ; MUBUF-GFX90A-V2A: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4
+    ; MUBUF-GFX90A-V2A: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4
+    ; MUBUF-GFX90A-V2A: $vgpr3 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4
+    ; MUBUF-GFX90A-V2A: $vgpr4 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4
+    ; MUBUF-GFX90A-V2A: S_ENDPGM 0
+    ; FLATSCR-GFX90A-LABEL: name: test_spill_v5
+    ; FLATSCR-GFX90A: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 = IMPLICIT_DEF
+    ; FLATSCR-GFX90A: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr32, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (store 16 into %stack.0, align 4, addrspace 5)
+    ; FLATSCR-GFX90A: SCRATCH_STORE_DWORD_SADDR killed $vgpr4, $sgpr32, 16, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (store 4 into %stack.0 + 16, addrspace 5)
+    ; FLATSCR-GFX90A: $vgpr0_vgpr1_vgpr2_vgpr3 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (load 16 from %stack.0, align 4, addrspace 5)
+    ; FLATSCR-GFX90A: $vgpr4 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 16, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (load 4 from %stack.0 + 16, addrspace 5)
+    ; FLATSCR-GFX90A: S_ENDPGM 0
+    ; FLATSCR-GFX90A-V2A-LABEL: name: test_spill_v5
+    ; FLATSCR-GFX90A-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3, $agpr4
+    ; FLATSCR-GFX90A-V2A: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 = IMPLICIT_DEF
+    ; FLATSCR-GFX90A-V2A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4
+    ; FLATSCR-GFX90A-V2A: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4
+    ; FLATSCR-GFX90A-V2A: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4
+    ; FLATSCR-GFX90A-V2A: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr3, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4
+    ; FLATSCR-GFX90A-V2A: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr4, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4
+    ; FLATSCR-GFX90A-V2A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4
+    ; FLATSCR-GFX90A-V2A: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4
+    ; FLATSCR-GFX90A-V2A: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4
+    ; FLATSCR-GFX90A-V2A: $vgpr3 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4
+    ; FLATSCR-GFX90A-V2A: $vgpr4 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4
+    ; FLATSCR-GFX90A-V2A: S_ENDPGM 0
     $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 = IMPLICIT_DEF
     SI_SPILL_V160_SAVE killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4, %stack.0, $sgpr32, 0, implicit $exec :: (store 20 into %stack.0, align 4, addrspace 5)
     $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 = SI_SPILL_V160_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load 20 from %stack.0, align 4, addrspace 5)
@@ -279,18 +455,18 @@ body:             |
   bb.0.entry:
     ; MUBUF-LABEL: name: test_spill_v6
     ; MUBUF: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 = IMPLICIT_DEF
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (store 4 into %stack.0, addrspace 5)
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (store 4 into %stack.0 + 4, addrspace 5)
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (store 4 into %stack.0 + 8, addrspace 5)
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (store 4 into %stack.0 + 12, addrspace 5)
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (store 4 into %stack.0 + 16, addrspace 5)
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, 0, 0, 0, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (store 4 into %stack.0 + 20, addrspace 5)
-    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (load 4 from %stack.0, addrspace 5)
-    ; MUBUF: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (load 4 from %stack.0 + 4, addrspace 5)
-    ; MUBUF: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (load 4 from %stack.0 + 8, addrspace 5)
-    ; MUBUF: $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (load 4 from %stack.0 + 12, addrspace 5)
-    ; MUBUF: $vgpr4 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (load 4 from %stack.0 + 16, addrspace 5)
-    ; MUBUF: $vgpr5 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (load 4 from %stack.0 + 20, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (store 4 into %stack.0, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (store 4 into %stack.0 + 4, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (store 4 into %stack.0 + 8, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (store 4 into %stack.0 + 12, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (store 4 into %stack.0 + 16, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, 0, 0, 0, 0, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (store 4 into %stack.0 + 20, addrspace 5)
+    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (load 4 from %stack.0, addrspace 5)
+    ; MUBUF: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (load 4 from %stack.0 + 4, addrspace 5)
+    ; MUBUF: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (load 4 from %stack.0 + 8, addrspace 5)
+    ; MUBUF: $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (load 4 from %stack.0 + 12, addrspace 5)
+    ; MUBUF: $vgpr4 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (load 4 from %stack.0 + 16, addrspace 5)
+    ; MUBUF: $vgpr5 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (load 4 from %stack.0 + 20, addrspace 5)
     ; MUBUF: S_ENDPGM 0
     ; MUBUF-V2A-LABEL: name: test_spill_v6
     ; MUBUF-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3, $agpr4, $agpr5
@@ -310,10 +486,10 @@ body:             |
     ; MUBUF-V2A: S_ENDPGM 0
     ; FLATSCR-LABEL: name: test_spill_v6
     ; FLATSCR: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 = IMPLICIT_DEF
-    ; FLATSCR: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr32, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (store 16 into %stack.0, align 4, addrspace 5)
-    ; FLATSCR: SCRATCH_STORE_DWORDX2_SADDR killed $vgpr4_vgpr5, $sgpr32, 16, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (store 8 into %stack.0 + 16, align 4, addrspace 5)
-    ; FLATSCR: $vgpr0_vgpr1_vgpr2_vgpr3 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (load 16 from %stack.0, align 4, addrspace 5)
-    ; FLATSCR: $vgpr4_vgpr5 = SCRATCH_LOAD_DWORDX2_SADDR $sgpr32, 16, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (load 8 from %stack.0 + 16, align 4, addrspace 5)
+    ; FLATSCR: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr32, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (store 16 into %stack.0, align 4, addrspace 5)
+    ; FLATSCR: SCRATCH_STORE_DWORDX2_SADDR killed $vgpr4_vgpr5, $sgpr32, 16, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (store 8 into %stack.0 + 16, align 4, addrspace 5)
+    ; FLATSCR: $vgpr0_vgpr1_vgpr2_vgpr3 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (load 16 from %stack.0, align 4, addrspace 5)
+    ; FLATSCR: $vgpr4_vgpr5 = SCRATCH_LOAD_DWORDX2_SADDR $sgpr32, 16, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (load 8 from %stack.0 + 16, align 4, addrspace 5)
     ; FLATSCR: S_ENDPGM 0
     ; FLATSCR-V2A-LABEL: name: test_spill_v6
     ; FLATSCR-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3, $agpr4, $agpr5
@@ -331,6 +507,60 @@ body:             |
     ; FLATSCR-V2A: $vgpr4 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5
     ; FLATSCR-V2A: $vgpr5 = V_ACCVGPR_READ_B32_e64 $agpr5, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5
     ; FLATSCR-V2A: S_ENDPGM 0
+    ; MUBUF-GFX90A-LABEL: name: test_spill_v6
+    ; MUBUF-GFX90A: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 = IMPLICIT_DEF
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (store 4 into %stack.0, addrspace 5)
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (store 4 into %stack.0 + 4, addrspace 5)
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (store 4 into %stack.0 + 8, addrspace 5)
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (store 4 into %stack.0 + 12, addrspace 5)
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (store 4 into %stack.0 + 16, addrspace 5)
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, 0, 0, 0, 0, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (store 4 into %stack.0 + 20, addrspace 5)
+    ; MUBUF-GFX90A: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (load 4 from %stack.0, addrspace 5)
+    ; MUBUF-GFX90A: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (load 4 from %stack.0 + 4, addrspace 5)
+    ; MUBUF-GFX90A: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (load 4 from %stack.0 + 8, addrspace 5)
+    ; MUBUF-GFX90A: $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (load 4 from %stack.0 + 12, addrspace 5)
+    ; MUBUF-GFX90A: $vgpr4 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (load 4 from %stack.0 + 16, addrspace 5)
+    ; MUBUF-GFX90A: $vgpr5 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (load 4 from %stack.0 + 20, addrspace 5)
+    ; MUBUF-GFX90A: S_ENDPGM 0
+    ; MUBUF-GFX90A-V2A-LABEL: name: test_spill_v6
+    ; MUBUF-GFX90A-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3, $agpr4, $agpr5
+    ; MUBUF-GFX90A-V2A: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 = IMPLICIT_DEF
+    ; MUBUF-GFX90A-V2A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5
+    ; MUBUF-GFX90A-V2A: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5
+    ; MUBUF-GFX90A-V2A: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5
+    ; MUBUF-GFX90A-V2A: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr3, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5
+    ; MUBUF-GFX90A-V2A: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr4, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5
+    ; MUBUF-GFX90A-V2A: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr5, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5
+    ; MUBUF-GFX90A-V2A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5
+    ; MUBUF-GFX90A-V2A: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5
+    ; MUBUF-GFX90A-V2A: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5
+    ; MUBUF-GFX90A-V2A: $vgpr3 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5
+    ; MUBUF-GFX90A-V2A: $vgpr4 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5
+    ; MUBUF-GFX90A-V2A: $vgpr5 = V_ACCVGPR_READ_B32_e64 $agpr5, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5
+    ; MUBUF-GFX90A-V2A: S_ENDPGM 0
+    ; FLATSCR-GFX90A-LABEL: name: test_spill_v6
+    ; FLATSCR-GFX90A: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 = IMPLICIT_DEF
+    ; FLATSCR-GFX90A: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr32, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (store 16 into %stack.0, align 4, addrspace 5)
+    ; FLATSCR-GFX90A: SCRATCH_STORE_DWORDX2_SADDR killed $vgpr4_vgpr5, $sgpr32, 16, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (store 8 into %stack.0 + 16, align 4, addrspace 5)
+    ; FLATSCR-GFX90A: $vgpr0_vgpr1_vgpr2_vgpr3 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (load 16 from %stack.0, align 4, addrspace 5)
+    ; FLATSCR-GFX90A: $vgpr4_vgpr5 = SCRATCH_LOAD_DWORDX2_SADDR $sgpr32, 16, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (load 8 from %stack.0 + 16, align 4, addrspace 5)
+    ; FLATSCR-GFX90A: S_ENDPGM 0
+    ; FLATSCR-GFX90A-V2A-LABEL: name: test_spill_v6
+    ; FLATSCR-GFX90A-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3, $agpr4, $agpr5
+    ; FLATSCR-GFX90A-V2A: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 = IMPLICIT_DEF
+    ; FLATSCR-GFX90A-V2A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5
+    ; FLATSCR-GFX90A-V2A: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5
+    ; FLATSCR-GFX90A-V2A: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5
+    ; FLATSCR-GFX90A-V2A: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr3, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5
+    ; FLATSCR-GFX90A-V2A: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr4, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5
+    ; FLATSCR-GFX90A-V2A: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr5, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5
+    ; FLATSCR-GFX90A-V2A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5
+    ; FLATSCR-GFX90A-V2A: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5
+    ; FLATSCR-GFX90A-V2A: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5
+    ; FLATSCR-GFX90A-V2A: $vgpr3 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5
+    ; FLATSCR-GFX90A-V2A: $vgpr4 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5
+    ; FLATSCR-GFX90A-V2A: $vgpr5 = V_ACCVGPR_READ_B32_e64 $agpr5, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5
+    ; FLATSCR-GFX90A-V2A: S_ENDPGM 0
     $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 = IMPLICIT_DEF
     SI_SPILL_V192_SAVE killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5, %stack.0, $sgpr32, 0, implicit $exec :: (store 24 into %stack.0, align 4, addrspace 5)
     $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 = SI_SPILL_V192_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load 24 from %stack.0, align 4, addrspace 5)
@@ -351,22 +581,22 @@ body:             |
   bb.0.entry:
     ; MUBUF-LABEL: name: test_spill_v8
     ; MUBUF: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = IMPLICIT_DEF
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (store 4 into %stack.0, addrspace 5)
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (store 4 into %stack.0 + 4, addrspace 5)
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (store 4 into %stack.0 + 8, addrspace 5)
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (store 4 into %stack.0 + 12, addrspace 5)
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (store 4 into %stack.0 + 16, addrspace 5)
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (store 4 into %stack.0 + 20, addrspace 5)
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (store 4 into %stack.0 + 24, addrspace 5)
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr7, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, 0, 0, 0, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (store 4 into %stack.0 + 28, addrspace 5)
-    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (load 4 from %stack.0, addrspace 5)
-    ; MUBUF: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (load 4 from %stack.0 + 4, addrspace 5)
-    ; MUBUF: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (load 4 from %stack.0 + 8, addrspace 5)
-    ; MUBUF: $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (load 4 from %stack.0 + 12, addrspace 5)
-    ; MUBUF: $vgpr4 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (load 4 from %stack.0 + 16, addrspace 5)
-    ; MUBUF: $vgpr5 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (load 4 from %stack.0 + 20, addrspace 5)
-    ; MUBUF: $vgpr6 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (load 4 from %stack.0 + 24, addrspace 5)
-    ; MUBUF: $vgpr7 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (load 4 from %stack.0 + 28, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (store 4 into %stack.0, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (store 4 into %stack.0 + 4, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (store 4 into %stack.0 + 8, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (store 4 into %stack.0 + 12, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (store 4 into %stack.0 + 16, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (store 4 into %stack.0 + 20, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (store 4 into %stack.0 + 24, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr7, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, 0, 0, 0, 0, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (store 4 into %stack.0 + 28, addrspace 5)
+    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (load 4 from %stack.0, addrspace 5)
+    ; MUBUF: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (load 4 from %stack.0 + 4, addrspace 5)
+    ; MUBUF: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (load 4 from %stack.0 + 8, addrspace 5)
+    ; MUBUF: $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (load 4 from %stack.0 + 12, addrspace 5)
+    ; MUBUF: $vgpr4 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (load 4 from %stack.0 + 16, addrspace 5)
+    ; MUBUF: $vgpr5 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (load 4 from %stack.0 + 20, addrspace 5)
+    ; MUBUF: $vgpr6 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (load 4 from %stack.0 + 24, addrspace 5)
+    ; MUBUF: $vgpr7 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (load 4 from %stack.0 + 28, addrspace 5)
     ; MUBUF: S_ENDPGM 0
     ; MUBUF-V2A-LABEL: name: test_spill_v8
     ; MUBUF-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3, $agpr4, $agpr5, $agpr6, $agpr7
@@ -390,10 +620,10 @@ body:             |
     ; MUBUF-V2A: S_ENDPGM 0
     ; FLATSCR-LABEL: name: test_spill_v8
     ; FLATSCR: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = IMPLICIT_DEF
-    ; FLATSCR: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr32, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (store 16 into %stack.0, align 4, addrspace 5)
-    ; FLATSCR: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr4_vgpr5_vgpr6_vgpr7, $sgpr32, 16, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (store 16 into %stack.0 + 16, align 4, addrspace 5)
-    ; FLATSCR: $vgpr0_vgpr1_vgpr2_vgpr3 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (load 16 from %stack.0, align 4, addrspace 5)
-    ; FLATSCR: $vgpr4_vgpr5_vgpr6_vgpr7 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 16, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (load 16 from %stack.0 + 16, align 4, addrspace 5)
+    ; FLATSCR: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr32, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (store 16 into %stack.0, align 4, addrspace 5)
+    ; FLATSCR: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr4_vgpr5_vgpr6_vgpr7, $sgpr32, 16, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (store 16 into %stack.0 + 16, align 4, addrspace 5)
+    ; FLATSCR: $vgpr0_vgpr1_vgpr2_vgpr3 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (load 16 from %stack.0, align 4, addrspace 5)
+    ; FLATSCR: $vgpr4_vgpr5_vgpr6_vgpr7 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 16, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (load 16 from %stack.0 + 16, align 4, addrspace 5)
     ; FLATSCR: S_ENDPGM 0
     ; FLATSCR-V2A-LABEL: name: test_spill_v8
     ; FLATSCR-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3, $agpr4, $agpr5, $agpr6, $agpr7
@@ -415,6 +645,72 @@ body:             |
     ; FLATSCR-V2A: $vgpr6 = V_ACCVGPR_READ_B32_e64 $agpr6, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
     ; FLATSCR-V2A: $vgpr7 = V_ACCVGPR_READ_B32_e64 $agpr7, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
     ; FLATSCR-V2A: S_ENDPGM 0
+    ; MUBUF-GFX90A-LABEL: name: test_spill_v8
+    ; MUBUF-GFX90A: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = IMPLICIT_DEF
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (store 4 into %stack.0, addrspace 5)
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (store 4 into %stack.0 + 4, addrspace 5)
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (store 4 into %stack.0 + 8, addrspace 5)
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (store 4 into %stack.0 + 12, addrspace 5)
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (store 4 into %stack.0 + 16, addrspace 5)
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (store 4 into %stack.0 + 20, addrspace 5)
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (store 4 into %stack.0 + 24, addrspace 5)
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr7, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, 0, 0, 0, 0, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (store 4 into %stack.0 + 28, addrspace 5)
+    ; MUBUF-GFX90A: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (load 4 from %stack.0, addrspace 5)
+    ; MUBUF-GFX90A: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (load 4 from %stack.0 + 4, addrspace 5)
+    ; MUBUF-GFX90A: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (load 4 from %stack.0 + 8, addrspace 5)
+    ; MUBUF-GFX90A: $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (load 4 from %stack.0 + 12, addrspace 5)
+    ; MUBUF-GFX90A: $vgpr4 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (load 4 from %stack.0 + 16, addrspace 5)
+    ; MUBUF-GFX90A: $vgpr5 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (load 4 from %stack.0 + 20, addrspace 5)
+    ; MUBUF-GFX90A: $vgpr6 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (load 4 from %stack.0 + 24, addrspace 5)
+    ; MUBUF-GFX90A: $vgpr7 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (load 4 from %stack.0 + 28, addrspace 5)
+    ; MUBUF-GFX90A: S_ENDPGM 0
+    ; MUBUF-GFX90A-V2A-LABEL: name: test_spill_v8
+    ; MUBUF-GFX90A-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3, $agpr4, $agpr5, $agpr6, $agpr7
+    ; MUBUF-GFX90A-V2A: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = IMPLICIT_DEF
+    ; MUBUF-GFX90A-V2A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+    ; MUBUF-GFX90A-V2A: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+    ; MUBUF-GFX90A-V2A: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+    ; MUBUF-GFX90A-V2A: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr3, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+    ; MUBUF-GFX90A-V2A: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr4, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+    ; MUBUF-GFX90A-V2A: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr5, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+    ; MUBUF-GFX90A-V2A: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr6, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+    ; MUBUF-GFX90A-V2A: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr7, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+    ; MUBUF-GFX90A-V2A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+    ; MUBUF-GFX90A-V2A: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+    ; MUBUF-GFX90A-V2A: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+    ; MUBUF-GFX90A-V2A: $vgpr3 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+    ; MUBUF-GFX90A-V2A: $vgpr4 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+    ; MUBUF-GFX90A-V2A: $vgpr5 = V_ACCVGPR_READ_B32_e64 $agpr5, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+    ; MUBUF-GFX90A-V2A: $vgpr6 = V_ACCVGPR_READ_B32_e64 $agpr6, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+    ; MUBUF-GFX90A-V2A: $vgpr7 = V_ACCVGPR_READ_B32_e64 $agpr7, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+    ; MUBUF-GFX90A-V2A: S_ENDPGM 0
+    ; FLATSCR-GFX90A-LABEL: name: test_spill_v8
+    ; FLATSCR-GFX90A: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = IMPLICIT_DEF
+    ; FLATSCR-GFX90A: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr32, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (store 16 into %stack.0, align 4, addrspace 5)
+    ; FLATSCR-GFX90A: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr4_vgpr5_vgpr6_vgpr7, $sgpr32, 16, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (store 16 into %stack.0 + 16, align 4, addrspace 5)
+    ; FLATSCR-GFX90A: $vgpr0_vgpr1_vgpr2_vgpr3 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (load 16 from %stack.0, align 4, addrspace 5)
+    ; FLATSCR-GFX90A: $vgpr4_vgpr5_vgpr6_vgpr7 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 16, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (load 16 from %stack.0 + 16, align 4, addrspace 5)
+    ; FLATSCR-GFX90A: S_ENDPGM 0
+    ; FLATSCR-GFX90A-V2A-LABEL: name: test_spill_v8
+    ; FLATSCR-GFX90A-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3, $agpr4, $agpr5, $agpr6, $agpr7
+    ; FLATSCR-GFX90A-V2A: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = IMPLICIT_DEF
+    ; FLATSCR-GFX90A-V2A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+    ; FLATSCR-GFX90A-V2A: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+    ; FLATSCR-GFX90A-V2A: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+    ; FLATSCR-GFX90A-V2A: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr3, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+    ; FLATSCR-GFX90A-V2A: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr4, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+    ; FLATSCR-GFX90A-V2A: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr5, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+    ; FLATSCR-GFX90A-V2A: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr6, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+    ; FLATSCR-GFX90A-V2A: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr7, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+    ; FLATSCR-GFX90A-V2A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+    ; FLATSCR-GFX90A-V2A: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+    ; FLATSCR-GFX90A-V2A: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+    ; FLATSCR-GFX90A-V2A: $vgpr3 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+    ; FLATSCR-GFX90A-V2A: $vgpr4 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+    ; FLATSCR-GFX90A-V2A: $vgpr5 = V_ACCVGPR_READ_B32_e64 $agpr5, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+    ; FLATSCR-GFX90A-V2A: $vgpr6 = V_ACCVGPR_READ_B32_e64 $agpr6, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+    ; FLATSCR-GFX90A-V2A: $vgpr7 = V_ACCVGPR_READ_B32_e64 $agpr7, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+    ; FLATSCR-GFX90A-V2A: S_ENDPGM 0
     $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = IMPLICIT_DEF
     SI_SPILL_V256_SAVE killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, %stack.0, $sgpr32, 0, implicit $exec :: (store 32 into %stack.0, align 4, addrspace 5)
     $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = SI_SPILL_V256_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load 32 from %stack.0, align 4, addrspace 5)
@@ -435,38 +731,38 @@ body:             |
   bb.0.entry:
     ; MUBUF-LABEL: name: test_spill_v16
     ; MUBUF: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = IMPLICIT_DEF
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 4 into %stack.0, addrspace 5)
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 4 into %stack.0 + 4, addrspace 5)
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 4 into %stack.0 + 8, addrspace 5)
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 4 into %stack.0 + 12, addrspace 5)
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 4 into %stack.0 + 16, addrspace 5)
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 4 into %stack.0 + 20, addrspace 5)
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 4 into %stack.0 + 24, addrspace 5)
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr7, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 4 into %stack.0 + 28, addrspace 5)
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr8, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 32, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 4 into %stack.0 + 32, addrspace 5)
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr9, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 36, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 4 into %stack.0 + 36, addrspace 5)
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr10, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 40, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 4 into %stack.0 + 40, addrspace 5)
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr11, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 44, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 4 into %stack.0 + 44, addrspace 5)
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr12, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 48, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 4 into %stack.0 + 48, addrspace 5)
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr13, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 52, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 4 into %stack.0 + 52, addrspace 5)
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr14, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 56, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 4 into %stack.0 + 56, addrspace 5)
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr15, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 60, 0, 0, 0, 0, 0, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 4 into %stack.0 + 60, addrspace 5)
-    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 4 from %stack.0, addrspace 5)
-    ; MUBUF: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 4 from %stack.0 + 4, addrspace 5)
-    ; MUBUF: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 4 from %stack.0 + 8, addrspace 5)
-    ; MUBUF: $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 4 from %stack.0 + 12, addrspace 5)
-    ; MUBUF: $vgpr4 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 4 from %stack.0 + 16, addrspace 5)
-    ; MUBUF: $vgpr5 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 4 from %stack.0 + 20, addrspace 5)
-    ; MUBUF: $vgpr6 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 4 from %stack.0 + 24, addrspace 5)
-    ; MUBUF: $vgpr7 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 4 from %stack.0 + 28, addrspace 5)
-    ; MUBUF: $vgpr8 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 32, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 4 from %stack.0 + 32, addrspace 5)
-    ; MUBUF: $vgpr9 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 36, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 4 from %stack.0 + 36, addrspace 5)
-    ; MUBUF: $vgpr10 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 40, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 4 from %stack.0 + 40, addrspace 5)
-    ; MUBUF: $vgpr11 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 44, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 4 from %stack.0 + 44, addrspace 5)
-    ; MUBUF: $vgpr12 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 48, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 4 from %stack.0 + 48, addrspace 5)
-    ; MUBUF: $vgpr13 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 52, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 4 from %stack.0 + 52, addrspace 5)
-    ; MUBUF: $vgpr14 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 56, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 4 from %stack.0 + 56, addrspace 5)
-    ; MUBUF: $vgpr15 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 60, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 4 from %stack.0 + 60, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 4 into %stack.0, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 4 into %stack.0 + 4, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 4 into %stack.0 + 8, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 4 into %stack.0 + 12, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 4 into %stack.0 + 16, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 4 into %stack.0 + 20, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 4 into %stack.0 + 24, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr7, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 4 into %stack.0 + 28, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr8, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 32, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 4 into %stack.0 + 32, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr9, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 36, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 4 into %stack.0 + 36, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr10, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 40, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 4 into %stack.0 + 40, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr11, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 44, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 4 into %stack.0 + 44, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr12, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 48, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 4 into %stack.0 + 48, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr13, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 52, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 4 into %stack.0 + 52, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr14, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 56, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 4 into %stack.0 + 56, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr15, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 60, 0, 0, 0, 0, 0, 0, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 4 into %stack.0 + 60, addrspace 5)
+    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 4 from %stack.0, addrspace 5)
+    ; MUBUF: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 4 from %stack.0 + 4, addrspace 5)
+    ; MUBUF: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 4 from %stack.0 + 8, addrspace 5)
+    ; MUBUF: $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 4 from %stack.0 + 12, addrspace 5)
+    ; MUBUF: $vgpr4 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 4 from %stack.0 + 16, addrspace 5)
+    ; MUBUF: $vgpr5 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 4 from %stack.0 + 20, addrspace 5)
+    ; MUBUF: $vgpr6 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 4 from %stack.0 + 24, addrspace 5)
+    ; MUBUF: $vgpr7 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 4 from %stack.0 + 28, addrspace 5)
+    ; MUBUF: $vgpr8 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 32, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 4 from %stack.0 + 32, addrspace 5)
+    ; MUBUF: $vgpr9 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 36, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 4 from %stack.0 + 36, addrspace 5)
+    ; MUBUF: $vgpr10 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 40, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 4 from %stack.0 + 40, addrspace 5)
+    ; MUBUF: $vgpr11 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 44, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 4 from %stack.0 + 44, addrspace 5)
+    ; MUBUF: $vgpr12 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 48, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 4 from %stack.0 + 48, addrspace 5)
+    ; MUBUF: $vgpr13 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 52, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 4 from %stack.0 + 52, addrspace 5)
+    ; MUBUF: $vgpr14 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 56, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 4 from %stack.0 + 56, addrspace 5)
+    ; MUBUF: $vgpr15 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 60, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 4 from %stack.0 + 60, addrspace 5)
     ; MUBUF: S_ENDPGM 0
     ; MUBUF-V2A-LABEL: name: test_spill_v16
     ; MUBUF-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3, $agpr4, $agpr5, $agpr6, $agpr7, $agpr8, $agpr9, $agpr10, $agpr11, $agpr12, $agpr13, $agpr14, $agpr15
@@ -506,14 +802,14 @@ body:             |
     ; MUBUF-V2A: S_ENDPGM 0
     ; FLATSCR-LABEL: name: test_spill_v16
     ; FLATSCR: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = IMPLICIT_DEF
-    ; FLATSCR: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr32, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 16 into %stack.0, align 4, addrspace 5)
-    ; FLATSCR: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr4_vgpr5_vgpr6_vgpr7, $sgpr32, 16, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 16 into %stack.0 + 16, align 4, addrspace 5)
-    ; FLATSCR: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr8_vgpr9_vgpr10_vgpr11, $sgpr32, 32, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 16 into %stack.0 + 32, align 4, addrspace 5)
-    ; FLATSCR: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr12_vgpr13_vgpr14_vgpr15, $sgpr32, 48, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 16 into %stack.0 + 48, align 4, addrspace 5)
-    ; FLATSCR: $vgpr0_vgpr1_vgpr2_vgpr3 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 16 from %stack.0, align 4, addrspace 5)
-    ; FLATSCR: $vgpr4_vgpr5_vgpr6_vgpr7 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 16, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 16 from %stack.0 + 16, align 4, addrspace 5)
-    ; FLATSCR: $vgpr8_vgpr9_vgpr10_vgpr11 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 32, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 16 from %stack.0 + 32, align 4, addrspace 5)
-    ; FLATSCR: $vgpr12_vgpr13_vgpr14_vgpr15 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 48, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 16 from %stack.0 + 48, align 4, addrspace 5)
+    ; FLATSCR: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr32, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 16 into %stack.0, align 4, addrspace 5)
+    ; FLATSCR: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr4_vgpr5_vgpr6_vgpr7, $sgpr32, 16, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 16 into %stack.0 + 16, align 4, addrspace 5)
+    ; FLATSCR: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr8_vgpr9_vgpr10_vgpr11, $sgpr32, 32, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 16 into %stack.0 + 32, align 4, addrspace 5)
+    ; FLATSCR: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr12_vgpr13_vgpr14_vgpr15, $sgpr32, 48, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 16 into %stack.0 + 48, align 4, addrspace 5)
+    ; FLATSCR: $vgpr0_vgpr1_vgpr2_vgpr3 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 16 from %stack.0, align 4, addrspace 5)
+    ; FLATSCR: $vgpr4_vgpr5_vgpr6_vgpr7 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 16, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 16 from %stack.0 + 16, align 4, addrspace 5)
+    ; FLATSCR: $vgpr8_vgpr9_vgpr10_vgpr11 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 32, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 16 from %stack.0 + 32, align 4, addrspace 5)
+    ; FLATSCR: $vgpr12_vgpr13_vgpr14_vgpr15 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 48, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 16 from %stack.0 + 48, align 4, addrspace 5)
     ; FLATSCR: S_ENDPGM 0
     ; FLATSCR-V2A-LABEL: name: test_spill_v16
     ; FLATSCR-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3, $agpr4, $agpr5, $agpr6, $agpr7, $agpr8, $agpr9, $agpr10, $agpr11, $agpr12, $agpr13, $agpr14, $agpr15
@@ -551,6 +847,124 @@ body:             |
     ; FLATSCR-V2A: $vgpr14 = V_ACCVGPR_READ_B32_e64 $agpr14, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
     ; FLATSCR-V2A: $vgpr15 = V_ACCVGPR_READ_B32_e64 $agpr15, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
     ; FLATSCR-V2A: S_ENDPGM 0
+    ; MUBUF-GFX90A-LABEL: name: test_spill_v16
+    ; MUBUF-GFX90A: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = IMPLICIT_DEF
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 4 into %stack.0, addrspace 5)
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 4 into %stack.0 + 4, addrspace 5)
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 4 into %stack.0 + 8, addrspace 5)
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 4 into %stack.0 + 12, addrspace 5)
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 4 into %stack.0 + 16, addrspace 5)
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 4 into %stack.0 + 20, addrspace 5)
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 4 into %stack.0 + 24, addrspace 5)
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr7, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 4 into %stack.0 + 28, addrspace 5)
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr8, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 32, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 4 into %stack.0 + 32, addrspace 5)
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr9, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 36, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 4 into %stack.0 + 36, addrspace 5)
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr10, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 40, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 4 into %stack.0 + 40, addrspace 5)
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr11, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 44, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 4 into %stack.0 + 44, addrspace 5)
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr12, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 48, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 4 into %stack.0 + 48, addrspace 5)
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr13, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 52, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 4 into %stack.0 + 52, addrspace 5)
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr14, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 56, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 4 into %stack.0 + 56, addrspace 5)
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr15, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 60, 0, 0, 0, 0, 0, 0, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 4 into %stack.0 + 60, addrspace 5)
+    ; MUBUF-GFX90A: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 4 from %stack.0, addrspace 5)
+    ; MUBUF-GFX90A: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 4 from %stack.0 + 4, addrspace 5)
+    ; MUBUF-GFX90A: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 4 from %stack.0 + 8, addrspace 5)
+    ; MUBUF-GFX90A: $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 4 from %stack.0 + 12, addrspace 5)
+    ; MUBUF-GFX90A: $vgpr4 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 4 from %stack.0 + 16, addrspace 5)
+    ; MUBUF-GFX90A: $vgpr5 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 4 from %stack.0 + 20, addrspace 5)
+    ; MUBUF-GFX90A: $vgpr6 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 4 from %stack.0 + 24, addrspace 5)
+    ; MUBUF-GFX90A: $vgpr7 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 4 from %stack.0 + 28, addrspace 5)
+    ; MUBUF-GFX90A: $vgpr8 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 32, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 4 from %stack.0 + 32, addrspace 5)
+    ; MUBUF-GFX90A: $vgpr9 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 36, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 4 from %stack.0 + 36, addrspace 5)
+    ; MUBUF-GFX90A: $vgpr10 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 40, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 4 from %stack.0 + 40, addrspace 5)
+    ; MUBUF-GFX90A: $vgpr11 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 44, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 4 from %stack.0 + 44, addrspace 5)
+    ; MUBUF-GFX90A: $vgpr12 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 48, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 4 from %stack.0 + 48, addrspace 5)
+    ; MUBUF-GFX90A: $vgpr13 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 52, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 4 from %stack.0 + 52, addrspace 5)
+    ; MUBUF-GFX90A: $vgpr14 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 56, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 4 from %stack.0 + 56, addrspace 5)
+    ; MUBUF-GFX90A: $vgpr15 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 60, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 4 from %stack.0 + 60, addrspace 5)
+    ; MUBUF-GFX90A: S_ENDPGM 0
+    ; MUBUF-GFX90A-V2A-LABEL: name: test_spill_v16
+    ; MUBUF-GFX90A-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3, $agpr4, $agpr5, $agpr6, $agpr7, $agpr8, $agpr9, $agpr10, $agpr11, $agpr12, $agpr13, $agpr14, $agpr15
+    ; MUBUF-GFX90A-V2A: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = IMPLICIT_DEF
+    ; MUBUF-GFX90A-V2A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+    ; MUBUF-GFX90A-V2A: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+    ; MUBUF-GFX90A-V2A: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+    ; MUBUF-GFX90A-V2A: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr3, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+    ; MUBUF-GFX90A-V2A: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr4, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+    ; MUBUF-GFX90A-V2A: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr5, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+    ; MUBUF-GFX90A-V2A: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr6, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+    ; MUBUF-GFX90A-V2A: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr7, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+    ; MUBUF-GFX90A-V2A: $agpr8 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr8, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+    ; MUBUF-GFX90A-V2A: $agpr9 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr9, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+    ; MUBUF-GFX90A-V2A: $agpr10 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr10, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+    ; MUBUF-GFX90A-V2A: $agpr11 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr11, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+    ; MUBUF-GFX90A-V2A: $agpr12 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr12, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+    ; MUBUF-GFX90A-V2A: $agpr13 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr13, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+    ; MUBUF-GFX90A-V2A: $agpr14 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr14, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+    ; MUBUF-GFX90A-V2A: $agpr15 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr15, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+    ; MUBUF-GFX90A-V2A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+    ; MUBUF-GFX90A-V2A: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+    ; MUBUF-GFX90A-V2A: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+    ; MUBUF-GFX90A-V2A: $vgpr3 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+    ; MUBUF-GFX90A-V2A: $vgpr4 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+    ; MUBUF-GFX90A-V2A: $vgpr5 = V_ACCVGPR_READ_B32_e64 $agpr5, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+    ; MUBUF-GFX90A-V2A: $vgpr6 = V_ACCVGPR_READ_B32_e64 $agpr6, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+    ; MUBUF-GFX90A-V2A: $vgpr7 = V_ACCVGPR_READ_B32_e64 $agpr7, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+    ; MUBUF-GFX90A-V2A: $vgpr8 = V_ACCVGPR_READ_B32_e64 $agpr8, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+    ; MUBUF-GFX90A-V2A: $vgpr9 = V_ACCVGPR_READ_B32_e64 $agpr9, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+    ; MUBUF-GFX90A-V2A: $vgpr10 = V_ACCVGPR_READ_B32_e64 $agpr10, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+    ; MUBUF-GFX90A-V2A: $vgpr11 = V_ACCVGPR_READ_B32_e64 $agpr11, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+    ; MUBUF-GFX90A-V2A: $vgpr12 = V_ACCVGPR_READ_B32_e64 $agpr12, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+    ; MUBUF-GFX90A-V2A: $vgpr13 = V_ACCVGPR_READ_B32_e64 $agpr13, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+    ; MUBUF-GFX90A-V2A: $vgpr14 = V_ACCVGPR_READ_B32_e64 $agpr14, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+    ; MUBUF-GFX90A-V2A: $vgpr15 = V_ACCVGPR_READ_B32_e64 $agpr15, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+    ; MUBUF-GFX90A-V2A: S_ENDPGM 0
+    ; FLATSCR-GFX90A-LABEL: name: test_spill_v16
+    ; FLATSCR-GFX90A: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = IMPLICIT_DEF
+    ; FLATSCR-GFX90A: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr32, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 16 into %stack.0, align 4, addrspace 5)
+    ; FLATSCR-GFX90A: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr4_vgpr5_vgpr6_vgpr7, $sgpr32, 16, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 16 into %stack.0 + 16, align 4, addrspace 5)
+    ; FLATSCR-GFX90A: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr8_vgpr9_vgpr10_vgpr11, $sgpr32, 32, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 16 into %stack.0 + 32, align 4, addrspace 5)
+    ; FLATSCR-GFX90A: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr12_vgpr13_vgpr14_vgpr15, $sgpr32, 48, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 16 into %stack.0 + 48, align 4, addrspace 5)
+    ; FLATSCR-GFX90A: $vgpr0_vgpr1_vgpr2_vgpr3 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 16 from %stack.0, align 4, addrspace 5)
+    ; FLATSCR-GFX90A: $vgpr4_vgpr5_vgpr6_vgpr7 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 16, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 16 from %stack.0 + 16, align 4, addrspace 5)
+    ; FLATSCR-GFX90A: $vgpr8_vgpr9_vgpr10_vgpr11 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 32, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 16 from %stack.0 + 32, align 4, addrspace 5)
+    ; FLATSCR-GFX90A: $vgpr12_vgpr13_vgpr14_vgpr15 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 48, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 16 from %stack.0 + 48, align 4, addrspace 5)
+    ; FLATSCR-GFX90A: S_ENDPGM 0
+    ; FLATSCR-GFX90A-V2A-LABEL: name: test_spill_v16
+    ; FLATSCR-GFX90A-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3, $agpr4, $agpr5, $agpr6, $agpr7, $agpr8, $agpr9, $agpr10, $agpr11, $agpr12, $agpr13, $agpr14, $agpr15
+    ; FLATSCR-GFX90A-V2A: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = IMPLICIT_DEF
+    ; FLATSCR-GFX90A-V2A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+    ; FLATSCR-GFX90A-V2A: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+    ; FLATSCR-GFX90A-V2A: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+    ; FLATSCR-GFX90A-V2A: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr3, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+    ; FLATSCR-GFX90A-V2A: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr4, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+    ; FLATSCR-GFX90A-V2A: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr5, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+    ; FLATSCR-GFX90A-V2A: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr6, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+    ; FLATSCR-GFX90A-V2A: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr7, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+    ; FLATSCR-GFX90A-V2A: $agpr8 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr8, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+    ; FLATSCR-GFX90A-V2A: $agpr9 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr9, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+    ; FLATSCR-GFX90A-V2A: $agpr10 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr10, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+    ; FLATSCR-GFX90A-V2A: $agpr11 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr11, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+    ; FLATSCR-GFX90A-V2A: $agpr12 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr12, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+    ; FLATSCR-GFX90A-V2A: $agpr13 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr13, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+    ; FLATSCR-GFX90A-V2A: $agpr14 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr14, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+    ; FLATSCR-GFX90A-V2A: $agpr15 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr15, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+    ; FLATSCR-GFX90A-V2A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+    ; FLATSCR-GFX90A-V2A: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+    ; FLATSCR-GFX90A-V2A: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+    ; FLATSCR-GFX90A-V2A: $vgpr3 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+    ; FLATSCR-GFX90A-V2A: $vgpr4 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+    ; FLATSCR-GFX90A-V2A: $vgpr5 = V_ACCVGPR_READ_B32_e64 $agpr5, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+    ; FLATSCR-GFX90A-V2A: $vgpr6 = V_ACCVGPR_READ_B32_e64 $agpr6, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+    ; FLATSCR-GFX90A-V2A: $vgpr7 = V_ACCVGPR_READ_B32_e64 $agpr7, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+    ; FLATSCR-GFX90A-V2A: $vgpr8 = V_ACCVGPR_READ_B32_e64 $agpr8, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+    ; FLATSCR-GFX90A-V2A: $vgpr9 = V_ACCVGPR_READ_B32_e64 $agpr9, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+    ; FLATSCR-GFX90A-V2A: $vgpr10 = V_ACCVGPR_READ_B32_e64 $agpr10, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+    ; FLATSCR-GFX90A-V2A: $vgpr11 = V_ACCVGPR_READ_B32_e64 $agpr11, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+    ; FLATSCR-GFX90A-V2A: $vgpr12 = V_ACCVGPR_READ_B32_e64 $agpr12, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+    ; FLATSCR-GFX90A-V2A: $vgpr13 = V_ACCVGPR_READ_B32_e64 $agpr13, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+    ; FLATSCR-GFX90A-V2A: $vgpr14 = V_ACCVGPR_READ_B32_e64 $agpr14, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+    ; FLATSCR-GFX90A-V2A: $vgpr15 = V_ACCVGPR_READ_B32_e64 $agpr15, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+    ; FLATSCR-GFX90A-V2A: S_ENDPGM 0
     $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = IMPLICIT_DEF
     SI_SPILL_V512_SAVE killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, %stack.0, $sgpr32, 0, implicit $exec :: (store 64 into %stack.0, align 4, addrspace 5)
     $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = SI_SPILL_V512_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load 64 from %stack.0, align 4, addrspace 5)
@@ -571,70 +985,70 @@ body:             |
   bb.0.entry:
     ; MUBUF-LABEL: name: test_spill_v32
     ; MUBUF: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = IMPLICIT_DEF
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0, addrspace 5)
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 4, addrspace 5)
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 8, addrspace 5)
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 12, addrspace 5)
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 16, addrspace 5)
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 20, addrspace 5)
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 24, addrspace 5)
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr7, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 28, addrspace 5)
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr8, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 32, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 32, addrspace 5)
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr9, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 36, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 36, addrspace 5)
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr10, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 40, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 40, addrspace 5)
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr11, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 44, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 44, addrspace 5)
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr12, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 48, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 48, addrspace 5)
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr13, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 52, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 52, addrspace 5)
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr14, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 56, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 56, addrspace 5)
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr15, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 60, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 60, addrspace 5)
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr16, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 64, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 64, addrspace 5)
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr17, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 68, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 68, addrspace 5)
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr18, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 72, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 72, addrspace 5)
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr19, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 76, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 76, addrspace 5)
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr20, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 80, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 80, addrspace 5)
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr21, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 84, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 84, addrspace 5)
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr22, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 88, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 88, addrspace 5)
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr23, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 92, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 92, addrspace 5)
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr24, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 96, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 96, addrspace 5)
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr25, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 100, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 100, addrspace 5)
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr26, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 104, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 104, addrspace 5)
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr27, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 108, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 108, addrspace 5)
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr28, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 112, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 112, addrspace 5)
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr29, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 116, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 116, addrspace 5)
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr30, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 120, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 120, addrspace 5)
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr31, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 124, 0, 0, 0, 0, 0, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 124, addrspace 5)
-    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0, addrspace 5)
-    ; MUBUF: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 4, addrspace 5)
-    ; MUBUF: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 8, addrspace 5)
-    ; MUBUF: $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 12, addrspace 5)
-    ; MUBUF: $vgpr4 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 16, addrspace 5)
-    ; MUBUF: $vgpr5 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 20, addrspace 5)
-    ; MUBUF: $vgpr6 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 24, addrspace 5)
-    ; MUBUF: $vgpr7 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 28, addrspace 5)
-    ; MUBUF: $vgpr8 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 32, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 32, addrspace 5)
-    ; MUBUF: $vgpr9 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 36, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 36, addrspace 5)
-    ; MUBUF: $vgpr10 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 40, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 40, addrspace 5)
-    ; MUBUF: $vgpr11 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 44, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 44, addrspace 5)
-    ; MUBUF: $vgpr12 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 48, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 48, addrspace 5)
-    ; MUBUF: $vgpr13 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 52, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 52, addrspace 5)
-    ; MUBUF: $vgpr14 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 56, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 56, addrspace 5)
-    ; MUBUF: $vgpr15 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 60, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 60, addrspace 5)
-    ; MUBUF: $vgpr16 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 64, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 64, addrspace 5)
-    ; MUBUF: $vgpr17 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 68, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 68, addrspace 5)
-    ; MUBUF: $vgpr18 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 72, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 72, addrspace 5)
-    ; MUBUF: $vgpr19 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 76, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 76, addrspace 5)
-    ; MUBUF: $vgpr20 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 80, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 80, addrspace 5)
-    ; MUBUF: $vgpr21 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 84, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 84, addrspace 5)
-    ; MUBUF: $vgpr22 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 88, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 88, addrspace 5)
-    ; MUBUF: $vgpr23 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 92, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 92, addrspace 5)
-    ; MUBUF: $vgpr24 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 96, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 96, addrspace 5)
-    ; MUBUF: $vgpr25 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 100, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 100, addrspace 5)
-    ; MUBUF: $vgpr26 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 104, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 104, addrspace 5)
-    ; MUBUF: $vgpr27 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 108, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 108, addrspace 5)
-    ; MUBUF: $vgpr28 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 112, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 112, addrspace 5)
-    ; MUBUF: $vgpr29 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 116, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 116, addrspace 5)
-    ; MUBUF: $vgpr30 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 120, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 120, addrspace 5)
-    ; MUBUF: $vgpr31 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 124, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 124, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 4, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 8, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 12, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 16, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 20, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 24, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr7, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 28, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr8, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 32, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 32, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr9, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 36, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 36, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr10, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 40, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 40, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr11, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 44, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 44, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr12, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 48, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 48, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr13, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 52, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 52, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr14, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 56, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 56, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr15, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 60, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 60, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr16, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 64, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 64, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr17, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 68, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 68, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr18, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 72, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 72, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr19, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 76, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 76, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr20, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 80, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 80, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr21, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 84, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 84, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr22, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 88, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 88, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr23, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 92, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 92, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr24, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 96, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 96, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr25, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 100, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 100, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr26, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 104, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 104, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr27, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 108, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 108, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr28, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 112, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 112, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr29, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 116, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 116, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr30, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 120, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 120, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr31, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 124, 0, 0, 0, 0, 0, 0, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 124, addrspace 5)
+    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0, addrspace 5)
+    ; MUBUF: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 4, addrspace 5)
+    ; MUBUF: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 8, addrspace 5)
+    ; MUBUF: $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 12, addrspace 5)
+    ; MUBUF: $vgpr4 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 16, addrspace 5)
+    ; MUBUF: $vgpr5 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 20, addrspace 5)
+    ; MUBUF: $vgpr6 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 24, addrspace 5)
+    ; MUBUF: $vgpr7 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 28, addrspace 5)
+    ; MUBUF: $vgpr8 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 32, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 32, addrspace 5)
+    ; MUBUF: $vgpr9 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 36, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 36, addrspace 5)
+    ; MUBUF: $vgpr10 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 40, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 40, addrspace 5)
+    ; MUBUF: $vgpr11 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 44, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 44, addrspace 5)
+    ; MUBUF: $vgpr12 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 48, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 48, addrspace 5)
+    ; MUBUF: $vgpr13 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 52, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 52, addrspace 5)
+    ; MUBUF: $vgpr14 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 56, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 56, addrspace 5)
+    ; MUBUF: $vgpr15 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 60, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 60, addrspace 5)
+    ; MUBUF: $vgpr16 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 64, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 64, addrspace 5)
+    ; MUBUF: $vgpr17 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 68, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 68, addrspace 5)
+    ; MUBUF: $vgpr18 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 72, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 72, addrspace 5)
+    ; MUBUF: $vgpr19 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 76, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 76, addrspace 5)
+    ; MUBUF: $vgpr20 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 80, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 80, addrspace 5)
+    ; MUBUF: $vgpr21 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 84, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 84, addrspace 5)
+    ; MUBUF: $vgpr22 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 88, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 88, addrspace 5)
+    ; MUBUF: $vgpr23 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 92, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 92, addrspace 5)
+    ; MUBUF: $vgpr24 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 96, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 96, addrspace 5)
+    ; MUBUF: $vgpr25 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 100, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 100, addrspace 5)
+    ; MUBUF: $vgpr26 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 104, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 104, addrspace 5)
+    ; MUBUF: $vgpr27 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 108, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 108, addrspace 5)
+    ; MUBUF: $vgpr28 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 112, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 112, addrspace 5)
+    ; MUBUF: $vgpr29 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 116, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 116, addrspace 5)
+    ; MUBUF: $vgpr30 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 120, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 120, addrspace 5)
+    ; MUBUF: $vgpr31 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 124, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 124, addrspace 5)
     ; MUBUF: S_ENDPGM 0
     ; MUBUF-V2A-LABEL: name: test_spill_v32
     ; MUBUF-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3, $agpr4, $agpr5, $agpr6, $agpr7, $agpr8, $agpr9, $agpr10, $agpr11, $agpr12, $agpr13, $agpr14, $agpr15, $agpr16, $agpr17, $agpr18, $agpr19, $agpr20, $agpr21, $agpr22, $agpr23, $agpr24, $agpr25, $agpr26, $agpr27, $agpr28, $agpr29, $agpr30, $agpr31
@@ -706,22 +1120,22 @@ body:             |
     ; MUBUF-V2A: S_ENDPGM 0
     ; FLATSCR-LABEL: name: test_spill_v32
     ; FLATSCR: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = IMPLICIT_DEF
-    ; FLATSCR: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr32, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 16 into %stack.0, align 4, addrspace 5)
-    ; FLATSCR: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr4_vgpr5_vgpr6_vgpr7, $sgpr32, 16, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 16 into %stack.0 + 16, align 4, addrspace 5)
-    ; FLATSCR: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr8_vgpr9_vgpr10_vgpr11, $sgpr32, 32, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 16 into %stack.0 + 32, align 4, addrspace 5)
-    ; FLATSCR: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr12_vgpr13_vgpr14_vgpr15, $sgpr32, 48, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 16 into %stack.0 + 48, align 4, addrspace 5)
-    ; FLATSCR: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr16_vgpr17_vgpr18_vgpr19, $sgpr32, 64, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 16 into %stack.0 + 64, align 4, addrspace 5)
-    ; FLATSCR: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr20_vgpr21_vgpr22_vgpr23, $sgpr32, 80, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 16 into %stack.0 + 80, align 4, addrspace 5)
-    ; FLATSCR: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr24_vgpr25_vgpr26_vgpr27, $sgpr32, 96, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 16 into %stack.0 + 96, align 4, addrspace 5)
-    ; FLATSCR: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr28_vgpr29_vgpr30_vgpr31, $sgpr32, 112, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 16 into %stack.0 + 112, align 4, addrspace 5)
-    ; FLATSCR: $vgpr0_vgpr1_vgpr2_vgpr3 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 16 from %stack.0, align 4, addrspace 5)
-    ; FLATSCR: $vgpr4_vgpr5_vgpr6_vgpr7 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 16, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 16 from %stack.0 + 16, align 4, addrspace 5)
-    ; FLATSCR: $vgpr8_vgpr9_vgpr10_vgpr11 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 32, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 16 from %stack.0 + 32, align 4, addrspace 5)
-    ; FLATSCR: $vgpr12_vgpr13_vgpr14_vgpr15 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 48, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 16 from %stack.0 + 48, align 4, addrspace 5)
-    ; FLATSCR: $vgpr16_vgpr17_vgpr18_vgpr19 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 64, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 16 from %stack.0 + 64, align 4, addrspace 5)
-    ; FLATSCR: $vgpr20_vgpr21_vgpr22_vgpr23 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 80, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 16 from %stack.0 + 80, align 4, addrspace 5)
-    ; FLATSCR: $vgpr24_vgpr25_vgpr26_vgpr27 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 96, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 16 from %stack.0 + 96, align 4, addrspace 5)
-    ; FLATSCR: $vgpr28_vgpr29_vgpr30_vgpr31 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 112, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 16 from %stack.0 + 112, align 4, addrspace 5)
+    ; FLATSCR: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr32, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 16 into %stack.0, align 4, addrspace 5)
+    ; FLATSCR: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr4_vgpr5_vgpr6_vgpr7, $sgpr32, 16, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 16 into %stack.0 + 16, align 4, addrspace 5)
+    ; FLATSCR: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr8_vgpr9_vgpr10_vgpr11, $sgpr32, 32, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 16 into %stack.0 + 32, align 4, addrspace 5)
+    ; FLATSCR: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr12_vgpr13_vgpr14_vgpr15, $sgpr32, 48, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 16 into %stack.0 + 48, align 4, addrspace 5)
+    ; FLATSCR: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr16_vgpr17_vgpr18_vgpr19, $sgpr32, 64, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 16 into %stack.0 + 64, align 4, addrspace 5)
+    ; FLATSCR: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr20_vgpr21_vgpr22_vgpr23, $sgpr32, 80, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 16 into %stack.0 + 80, align 4, addrspace 5)
+    ; FLATSCR: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr24_vgpr25_vgpr26_vgpr27, $sgpr32, 96, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 16 into %stack.0 + 96, align 4, addrspace 5)
+    ; FLATSCR: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr28_vgpr29_vgpr30_vgpr31, $sgpr32, 112, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 16 into %stack.0 + 112, align 4, addrspace 5)
+    ; FLATSCR: $vgpr0_vgpr1_vgpr2_vgpr3 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 16 from %stack.0, align 4, addrspace 5)
+    ; FLATSCR: $vgpr4_vgpr5_vgpr6_vgpr7 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 16, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 16 from %stack.0 + 16, align 4, addrspace 5)
+    ; FLATSCR: $vgpr8_vgpr9_vgpr10_vgpr11 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 32, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 16 from %stack.0 + 32, align 4, addrspace 5)
+    ; FLATSCR: $vgpr12_vgpr13_vgpr14_vgpr15 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 48, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 16 from %stack.0 + 48, align 4, addrspace 5)
+    ; FLATSCR: $vgpr16_vgpr17_vgpr18_vgpr19 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 64, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 16 from %stack.0 + 64, align 4, addrspace 5)
+    ; FLATSCR: $vgpr20_vgpr21_vgpr22_vgpr23 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 80, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 16 from %stack.0 + 80, align 4, addrspace 5)
+    ; FLATSCR: $vgpr24_vgpr25_vgpr26_vgpr27 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 96, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 16 from %stack.0 + 96, align 4, addrspace 5)
+    ; FLATSCR: $vgpr28_vgpr29_vgpr30_vgpr31 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 112, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 16 from %stack.0 + 112, align 4, addrspace 5)
     ; FLATSCR: S_ENDPGM 0
     ; FLATSCR-V2A-LABEL: name: test_spill_v32
     ; FLATSCR-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3, $agpr4, $agpr5, $agpr6, $agpr7, $agpr8, $agpr9, $agpr10, $agpr11, $agpr12, $agpr13, $agpr14, $agpr15, $agpr16, $agpr17, $agpr18, $agpr19, $agpr20, $agpr21, $agpr22, $agpr23, $agpr24, $agpr25, $agpr26, $agpr27, $agpr28, $agpr29, $agpr30, $agpr31
@@ -791,6 +1205,228 @@ body:             |
     ; FLATSCR-V2A: $vgpr30 = V_ACCVGPR_READ_B32_e64 $agpr30, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
     ; FLATSCR-V2A: $vgpr31 = V_ACCVGPR_READ_B32_e64 $agpr31, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
     ; FLATSCR-V2A: S_ENDPGM 0
+    ; MUBUF-GFX90A-LABEL: name: test_spill_v32
+    ; MUBUF-GFX90A: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = IMPLICIT_DEF
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0, addrspace 5)
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 4, addrspace 5)
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 8, addrspace 5)
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 12, addrspace 5)
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 16, addrspace 5)
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 20, addrspace 5)
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 24, addrspace 5)
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr7, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 28, addrspace 5)
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr8, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 32, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 32, addrspace 5)
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr9, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 36, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 36, addrspace 5)
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr10, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 40, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 40, addrspace 5)
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr11, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 44, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 44, addrspace 5)
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr12, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 48, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 48, addrspace 5)
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr13, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 52, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 52, addrspace 5)
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr14, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 56, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 56, addrspace 5)
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr15, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 60, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 60, addrspace 5)
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr16, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 64, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 64, addrspace 5)
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr17, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 68, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 68, addrspace 5)
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr18, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 72, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 72, addrspace 5)
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr19, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 76, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 76, addrspace 5)
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr20, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 80, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 80, addrspace 5)
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr21, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 84, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 84, addrspace 5)
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr22, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 88, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 88, addrspace 5)
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr23, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 92, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 92, addrspace 5)
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr24, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 96, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 96, addrspace 5)
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr25, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 100, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 100, addrspace 5)
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr26, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 104, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 104, addrspace 5)
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr27, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 108, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 108, addrspace 5)
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr28, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 112, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 112, addrspace 5)
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr29, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 116, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 116, addrspace 5)
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr30, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 120, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 120, addrspace 5)
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr31, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 124, 0, 0, 0, 0, 0, 0, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 124, addrspace 5)
+    ; MUBUF-GFX90A: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0, addrspace 5)
+    ; MUBUF-GFX90A: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 4, addrspace 5)
+    ; MUBUF-GFX90A: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 8, addrspace 5)
+    ; MUBUF-GFX90A: $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 12, addrspace 5)
+    ; MUBUF-GFX90A: $vgpr4 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 16, addrspace 5)
+    ; MUBUF-GFX90A: $vgpr5 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 20, addrspace 5)
+    ; MUBUF-GFX90A: $vgpr6 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 24, addrspace 5)
+    ; MUBUF-GFX90A: $vgpr7 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 28, addrspace 5)
+    ; MUBUF-GFX90A: $vgpr8 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 32, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 32, addrspace 5)
+    ; MUBUF-GFX90A: $vgpr9 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 36, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 36, addrspace 5)
+    ; MUBUF-GFX90A: $vgpr10 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 40, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 40, addrspace 5)
+    ; MUBUF-GFX90A: $vgpr11 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 44, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 44, addrspace 5)
+    ; MUBUF-GFX90A: $vgpr12 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 48, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 48, addrspace 5)
+    ; MUBUF-GFX90A: $vgpr13 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 52, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 52, addrspace 5)
+    ; MUBUF-GFX90A: $vgpr14 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 56, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 56, addrspace 5)
+    ; MUBUF-GFX90A: $vgpr15 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 60, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 60, addrspace 5)
+    ; MUBUF-GFX90A: $vgpr16 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 64, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 64, addrspace 5)
+    ; MUBUF-GFX90A: $vgpr17 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 68, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 68, addrspace 5)
+    ; MUBUF-GFX90A: $vgpr18 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 72, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 72, addrspace 5)
+    ; MUBUF-GFX90A: $vgpr19 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 76, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 76, addrspace 5)
+    ; MUBUF-GFX90A: $vgpr20 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 80, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 80, addrspace 5)
+    ; MUBUF-GFX90A: $vgpr21 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 84, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 84, addrspace 5)
+    ; MUBUF-GFX90A: $vgpr22 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 88, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 88, addrspace 5)
+    ; MUBUF-GFX90A: $vgpr23 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 92, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 92, addrspace 5)
+    ; MUBUF-GFX90A: $vgpr24 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 96, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 96, addrspace 5)
+    ; MUBUF-GFX90A: $vgpr25 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 100, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 100, addrspace 5)
+    ; MUBUF-GFX90A: $vgpr26 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 104, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 104, addrspace 5)
+    ; MUBUF-GFX90A: $vgpr27 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 108, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 108, addrspace 5)
+    ; MUBUF-GFX90A: $vgpr28 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 112, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 112, addrspace 5)
+    ; MUBUF-GFX90A: $vgpr29 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 116, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 116, addrspace 5)
+    ; MUBUF-GFX90A: $vgpr30 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 120, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 120, addrspace 5)
+    ; MUBUF-GFX90A: $vgpr31 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 124, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 124, addrspace 5)
+    ; MUBUF-GFX90A: S_ENDPGM 0
+    ; MUBUF-GFX90A-V2A-LABEL: name: test_spill_v32
+    ; MUBUF-GFX90A-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3, $agpr4, $agpr5, $agpr6, $agpr7, $agpr8, $agpr9, $agpr10, $agpr11, $agpr12, $agpr13, $agpr14, $agpr15, $agpr16, $agpr17, $agpr18, $agpr19, $agpr20, $agpr21, $agpr22, $agpr23, $agpr24, $agpr25, $agpr26, $agpr27, $agpr28, $agpr29, $agpr30, $agpr31
+    ; MUBUF-GFX90A-V2A: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = IMPLICIT_DEF
+    ; MUBUF-GFX90A-V2A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; MUBUF-GFX90A-V2A: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; MUBUF-GFX90A-V2A: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; MUBUF-GFX90A-V2A: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr3, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; MUBUF-GFX90A-V2A: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr4, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; MUBUF-GFX90A-V2A: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr5, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; MUBUF-GFX90A-V2A: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr6, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; MUBUF-GFX90A-V2A: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr7, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; MUBUF-GFX90A-V2A: $agpr8 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr8, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; MUBUF-GFX90A-V2A: $agpr9 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr9, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; MUBUF-GFX90A-V2A: $agpr10 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr10, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; MUBUF-GFX90A-V2A: $agpr11 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr11, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; MUBUF-GFX90A-V2A: $agpr12 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr12, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; MUBUF-GFX90A-V2A: $agpr13 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr13, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; MUBUF-GFX90A-V2A: $agpr14 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr14, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; MUBUF-GFX90A-V2A: $agpr15 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr15, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; MUBUF-GFX90A-V2A: $agpr16 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr16, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; MUBUF-GFX90A-V2A: $agpr17 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr17, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; MUBUF-GFX90A-V2A: $agpr18 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr18, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; MUBUF-GFX90A-V2A: $agpr19 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr19, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; MUBUF-GFX90A-V2A: $agpr20 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr20, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; MUBUF-GFX90A-V2A: $agpr21 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr21, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; MUBUF-GFX90A-V2A: $agpr22 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr22, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; MUBUF-GFX90A-V2A: $agpr23 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr23, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; MUBUF-GFX90A-V2A: $agpr24 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr24, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; MUBUF-GFX90A-V2A: $agpr25 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr25, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; MUBUF-GFX90A-V2A: $agpr26 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr26, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; MUBUF-GFX90A-V2A: $agpr27 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr27, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; MUBUF-GFX90A-V2A: $agpr28 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr28, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; MUBUF-GFX90A-V2A: $agpr29 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr29, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; MUBUF-GFX90A-V2A: $agpr30 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr30, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; MUBUF-GFX90A-V2A: $agpr31 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr31, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; MUBUF-GFX90A-V2A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; MUBUF-GFX90A-V2A: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; MUBUF-GFX90A-V2A: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; MUBUF-GFX90A-V2A: $vgpr3 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; MUBUF-GFX90A-V2A: $vgpr4 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; MUBUF-GFX90A-V2A: $vgpr5 = V_ACCVGPR_READ_B32_e64 $agpr5, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; MUBUF-GFX90A-V2A: $vgpr6 = V_ACCVGPR_READ_B32_e64 $agpr6, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; MUBUF-GFX90A-V2A: $vgpr7 = V_ACCVGPR_READ_B32_e64 $agpr7, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; MUBUF-GFX90A-V2A: $vgpr8 = V_ACCVGPR_READ_B32_e64 $agpr8, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; MUBUF-GFX90A-V2A: $vgpr9 = V_ACCVGPR_READ_B32_e64 $agpr9, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; MUBUF-GFX90A-V2A: $vgpr10 = V_ACCVGPR_READ_B32_e64 $agpr10, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; MUBUF-GFX90A-V2A: $vgpr11 = V_ACCVGPR_READ_B32_e64 $agpr11, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; MUBUF-GFX90A-V2A: $vgpr12 = V_ACCVGPR_READ_B32_e64 $agpr12, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; MUBUF-GFX90A-V2A: $vgpr13 = V_ACCVGPR_READ_B32_e64 $agpr13, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; MUBUF-GFX90A-V2A: $vgpr14 = V_ACCVGPR_READ_B32_e64 $agpr14, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; MUBUF-GFX90A-V2A: $vgpr15 = V_ACCVGPR_READ_B32_e64 $agpr15, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; MUBUF-GFX90A-V2A: $vgpr16 = V_ACCVGPR_READ_B32_e64 $agpr16, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; MUBUF-GFX90A-V2A: $vgpr17 = V_ACCVGPR_READ_B32_e64 $agpr17, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; MUBUF-GFX90A-V2A: $vgpr18 = V_ACCVGPR_READ_B32_e64 $agpr18, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; MUBUF-GFX90A-V2A: $vgpr19 = V_ACCVGPR_READ_B32_e64 $agpr19, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; MUBUF-GFX90A-V2A: $vgpr20 = V_ACCVGPR_READ_B32_e64 $agpr20, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; MUBUF-GFX90A-V2A: $vgpr21 = V_ACCVGPR_READ_B32_e64 $agpr21, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; MUBUF-GFX90A-V2A: $vgpr22 = V_ACCVGPR_READ_B32_e64 $agpr22, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; MUBUF-GFX90A-V2A: $vgpr23 = V_ACCVGPR_READ_B32_e64 $agpr23, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; MUBUF-GFX90A-V2A: $vgpr24 = V_ACCVGPR_READ_B32_e64 $agpr24, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; MUBUF-GFX90A-V2A: $vgpr25 = V_ACCVGPR_READ_B32_e64 $agpr25, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; MUBUF-GFX90A-V2A: $vgpr26 = V_ACCVGPR_READ_B32_e64 $agpr26, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; MUBUF-GFX90A-V2A: $vgpr27 = V_ACCVGPR_READ_B32_e64 $agpr27, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; MUBUF-GFX90A-V2A: $vgpr28 = V_ACCVGPR_READ_B32_e64 $agpr28, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; MUBUF-GFX90A-V2A: $vgpr29 = V_ACCVGPR_READ_B32_e64 $agpr29, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; MUBUF-GFX90A-V2A: $vgpr30 = V_ACCVGPR_READ_B32_e64 $agpr30, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; MUBUF-GFX90A-V2A: $vgpr31 = V_ACCVGPR_READ_B32_e64 $agpr31, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; MUBUF-GFX90A-V2A: S_ENDPGM 0
+    ; FLATSCR-GFX90A-LABEL: name: test_spill_v32
+    ; FLATSCR-GFX90A: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = IMPLICIT_DEF
+    ; FLATSCR-GFX90A: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr32, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 16 into %stack.0, align 4, addrspace 5)
+    ; FLATSCR-GFX90A: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr4_vgpr5_vgpr6_vgpr7, $sgpr32, 16, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 16 into %stack.0 + 16, align 4, addrspace 5)
+    ; FLATSCR-GFX90A: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr8_vgpr9_vgpr10_vgpr11, $sgpr32, 32, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 16 into %stack.0 + 32, align 4, addrspace 5)
+    ; FLATSCR-GFX90A: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr12_vgpr13_vgpr14_vgpr15, $sgpr32, 48, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 16 into %stack.0 + 48, align 4, addrspace 5)
+    ; FLATSCR-GFX90A: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr16_vgpr17_vgpr18_vgpr19, $sgpr32, 64, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 16 into %stack.0 + 64, align 4, addrspace 5)
+    ; FLATSCR-GFX90A: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr20_vgpr21_vgpr22_vgpr23, $sgpr32, 80, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 16 into %stack.0 + 80, align 4, addrspace 5)
+    ; FLATSCR-GFX90A: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr24_vgpr25_vgpr26_vgpr27, $sgpr32, 96, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 16 into %stack.0 + 96, align 4, addrspace 5)
+    ; FLATSCR-GFX90A: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr28_vgpr29_vgpr30_vgpr31, $sgpr32, 112, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 16 into %stack.0 + 112, align 4, addrspace 5)
+    ; FLATSCR-GFX90A: $vgpr0_vgpr1_vgpr2_vgpr3 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 16 from %stack.0, align 4, addrspace 5)
+    ; FLATSCR-GFX90A: $vgpr4_vgpr5_vgpr6_vgpr7 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 16, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 16 from %stack.0 + 16, align 4, addrspace 5)
+    ; FLATSCR-GFX90A: $vgpr8_vgpr9_vgpr10_vgpr11 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 32, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 16 from %stack.0 + 32, align 4, addrspace 5)
+    ; FLATSCR-GFX90A: $vgpr12_vgpr13_vgpr14_vgpr15 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 48, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 16 from %stack.0 + 48, align 4, addrspace 5)
+    ; FLATSCR-GFX90A: $vgpr16_vgpr17_vgpr18_vgpr19 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 64, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 16 from %stack.0 + 64, align 4, addrspace 5)
+    ; FLATSCR-GFX90A: $vgpr20_vgpr21_vgpr22_vgpr23 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 80, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 16 from %stack.0 + 80, align 4, addrspace 5)
+    ; FLATSCR-GFX90A: $vgpr24_vgpr25_vgpr26_vgpr27 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 96, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 16 from %stack.0 + 96, align 4, addrspace 5)
+    ; FLATSCR-GFX90A: $vgpr28_vgpr29_vgpr30_vgpr31 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 112, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 16 from %stack.0 + 112, align 4, addrspace 5)
+    ; FLATSCR-GFX90A: S_ENDPGM 0
+    ; FLATSCR-GFX90A-V2A-LABEL: name: test_spill_v32
+    ; FLATSCR-GFX90A-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3, $agpr4, $agpr5, $agpr6, $agpr7, $agpr8, $agpr9, $agpr10, $agpr11, $agpr12, $agpr13, $agpr14, $agpr15, $agpr16, $agpr17, $agpr18, $agpr19, $agpr20, $agpr21, $agpr22, $agpr23, $agpr24, $agpr25, $agpr26, $agpr27, $agpr28, $agpr29, $agpr30, $agpr31
+    ; FLATSCR-GFX90A-V2A: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = IMPLICIT_DEF
+    ; FLATSCR-GFX90A-V2A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; FLATSCR-GFX90A-V2A: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; FLATSCR-GFX90A-V2A: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; FLATSCR-GFX90A-V2A: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr3, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; FLATSCR-GFX90A-V2A: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr4, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; FLATSCR-GFX90A-V2A: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr5, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; FLATSCR-GFX90A-V2A: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr6, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; FLATSCR-GFX90A-V2A: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr7, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; FLATSCR-GFX90A-V2A: $agpr8 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr8, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; FLATSCR-GFX90A-V2A: $agpr9 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr9, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; FLATSCR-GFX90A-V2A: $agpr10 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr10, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; FLATSCR-GFX90A-V2A: $agpr11 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr11, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; FLATSCR-GFX90A-V2A: $agpr12 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr12, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; FLATSCR-GFX90A-V2A: $agpr13 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr13, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; FLATSCR-GFX90A-V2A: $agpr14 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr14, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; FLATSCR-GFX90A-V2A: $agpr15 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr15, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; FLATSCR-GFX90A-V2A: $agpr16 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr16, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; FLATSCR-GFX90A-V2A: $agpr17 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr17, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; FLATSCR-GFX90A-V2A: $agpr18 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr18, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; FLATSCR-GFX90A-V2A: $agpr19 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr19, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; FLATSCR-GFX90A-V2A: $agpr20 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr20, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; FLATSCR-GFX90A-V2A: $agpr21 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr21, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; FLATSCR-GFX90A-V2A: $agpr22 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr22, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; FLATSCR-GFX90A-V2A: $agpr23 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr23, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; FLATSCR-GFX90A-V2A: $agpr24 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr24, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; FLATSCR-GFX90A-V2A: $agpr25 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr25, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; FLATSCR-GFX90A-V2A: $agpr26 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr26, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; FLATSCR-GFX90A-V2A: $agpr27 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr27, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; FLATSCR-GFX90A-V2A: $agpr28 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr28, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; FLATSCR-GFX90A-V2A: $agpr29 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr29, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; FLATSCR-GFX90A-V2A: $agpr30 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr30, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; FLATSCR-GFX90A-V2A: $agpr31 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr31, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; FLATSCR-GFX90A-V2A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; FLATSCR-GFX90A-V2A: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; FLATSCR-GFX90A-V2A: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; FLATSCR-GFX90A-V2A: $vgpr3 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; FLATSCR-GFX90A-V2A: $vgpr4 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; FLATSCR-GFX90A-V2A: $vgpr5 = V_ACCVGPR_READ_B32_e64 $agpr5, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; FLATSCR-GFX90A-V2A: $vgpr6 = V_ACCVGPR_READ_B32_e64 $agpr6, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; FLATSCR-GFX90A-V2A: $vgpr7 = V_ACCVGPR_READ_B32_e64 $agpr7, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; FLATSCR-GFX90A-V2A: $vgpr8 = V_ACCVGPR_READ_B32_e64 $agpr8, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; FLATSCR-GFX90A-V2A: $vgpr9 = V_ACCVGPR_READ_B32_e64 $agpr9, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; FLATSCR-GFX90A-V2A: $vgpr10 = V_ACCVGPR_READ_B32_e64 $agpr10, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; FLATSCR-GFX90A-V2A: $vgpr11 = V_ACCVGPR_READ_B32_e64 $agpr11, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; FLATSCR-GFX90A-V2A: $vgpr12 = V_ACCVGPR_READ_B32_e64 $agpr12, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; FLATSCR-GFX90A-V2A: $vgpr13 = V_ACCVGPR_READ_B32_e64 $agpr13, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; FLATSCR-GFX90A-V2A: $vgpr14 = V_ACCVGPR_READ_B32_e64 $agpr14, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; FLATSCR-GFX90A-V2A: $vgpr15 = V_ACCVGPR_READ_B32_e64 $agpr15, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; FLATSCR-GFX90A-V2A: $vgpr16 = V_ACCVGPR_READ_B32_e64 $agpr16, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; FLATSCR-GFX90A-V2A: $vgpr17 = V_ACCVGPR_READ_B32_e64 $agpr17, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; FLATSCR-GFX90A-V2A: $vgpr18 = V_ACCVGPR_READ_B32_e64 $agpr18, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; FLATSCR-GFX90A-V2A: $vgpr19 = V_ACCVGPR_READ_B32_e64 $agpr19, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; FLATSCR-GFX90A-V2A: $vgpr20 = V_ACCVGPR_READ_B32_e64 $agpr20, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; FLATSCR-GFX90A-V2A: $vgpr21 = V_ACCVGPR_READ_B32_e64 $agpr21, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; FLATSCR-GFX90A-V2A: $vgpr22 = V_ACCVGPR_READ_B32_e64 $agpr22, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; FLATSCR-GFX90A-V2A: $vgpr23 = V_ACCVGPR_READ_B32_e64 $agpr23, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; FLATSCR-GFX90A-V2A: $vgpr24 = V_ACCVGPR_READ_B32_e64 $agpr24, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; FLATSCR-GFX90A-V2A: $vgpr25 = V_ACCVGPR_READ_B32_e64 $agpr25, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; FLATSCR-GFX90A-V2A: $vgpr26 = V_ACCVGPR_READ_B32_e64 $agpr26, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; FLATSCR-GFX90A-V2A: $vgpr27 = V_ACCVGPR_READ_B32_e64 $agpr27, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; FLATSCR-GFX90A-V2A: $vgpr28 = V_ACCVGPR_READ_B32_e64 $agpr28, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; FLATSCR-GFX90A-V2A: $vgpr29 = V_ACCVGPR_READ_B32_e64 $agpr29, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; FLATSCR-GFX90A-V2A: $vgpr30 = V_ACCVGPR_READ_B32_e64 $agpr30, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; FLATSCR-GFX90A-V2A: $vgpr31 = V_ACCVGPR_READ_B32_e64 $agpr31, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+    ; FLATSCR-GFX90A-V2A: S_ENDPGM 0
     $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = IMPLICIT_DEF
     SI_SPILL_V1024_SAVE killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, %stack.0, $sgpr32, 0, implicit $exec :: (store 128 into %stack.0, align 4, addrspace 5)
     $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = SI_SPILL_V1024_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load 128 from %stack.0, align 4, addrspace 5)
@@ -812,8 +1448,8 @@ body:             |
     ; MUBUF-LABEL: name: test_spill_a1
     ; MUBUF: $agpr0 = IMPLICIT_DEF
     ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %stack.0, addrspace 5)
-    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %stack.0, addrspace 5)
+    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0, addrspace 5)
     ; MUBUF: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec
     ; MUBUF: S_ENDPGM 0
     ; MUBUF-V2A-LABEL: name: test_spill_a1
@@ -825,8 +1461,8 @@ body:             |
     ; FLATSCR-LABEL: name: test_spill_a1
     ; FLATSCR: $agpr0 = IMPLICIT_DEF
     ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec
-    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into %stack.0, addrspace 5)
-    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0, addrspace 5)
+    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into %stack.0, addrspace 5)
+    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0, addrspace 5)
     ; FLATSCR: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec
     ; FLATSCR: S_ENDPGM 0
     ; FLATSCR-V2A-LABEL: name: test_spill_a1
@@ -835,6 +1471,28 @@ body:             |
     ; FLATSCR-V2A: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec
     ; FLATSCR-V2A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec
     ; FLATSCR-V2A: S_ENDPGM 0
+    ; MUBUF-GFX90A-LABEL: name: test_spill_a1
+    ; MUBUF-GFX90A: $agpr0 = IMPLICIT_DEF
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %stack.0, addrspace 5)
+    ; MUBUF-GFX90A: $agpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0, addrspace 5)
+    ; MUBUF-GFX90A: S_ENDPGM 0
+    ; MUBUF-GFX90A-V2A-LABEL: name: test_spill_a1
+    ; MUBUF-GFX90A-V2A: liveins: $vgpr0
+    ; MUBUF-GFX90A-V2A: $agpr0 = IMPLICIT_DEF
+    ; MUBUF-GFX90A-V2A: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec
+    ; MUBUF-GFX90A-V2A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec
+    ; MUBUF-GFX90A-V2A: S_ENDPGM 0
+    ; FLATSCR-GFX90A-LABEL: name: test_spill_a1
+    ; FLATSCR-GFX90A: $agpr0 = IMPLICIT_DEF
+    ; FLATSCR-GFX90A: SCRATCH_STORE_DWORD_SADDR killed $agpr0, $sgpr32, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into %stack.0, addrspace 5)
+    ; FLATSCR-GFX90A: $agpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0, addrspace 5)
+    ; FLATSCR-GFX90A: S_ENDPGM 0
+    ; FLATSCR-GFX90A-V2A-LABEL: name: test_spill_a1
+    ; FLATSCR-GFX90A-V2A: liveins: $vgpr0
+    ; FLATSCR-GFX90A-V2A: $agpr0 = IMPLICIT_DEF
+    ; FLATSCR-GFX90A-V2A: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec
+    ; FLATSCR-GFX90A-V2A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec
+    ; FLATSCR-GFX90A-V2A: S_ENDPGM 0
     $agpr0 = IMPLICIT_DEF
     SI_SPILL_A32_SAVE killed $agpr0, %stack.0, $sgpr32, 0, implicit $exec :: (store 4 into %stack.0, align 4, addrspace 5)
     $agpr0 = SI_SPILL_A32_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load 4 from %stack.0, align 4, addrspace 5)
@@ -856,12 +1514,12 @@ body:             |
     ; MUBUF-LABEL: name: test_spill_a2
     ; MUBUF: $agpr0_agpr1 = IMPLICIT_DEF
     ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit-def $agpr0_agpr1
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1 :: (store 4 into %stack.0, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1 :: (store 4 into %stack.0, addrspace 5)
     ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, implicit $exec, implicit killed $agpr0_agpr1 :: (store 4 into %stack.0 + 4, addrspace 5)
-    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec, implicit killed $agpr0_agpr1 :: (store 4 into %stack.0 + 4, addrspace 5)
+    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0, addrspace 5)
     ; MUBUF: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1
-    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 4, addrspace 5)
+    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 4, addrspace 5)
     ; MUBUF: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1
     ; MUBUF: S_ENDPGM 0
     ; MUBUF-V2A-LABEL: name: test_spill_a2
@@ -875,12 +1533,12 @@ body:             |
     ; FLATSCR-LABEL: name: test_spill_a2
     ; FLATSCR: $agpr0_agpr1 = IMPLICIT_DEF
     ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit-def $agpr0_agpr1
-    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1 :: (store 4 into %stack.0, addrspace 5)
+    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1 :: (store 4 into %stack.0, addrspace 5)
     ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec
-    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 4, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit killed $agpr0_agpr1 :: (store 4 into %stack.0 + 4, addrspace 5)
-    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0, addrspace 5)
+    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 4, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit killed $agpr0_agpr1 :: (store 4 into %stack.0 + 4, addrspace 5)
+    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0, addrspace 5)
     ; FLATSCR: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1
-    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 4, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 4, addrspace 5)
+    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 4, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 4, addrspace 5)
     ; FLATSCR: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1
     ; FLATSCR: S_ENDPGM 0
     ; FLATSCR-V2A-LABEL: name: test_spill_a2
@@ -891,6 +1549,34 @@ body:             |
     ; FLATSCR-V2A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1
     ; FLATSCR-V2A: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit-def $agpr0_agpr1
     ; FLATSCR-V2A: S_ENDPGM 0
+    ; MUBUF-GFX90A-LABEL: name: test_spill_a2
+    ; MUBUF-GFX90A: $agpr0_agpr1 = IMPLICIT_DEF
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1, implicit $agpr0_agpr1 :: (store 4 into %stack.0, addrspace 5)
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec, implicit killed $agpr0_agpr1 :: (store 4 into %stack.0 + 4, addrspace 5)
+    ; MUBUF-GFX90A: $agpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1 :: (load 4 from %stack.0, addrspace 5)
+    ; MUBUF-GFX90A: $agpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1 :: (load 4 from %stack.0 + 4, addrspace 5)
+    ; MUBUF-GFX90A: S_ENDPGM 0
+    ; MUBUF-GFX90A-V2A-LABEL: name: test_spill_a2
+    ; MUBUF-GFX90A-V2A: liveins: $vgpr0, $vgpr1
+    ; MUBUF-GFX90A-V2A: $agpr0_agpr1 = IMPLICIT_DEF
+    ; MUBUF-GFX90A-V2A: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit-def $agpr0_agpr1, implicit $agpr0_agpr1
+    ; MUBUF-GFX90A-V2A: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec, implicit killed $agpr0_agpr1
+    ; MUBUF-GFX90A-V2A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1
+    ; MUBUF-GFX90A-V2A: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit-def $agpr0_agpr1
+    ; MUBUF-GFX90A-V2A: S_ENDPGM 0
+    ; FLATSCR-GFX90A-LABEL: name: test_spill_a2
+    ; FLATSCR-GFX90A: $agpr0_agpr1 = IMPLICIT_DEF
+    ; FLATSCR-GFX90A: SCRATCH_STORE_DWORDX2_SADDR killed $agpr0_agpr1, $sgpr32, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 8 into %stack.0, align 4, addrspace 5)
+    ; FLATSCR-GFX90A: $agpr0_agpr1 = SCRATCH_LOAD_DWORDX2_SADDR $sgpr32, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 8 from %stack.0, align 4, addrspace 5)
+    ; FLATSCR-GFX90A: S_ENDPGM 0
+    ; FLATSCR-GFX90A-V2A-LABEL: name: test_spill_a2
+    ; FLATSCR-GFX90A-V2A: liveins: $vgpr0, $vgpr1
+    ; FLATSCR-GFX90A-V2A: $agpr0_agpr1 = IMPLICIT_DEF
+    ; FLATSCR-GFX90A-V2A: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit-def $agpr0_agpr1, implicit $agpr0_agpr1
+    ; FLATSCR-GFX90A-V2A: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec, implicit killed $agpr0_agpr1
+    ; FLATSCR-GFX90A-V2A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1
+    ; FLATSCR-GFX90A-V2A: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit-def $agpr0_agpr1
+    ; FLATSCR-GFX90A-V2A: S_ENDPGM 0
     $agpr0_agpr1 = IMPLICIT_DEF
     SI_SPILL_A64_SAVE killed $agpr0_agpr1, %stack.0, $sgpr32, 0, implicit $exec :: (store 8 into %stack.0, align 4, addrspace 5)
     $agpr0_agpr1 = SI_SPILL_A64_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load 8 from %stack.0, align 4, addrspace 5)
@@ -912,16 +1598,16 @@ body:             |
     ; MUBUF-LABEL: name: test_spill_a3
     ; MUBUF: $agpr0_agpr1_agpr2 = IMPLICIT_DEF
     ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2 :: (store 4 into %stack.0, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2 :: (store 4 into %stack.0, addrspace 5)
     ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2 :: (store 4 into %stack.0 + 4, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2 :: (store 4 into %stack.0 + 4, addrspace 5)
     ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr2, implicit $exec
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, implicit $exec, implicit killed $agpr0_agpr1_agpr2 :: (store 4 into %stack.0 + 8, addrspace 5)
-    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, 0, implicit $exec, implicit killed $agpr0_agpr1_agpr2 :: (store 4 into %stack.0 + 8, addrspace 5)
+    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0, addrspace 5)
     ; MUBUF: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2
-    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 4, addrspace 5)
+    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 4, addrspace 5)
     ; MUBUF: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2
-    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 8, addrspace 5)
+    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 8, addrspace 5)
     ; MUBUF: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2
     ; MUBUF: S_ENDPGM 0
     ; MUBUF-V2A-LABEL: name: test_spill_a3
@@ -937,16 +1623,16 @@ body:             |
     ; FLATSCR-LABEL: name: test_spill_a3
     ; FLATSCR: $agpr0_agpr1_agpr2 = IMPLICIT_DEF
     ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2
-    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2 :: (store 4 into %stack.0, addrspace 5)
+    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2 :: (store 4 into %stack.0, addrspace 5)
     ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec
-    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 4, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2 :: (store 4 into %stack.0 + 4, addrspace 5)
+    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 4, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2 :: (store 4 into %stack.0 + 4, addrspace 5)
     ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr2, implicit $exec
-    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 8, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit killed $agpr0_agpr1_agpr2 :: (store 4 into %stack.0 + 8, addrspace 5)
-    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0, addrspace 5)
+    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 8, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit killed $agpr0_agpr1_agpr2 :: (store 4 into %stack.0 + 8, addrspace 5)
+    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0, addrspace 5)
     ; FLATSCR: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2
-    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 4, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 4, addrspace 5)
+    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 4, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 4, addrspace 5)
     ; FLATSCR: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2
-    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 8, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 8, addrspace 5)
+    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 8, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 8, addrspace 5)
     ; FLATSCR: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2
     ; FLATSCR: S_ENDPGM 0
     ; FLATSCR-V2A-LABEL: name: test_spill_a3
@@ -959,6 +1645,40 @@ body:             |
     ; FLATSCR-V2A: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit-def $agpr0_agpr1_agpr2
     ; FLATSCR-V2A: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit-def $agpr0_agpr1_agpr2
     ; FLATSCR-V2A: S_ENDPGM 0
+    ; MUBUF-GFX90A-LABEL: name: test_spill_a3
+    ; MUBUF-GFX90A: $agpr0_agpr1_agpr2 = IMPLICIT_DEF
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2, implicit $agpr0_agpr1_agpr2 :: (store 4 into %stack.0, addrspace 5)
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2 :: (store 4 into %stack.0 + 4, addrspace 5)
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, 0, implicit $exec, implicit killed $agpr0_agpr1_agpr2 :: (store 4 into %stack.0 + 8, addrspace 5)
+    ; MUBUF-GFX90A: $agpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2 :: (load 4 from %stack.0, addrspace 5)
+    ; MUBUF-GFX90A: $agpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2 :: (load 4 from %stack.0 + 4, addrspace 5)
+    ; MUBUF-GFX90A: $agpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2 :: (load 4 from %stack.0 + 8, addrspace 5)
+    ; MUBUF-GFX90A: S_ENDPGM 0
+    ; MUBUF-GFX90A-V2A-LABEL: name: test_spill_a3
+    ; MUBUF-GFX90A-V2A: liveins: $vgpr0, $vgpr1, $vgpr2
+    ; MUBUF-GFX90A-V2A: $agpr0_agpr1_agpr2 = IMPLICIT_DEF
+    ; MUBUF-GFX90A-V2A: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2, implicit $agpr0_agpr1_agpr2
+    ; MUBUF-GFX90A-V2A: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2
+    ; MUBUF-GFX90A-V2A: $vgpr2 = V_ACCVGPR_READ_B32_e64 killed $agpr2, implicit $exec, implicit killed $agpr0_agpr1_agpr2
+    ; MUBUF-GFX90A-V2A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2
+    ; MUBUF-GFX90A-V2A: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit-def $agpr0_agpr1_agpr2
+    ; MUBUF-GFX90A-V2A: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit-def $agpr0_agpr1_agpr2
+    ; MUBUF-GFX90A-V2A: S_ENDPGM 0
+    ; FLATSCR-GFX90A-LABEL: name: test_spill_a3
+    ; FLATSCR-GFX90A: $agpr0_agpr1_agpr2 = IMPLICIT_DEF
+    ; FLATSCR-GFX90A: SCRATCH_STORE_DWORDX3_SADDR killed $agpr0_agpr1_agpr2, $sgpr32, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 12 into %stack.0, align 4, addrspace 5)
+    ; FLATSCR-GFX90A: $agpr0_agpr1_agpr2 = SCRATCH_LOAD_DWORDX3_SADDR $sgpr32, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 12 from %stack.0, align 4, addrspace 5)
+    ; FLATSCR-GFX90A: S_ENDPGM 0
+    ; FLATSCR-GFX90A-V2A-LABEL: name: test_spill_a3
+    ; FLATSCR-GFX90A-V2A: liveins: $vgpr0, $vgpr1, $vgpr2
+    ; FLATSCR-GFX90A-V2A: $agpr0_agpr1_agpr2 = IMPLICIT_DEF
+    ; FLATSCR-GFX90A-V2A: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2, implicit $agpr0_agpr1_agpr2
+    ; FLATSCR-GFX90A-V2A: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2
+    ; FLATSCR-GFX90A-V2A: $vgpr2 = V_ACCVGPR_READ_B32_e64 killed $agpr2, implicit $exec, implicit killed $agpr0_agpr1_agpr2
+    ; FLATSCR-GFX90A-V2A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2
+    ; FLATSCR-GFX90A-V2A: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit-def $agpr0_agpr1_agpr2
+    ; FLATSCR-GFX90A-V2A: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit-def $agpr0_agpr1_agpr2
+    ; FLATSCR-GFX90A-V2A: S_ENDPGM 0
     $agpr0_agpr1_agpr2 = IMPLICIT_DEF
     SI_SPILL_A96_SAVE killed $agpr0_agpr1_agpr2, %stack.0, $sgpr32, 0, implicit $exec :: (store 12 into %stack.0, align 4, addrspace 5)
     $agpr0_agpr1_agpr2 = SI_SPILL_A96_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load 12 from %stack.0, align 4, addrspace 5)
@@ -980,20 +1700,20 @@ body:             |
     ; MUBUF-LABEL: name: test_spill_a4
     ; MUBUF: $agpr0_agpr1_agpr2_agpr3 = IMPLICIT_DEF
     ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 :: (store 4 into %stack.0, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 :: (store 4 into %stack.0, addrspace 5)
     ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 :: (store 4 into %stack.0 + 4, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 :: (store 4 into %stack.0 + 4, addrspace 5)
     ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr2, implicit $exec
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 :: (store 4 into %stack.0 + 8, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 :: (store 4 into %stack.0 + 8, addrspace 5)
     ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr3, implicit $exec
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3 :: (store 4 into %stack.0 + 12, addrspace 5)
-    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, 0, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3 :: (store 4 into %stack.0 + 12, addrspace 5)
+    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0, addrspace 5)
     ; MUBUF: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3
-    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 4, addrspace 5)
+    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 4, addrspace 5)
     ; MUBUF: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3
-    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 8, addrspace 5)
+    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 8, addrspace 5)
     ; MUBUF: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3
-    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 12, addrspace 5)
+    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 12, addrspace 5)
     ; MUBUF: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3
     ; MUBUF: S_ENDPGM 0
     ; MUBUF-V2A-LABEL: name: test_spill_a4
@@ -1011,20 +1731,20 @@ body:             |
     ; FLATSCR-LABEL: name: test_spill_a4
     ; FLATSCR: $agpr0_agpr1_agpr2_agpr3 = IMPLICIT_DEF
     ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3
-    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3 :: (store 4 into %stack.0, addrspace 5)
+    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3 :: (store 4 into %stack.0, addrspace 5)
     ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec
-    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 4, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3 :: (store 4 into %stack.0 + 4, addrspace 5)
+    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 4, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3 :: (store 4 into %stack.0 + 4, addrspace 5)
     ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr2, implicit $exec
-    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 8, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3 :: (store 4 into %stack.0 + 8, addrspace 5)
+    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 8, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3 :: (store 4 into %stack.0 + 8, addrspace 5)
     ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr3, implicit $exec
-    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 12, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit killed $agpr0_agpr1_agpr2_agpr3 :: (store 4 into %stack.0 + 12, addrspace 5)
-    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0, addrspace 5)
+    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 12, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit killed $agpr0_agpr1_agpr2_agpr3 :: (store 4 into %stack.0 + 12, addrspace 5)
+    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0, addrspace 5)
     ; FLATSCR: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3
-    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 4, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 4, addrspace 5)
+    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 4, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 4, addrspace 5)
     ; FLATSCR: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3
-    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 8, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 8, addrspace 5)
+    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 8, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 8, addrspace 5)
     ; FLATSCR: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3
-    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 12, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 12, addrspace 5)
+    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 12, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 12, addrspace 5)
     ; FLATSCR: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3
     ; FLATSCR: S_ENDPGM 0
     ; FLATSCR-V2A-LABEL: name: test_spill_a4
@@ -1039,6 +1759,46 @@ body:             |
     ; FLATSCR-V2A: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3
     ; FLATSCR-V2A: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr3, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3
     ; FLATSCR-V2A: S_ENDPGM 0
+    ; MUBUF-GFX90A-LABEL: name: test_spill_a4
+    ; MUBUF-GFX90A: $agpr0_agpr1_agpr2_agpr3 = IMPLICIT_DEF
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3, implicit $agpr0_agpr1_agpr2_agpr3 :: (store 4 into %stack.0, addrspace 5)
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 :: (store 4 into %stack.0 + 4, addrspace 5)
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 :: (store 4 into %stack.0 + 8, addrspace 5)
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr3, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, 0, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3 :: (store 4 into %stack.0 + 12, addrspace 5)
+    ; MUBUF-GFX90A: $agpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3 :: (load 4 from %stack.0, addrspace 5)
+    ; MUBUF-GFX90A: $agpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3 :: (load 4 from %stack.0 + 4, addrspace 5)
+    ; MUBUF-GFX90A: $agpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3 :: (load 4 from %stack.0 + 8, addrspace 5)
+    ; MUBUF-GFX90A: $agpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3 :: (load 4 from %stack.0 + 12, addrspace 5)
+    ; MUBUF-GFX90A: S_ENDPGM 0
+    ; MUBUF-GFX90A-V2A-LABEL: name: test_spill_a4
+    ; MUBUF-GFX90A-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+    ; MUBUF-GFX90A-V2A: $agpr0_agpr1_agpr2_agpr3 = IMPLICIT_DEF
+    ; MUBUF-GFX90A-V2A: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3, implicit $agpr0_agpr1_agpr2_agpr3
+    ; MUBUF-GFX90A-V2A: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3
+    ; MUBUF-GFX90A-V2A: $vgpr2 = V_ACCVGPR_READ_B32_e64 killed $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3
+    ; MUBUF-GFX90A-V2A: $vgpr3 = V_ACCVGPR_READ_B32_e64 killed $agpr3, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3
+    ; MUBUF-GFX90A-V2A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3
+    ; MUBUF-GFX90A-V2A: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3
+    ; MUBUF-GFX90A-V2A: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3
+    ; MUBUF-GFX90A-V2A: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr3, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3
+    ; MUBUF-GFX90A-V2A: S_ENDPGM 0
+    ; FLATSCR-GFX90A-LABEL: name: test_spill_a4
+    ; FLATSCR-GFX90A: $agpr0_agpr1_agpr2_agpr3 = IMPLICIT_DEF
+    ; FLATSCR-GFX90A: SCRATCH_STORE_DWORDX4_SADDR killed $agpr0_agpr1_agpr2_agpr3, $sgpr32, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 16 into %stack.0, align 4, addrspace 5)
+    ; FLATSCR-GFX90A: $agpr0_agpr1_agpr2_agpr3 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 16 from %stack.0, align 4, addrspace 5)
+    ; FLATSCR-GFX90A: S_ENDPGM 0
+    ; FLATSCR-GFX90A-V2A-LABEL: name: test_spill_a4
+    ; FLATSCR-GFX90A-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+    ; FLATSCR-GFX90A-V2A: $agpr0_agpr1_agpr2_agpr3 = IMPLICIT_DEF
+    ; FLATSCR-GFX90A-V2A: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3, implicit $agpr0_agpr1_agpr2_agpr3
+    ; FLATSCR-GFX90A-V2A: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3
+    ; FLATSCR-GFX90A-V2A: $vgpr2 = V_ACCVGPR_READ_B32_e64 killed $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3
+    ; FLATSCR-GFX90A-V2A: $vgpr3 = V_ACCVGPR_READ_B32_e64 killed $agpr3, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3
+    ; FLATSCR-GFX90A-V2A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3
+    ; FLATSCR-GFX90A-V2A: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3
+    ; FLATSCR-GFX90A-V2A: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3
+    ; FLATSCR-GFX90A-V2A: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr3, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3
+    ; FLATSCR-GFX90A-V2A: S_ENDPGM 0
     $agpr0_agpr1_agpr2_agpr3 = IMPLICIT_DEF
     SI_SPILL_A128_SAVE killed $agpr0_agpr1_agpr2_agpr3, %stack.0, $sgpr32, 0, implicit $exec :: (store 16 into %stack.0, align 4, addrspace 5)
     $agpr0_agpr1_agpr2_agpr3 = SI_SPILL_A128_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load 16 from %stack.0, align 4, addrspace 5)
@@ -1060,24 +1820,24 @@ body:             |
     ; MUBUF-LABEL: name: test_spill_a5
     ; MUBUF: $agpr0_agpr1_agpr2_agpr3_agpr4 = IMPLICIT_DEF
     ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4 :: (store 4 into %stack.0, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4 :: (store 4 into %stack.0, addrspace 5)
     ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4 :: (store 4 into %stack.0 + 4, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4 :: (store 4 into %stack.0 + 4, addrspace 5)
     ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr2, implicit $exec
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4 :: (store 4 into %stack.0 + 8, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4 :: (store 4 into %stack.0 + 8, addrspace 5)
     ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr3, implicit $exec
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4 :: (store 4 into %stack.0 + 12, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4 :: (store 4 into %stack.0 + 12, addrspace 5)
     ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr4, implicit $exec
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, 0, 0, 0, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4 :: (store 4 into %stack.0 + 16, addrspace 5)
-    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, 0, 0, 0, 0, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4 :: (store 4 into %stack.0 + 16, addrspace 5)
+    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0, addrspace 5)
     ; MUBUF: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4
-    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 4, addrspace 5)
+    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 4, addrspace 5)
     ; MUBUF: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4
-    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 8, addrspace 5)
+    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 8, addrspace 5)
     ; MUBUF: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4
-    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 12, addrspace 5)
+    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 12, addrspace 5)
     ; MUBUF: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4
-    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 16, addrspace 5)
+    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 16, addrspace 5)
     ; MUBUF: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4
     ; MUBUF: S_ENDPGM 0
     ; MUBUF-V2A-LABEL: name: test_spill_a5
@@ -1097,24 +1857,24 @@ body:             |
     ; FLATSCR-LABEL: name: test_spill_a5
     ; FLATSCR: $agpr0_agpr1_agpr2_agpr3_agpr4 = IMPLICIT_DEF
     ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4
-    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4 :: (store 4 into %stack.0, addrspace 5)
+    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4 :: (store 4 into %stack.0, addrspace 5)
     ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec
-    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 4, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4 :: (store 4 into %stack.0 + 4, addrspace 5)
+    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 4, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4 :: (store 4 into %stack.0 + 4, addrspace 5)
     ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr2, implicit $exec
-    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 8, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4 :: (store 4 into %stack.0 + 8, addrspace 5)
+    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 8, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4 :: (store 4 into %stack.0 + 8, addrspace 5)
     ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr3, implicit $exec
-    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 12, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4 :: (store 4 into %stack.0 + 12, addrspace 5)
+    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 12, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4 :: (store 4 into %stack.0 + 12, addrspace 5)
     ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr4, implicit $exec
-    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 16, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4 :: (store 4 into %stack.0 + 16, addrspace 5)
-    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0, addrspace 5)
+    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 16, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4 :: (store 4 into %stack.0 + 16, addrspace 5)
+    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0, addrspace 5)
     ; FLATSCR: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4
-    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 4, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 4, addrspace 5)
+    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 4, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 4, addrspace 5)
     ; FLATSCR: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4
-    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 8, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 8, addrspace 5)
+    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 8, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 8, addrspace 5)
     ; FLATSCR: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4
-    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 12, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 12, addrspace 5)
+    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 12, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 12, addrspace 5)
     ; FLATSCR: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4
-    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 16, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 16, addrspace 5)
+    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 16, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 16, addrspace 5)
     ; FLATSCR: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4
     ; FLATSCR: S_ENDPGM 0
     ; FLATSCR-V2A-LABEL: name: test_spill_a5
@@ -1131,6 +1891,54 @@ body:             |
     ; FLATSCR-V2A: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr3, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4
     ; FLATSCR-V2A: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $vgpr4, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4
     ; FLATSCR-V2A: S_ENDPGM 0
+    ; MUBUF-GFX90A-LABEL: name: test_spill_a5
+    ; MUBUF-GFX90A: $agpr0_agpr1_agpr2_agpr3_agpr4 = IMPLICIT_DEF
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4, implicit $agpr0_agpr1_agpr2_agpr3_agpr4 :: (store 4 into %stack.0, addrspace 5)
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4 :: (store 4 into %stack.0 + 4, addrspace 5)
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4 :: (store 4 into %stack.0 + 8, addrspace 5)
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr3, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4 :: (store 4 into %stack.0 + 12, addrspace 5)
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr4, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, 0, 0, 0, 0, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4 :: (store 4 into %stack.0 + 16, addrspace 5)
+    ; MUBUF-GFX90A: $agpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4 :: (load 4 from %stack.0, addrspace 5)
+    ; MUBUF-GFX90A: $agpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4 :: (load 4 from %stack.0 + 4, addrspace 5)
+    ; MUBUF-GFX90A: $agpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4 :: (load 4 from %stack.0 + 8, addrspace 5)
+    ; MUBUF-GFX90A: $agpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4 :: (load 4 from %stack.0 + 12, addrspace 5)
+    ; MUBUF-GFX90A: $agpr4 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4 :: (load 4 from %stack.0 + 16, addrspace 5)
+    ; MUBUF-GFX90A: S_ENDPGM 0
+    ; MUBUF-GFX90A-V2A-LABEL: name: test_spill_a5
+    ; MUBUF-GFX90A-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+    ; MUBUF-GFX90A-V2A: $agpr0_agpr1_agpr2_agpr3_agpr4 = IMPLICIT_DEF
+    ; MUBUF-GFX90A-V2A: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4, implicit $agpr0_agpr1_agpr2_agpr3_agpr4
+    ; MUBUF-GFX90A-V2A: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4
+    ; MUBUF-GFX90A-V2A: $vgpr2 = V_ACCVGPR_READ_B32_e64 killed $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4
+    ; MUBUF-GFX90A-V2A: $vgpr3 = V_ACCVGPR_READ_B32_e64 killed $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4
+    ; MUBUF-GFX90A-V2A: $vgpr4 = V_ACCVGPR_READ_B32_e64 killed $agpr4, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4
+    ; MUBUF-GFX90A-V2A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4
+    ; MUBUF-GFX90A-V2A: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4
+    ; MUBUF-GFX90A-V2A: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4
+    ; MUBUF-GFX90A-V2A: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr3, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4
+    ; MUBUF-GFX90A-V2A: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $vgpr4, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4
+    ; MUBUF-GFX90A-V2A: S_ENDPGM 0
+    ; FLATSCR-GFX90A-LABEL: name: test_spill_a5
+    ; FLATSCR-GFX90A: $agpr0_agpr1_agpr2_agpr3_agpr4 = IMPLICIT_DEF
+    ; FLATSCR-GFX90A: SCRATCH_STORE_DWORDX4_SADDR killed $agpr0_agpr1_agpr2_agpr3, $sgpr32, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4, implicit $agpr0_agpr1_agpr2_agpr3_agpr4 :: (store 16 into %stack.0, align 4, addrspace 5)
+    ; FLATSCR-GFX90A: SCRATCH_STORE_DWORD_SADDR killed $agpr4, $sgpr32, 16, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4 :: (store 4 into %stack.0 + 16, addrspace 5)
+    ; FLATSCR-GFX90A: $agpr0_agpr1_agpr2_agpr3 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4 :: (load 16 from %stack.0, align 4, addrspace 5)
+    ; FLATSCR-GFX90A: $agpr4 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 16, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4 :: (load 4 from %stack.0 + 16, addrspace 5)
+    ; FLATSCR-GFX90A: S_ENDPGM 0
+    ; FLATSCR-GFX90A-V2A-LABEL: name: test_spill_a5
+    ; FLATSCR-GFX90A-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+    ; FLATSCR-GFX90A-V2A: $agpr0_agpr1_agpr2_agpr3_agpr4 = IMPLICIT_DEF
+    ; FLATSCR-GFX90A-V2A: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4, implicit $agpr0_agpr1_agpr2_agpr3_agpr4
+    ; FLATSCR-GFX90A-V2A: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4
+    ; FLATSCR-GFX90A-V2A: $vgpr2 = V_ACCVGPR_READ_B32_e64 killed $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4
+    ; FLATSCR-GFX90A-V2A: $vgpr3 = V_ACCVGPR_READ_B32_e64 killed $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4
+    ; FLATSCR-GFX90A-V2A: $vgpr4 = V_ACCVGPR_READ_B32_e64 killed $agpr4, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4
+    ; FLATSCR-GFX90A-V2A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4
+    ; FLATSCR-GFX90A-V2A: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4
+    ; FLATSCR-GFX90A-V2A: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4
+    ; FLATSCR-GFX90A-V2A: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr3, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4
+    ; FLATSCR-GFX90A-V2A: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $vgpr4, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4
+    ; FLATSCR-GFX90A-V2A: S_ENDPGM 0
     $agpr0_agpr1_agpr2_agpr3_agpr4 = IMPLICIT_DEF
     SI_SPILL_A160_SAVE killed $agpr0_agpr1_agpr2_agpr3_agpr4, %stack.0, $sgpr32, 0, implicit $exec :: (store 20 into %stack.0, align 4, addrspace 5)
     $agpr0_agpr1_agpr2_agpr3_agpr4 = SI_SPILL_A160_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load 20 from %stack.0, align 4, addrspace 5)
@@ -1152,28 +1960,28 @@ body:             |
     ; MUBUF-LABEL: name: test_spill_a6
     ; MUBUF: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 = IMPLICIT_DEF
     ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 :: (store 4 into %stack.0, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 :: (store 4 into %stack.0, addrspace 5)
     ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 :: (store 4 into %stack.0 + 4, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 :: (store 4 into %stack.0 + 4, addrspace 5)
     ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr2, implicit $exec
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 :: (store 4 into %stack.0 + 8, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 :: (store 4 into %stack.0 + 8, addrspace 5)
     ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr3, implicit $exec
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 :: (store 4 into %stack.0 + 12, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 :: (store 4 into %stack.0 + 12, addrspace 5)
     ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr4, implicit $exec
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 :: (store 4 into %stack.0 + 16, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 :: (store 4 into %stack.0 + 16, addrspace 5)
     ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr5, implicit $exec
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, 0, 0, 0, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 :: (store 4 into %stack.0 + 20, addrspace 5)
-    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, 0, 0, 0, 0, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 :: (store 4 into %stack.0 + 20, addrspace 5)
+    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0, addrspace 5)
     ; MUBUF: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5
-    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 4, addrspace 5)
+    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 4, addrspace 5)
     ; MUBUF: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5
-    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 8, addrspace 5)
+    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 8, addrspace 5)
     ; MUBUF: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5
-    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 12, addrspace 5)
+    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 12, addrspace 5)
     ; MUBUF: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5
-    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 16, addrspace 5)
+    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 16, addrspace 5)
     ; MUBUF: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5
-    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 20, addrspace 5)
+    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 20, addrspace 5)
     ; MUBUF: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5
     ; MUBUF: S_ENDPGM 0
     ; MUBUF-V2A-LABEL: name: test_spill_a6
@@ -1195,28 +2003,28 @@ body:             |
     ; FLATSCR-LABEL: name: test_spill_a6
     ; FLATSCR: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 = IMPLICIT_DEF
     ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5
-    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 :: (store 4 into %stack.0, addrspace 5)
+    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 :: (store 4 into %stack.0, addrspace 5)
     ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec
-    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 4, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 :: (store 4 into %stack.0 + 4, addrspace 5)
+    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 4, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 :: (store 4 into %stack.0 + 4, addrspace 5)
     ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr2, implicit $exec
-    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 8, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 :: (store 4 into %stack.0 + 8, addrspace 5)
+    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 8, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 :: (store 4 into %stack.0 + 8, addrspace 5)
     ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr3, implicit $exec
-    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 12, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 :: (store 4 into %stack.0 + 12, addrspace 5)
+    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 12, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 :: (store 4 into %stack.0 + 12, addrspace 5)
     ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr4, implicit $exec
-    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 16, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 :: (store 4 into %stack.0 + 16, addrspace 5)
+    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 16, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 :: (store 4 into %stack.0 + 16, addrspace 5)
     ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr5, implicit $exec
-    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 20, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 :: (store 4 into %stack.0 + 20, addrspace 5)
-    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0, addrspace 5)
+    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 20, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 :: (store 4 into %stack.0 + 20, addrspace 5)
+    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0, addrspace 5)
     ; FLATSCR: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5
-    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 4, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 4, addrspace 5)
+    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 4, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 4, addrspace 5)
     ; FLATSCR: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5
-    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 8, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 8, addrspace 5)
+    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 8, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 8, addrspace 5)
     ; FLATSCR: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5
-    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 12, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 12, addrspace 5)
+    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 12, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 12, addrspace 5)
     ; FLATSCR: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5
-    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 16, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 16, addrspace 5)
+    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 16, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 16, addrspace 5)
     ; FLATSCR: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5
-    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 20, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 20, addrspace 5)
+    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 20, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 20, addrspace 5)
     ; FLATSCR: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5
     ; FLATSCR: S_ENDPGM 0
     ; FLATSCR-V2A-LABEL: name: test_spill_a6
@@ -1235,6 +2043,60 @@ body:             |
     ; FLATSCR-V2A: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $vgpr4, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5
     ; FLATSCR-V2A: $agpr5 = V_ACCVGPR_WRITE_B32_e64 $vgpr5, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5
     ; FLATSCR-V2A: S_ENDPGM 0
+    ; MUBUF-GFX90A-LABEL: name: test_spill_a6
+    ; MUBUF-GFX90A: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 = IMPLICIT_DEF
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 :: (store 4 into %stack.0, addrspace 5)
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 :: (store 4 into %stack.0 + 4, addrspace 5)
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 :: (store 4 into %stack.0 + 8, addrspace 5)
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr3, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 :: (store 4 into %stack.0 + 12, addrspace 5)
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr4, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 :: (store 4 into %stack.0 + 16, addrspace 5)
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, 0, 0, 0, 0, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 :: (store 4 into %stack.0 + 20, addrspace 5)
+    ; MUBUF-GFX90A: $agpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 :: (load 4 from %stack.0, addrspace 5)
+    ; MUBUF-GFX90A: $agpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 :: (load 4 from %stack.0 + 4, addrspace 5)
+    ; MUBUF-GFX90A: $agpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 :: (load 4 from %stack.0 + 8, addrspace 5)
+    ; MUBUF-GFX90A: $agpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 :: (load 4 from %stack.0 + 12, addrspace 5)
+    ; MUBUF-GFX90A: $agpr4 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 :: (load 4 from %stack.0 + 16, addrspace 5)
+    ; MUBUF-GFX90A: $agpr5 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 :: (load 4 from %stack.0 + 20, addrspace 5)
+    ; MUBUF-GFX90A: S_ENDPGM 0
+    ; MUBUF-GFX90A-V2A-LABEL: name: test_spill_a6
+    ; MUBUF-GFX90A-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+    ; MUBUF-GFX90A-V2A: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 = IMPLICIT_DEF
+    ; MUBUF-GFX90A-V2A: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5
+    ; MUBUF-GFX90A-V2A: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5
+    ; MUBUF-GFX90A-V2A: $vgpr2 = V_ACCVGPR_READ_B32_e64 killed $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5
+    ; MUBUF-GFX90A-V2A: $vgpr3 = V_ACCVGPR_READ_B32_e64 killed $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5
+    ; MUBUF-GFX90A-V2A: $vgpr4 = V_ACCVGPR_READ_B32_e64 killed $agpr4, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5
+    ; MUBUF-GFX90A-V2A: $vgpr5 = V_ACCVGPR_READ_B32_e64 killed $agpr5, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5
+    ; MUBUF-GFX90A-V2A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5
+    ; MUBUF-GFX90A-V2A: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5
+    ; MUBUF-GFX90A-V2A: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5
+    ; MUBUF-GFX90A-V2A: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr3, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5
+    ; MUBUF-GFX90A-V2A: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $vgpr4, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5
+    ; MUBUF-GFX90A-V2A: $agpr5 = V_ACCVGPR_WRITE_B32_e64 $vgpr5, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5
+    ; MUBUF-GFX90A-V2A: S_ENDPGM 0
+    ; FLATSCR-GFX90A-LABEL: name: test_spill_a6
+    ; FLATSCR-GFX90A: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 = IMPLICIT_DEF
+    ; FLATSCR-GFX90A: SCRATCH_STORE_DWORDX4_SADDR killed $agpr0_agpr1_agpr2_agpr3, $sgpr32, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 :: (store 16 into %stack.0, align 4, addrspace 5)
+    ; FLATSCR-GFX90A: SCRATCH_STORE_DWORDX2_SADDR killed $agpr4_agpr5, $sgpr32, 16, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 :: (store 8 into %stack.0 + 16, align 4, addrspace 5)
+    ; FLATSCR-GFX90A: $agpr0_agpr1_agpr2_agpr3 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 :: (load 16 from %stack.0, align 4, addrspace 5)
+    ; FLATSCR-GFX90A: $agpr4_agpr5 = SCRATCH_LOAD_DWORDX2_SADDR $sgpr32, 16, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 :: (load 8 from %stack.0 + 16, align 4, addrspace 5)
+    ; FLATSCR-GFX90A: S_ENDPGM 0
+    ; FLATSCR-GFX90A-V2A-LABEL: name: test_spill_a6
+    ; FLATSCR-GFX90A-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+    ; FLATSCR-GFX90A-V2A: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 = IMPLICIT_DEF
+    ; FLATSCR-GFX90A-V2A: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5
+    ; FLATSCR-GFX90A-V2A: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5
+    ; FLATSCR-GFX90A-V2A: $vgpr2 = V_ACCVGPR_READ_B32_e64 killed $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5
+    ; FLATSCR-GFX90A-V2A: $vgpr3 = V_ACCVGPR_READ_B32_e64 killed $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5
+    ; FLATSCR-GFX90A-V2A: $vgpr4 = V_ACCVGPR_READ_B32_e64 killed $agpr4, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5
+    ; FLATSCR-GFX90A-V2A: $vgpr5 = V_ACCVGPR_READ_B32_e64 killed $agpr5, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5
+    ; FLATSCR-GFX90A-V2A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5
+    ; FLATSCR-GFX90A-V2A: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5
+    ; FLATSCR-GFX90A-V2A: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5
+    ; FLATSCR-GFX90A-V2A: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr3, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5
+    ; FLATSCR-GFX90A-V2A: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $vgpr4, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5
+    ; FLATSCR-GFX90A-V2A: $agpr5 = V_ACCVGPR_WRITE_B32_e64 $vgpr5, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5
+    ; FLATSCR-GFX90A-V2A: S_ENDPGM 0
     $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 = IMPLICIT_DEF
     SI_SPILL_A192_SAVE killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5, %stack.0, $sgpr32, 0, implicit $exec :: (store 24 into %stack.0, align 4, addrspace 5)
     $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 = SI_SPILL_A192_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load 24 from %stack.0, align 4, addrspace 5)
@@ -1256,36 +2118,36 @@ body:             |
     ; MUBUF-LABEL: name: test_spill_a8
     ; MUBUF: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 = IMPLICIT_DEF
     ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 :: (store 4 into %stack.0, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 :: (store 4 into %stack.0, addrspace 5)
     ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 :: (store 4 into %stack.0 + 4, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 :: (store 4 into %stack.0 + 4, addrspace 5)
     ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr2, implicit $exec
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 :: (store 4 into %stack.0 + 8, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 :: (store 4 into %stack.0 + 8, addrspace 5)
     ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr3, implicit $exec
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 :: (store 4 into %stack.0 + 12, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 :: (store 4 into %stack.0 + 12, addrspace 5)
     ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr4, implicit $exec
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 :: (store 4 into %stack.0 + 16, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 :: (store 4 into %stack.0 + 16, addrspace 5)
     ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr5, implicit $exec
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 :: (store 4 into %stack.0 + 20, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 :: (store 4 into %stack.0 + 20, addrspace 5)
     ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr6, implicit $exec
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 :: (store 4 into %stack.0 + 24, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 :: (store 4 into %stack.0 + 24, addrspace 5)
     ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr7, implicit $exec
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, 0, 0, 0, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 :: (store 4 into %stack.0 + 28, addrspace 5)
-    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, 0, 0, 0, 0, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 :: (store 4 into %stack.0 + 28, addrspace 5)
+    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0, addrspace 5)
     ; MUBUF: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
-    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 4, addrspace 5)
+    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 4, addrspace 5)
     ; MUBUF: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
-    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 8, addrspace 5)
+    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 8, addrspace 5)
     ; MUBUF: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
-    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 12, addrspace 5)
+    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 12, addrspace 5)
     ; MUBUF: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
-    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 16, addrspace 5)
+    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 16, addrspace 5)
     ; MUBUF: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
-    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 20, addrspace 5)
+    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 20, addrspace 5)
     ; MUBUF: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
-    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 24, addrspace 5)
+    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 24, addrspace 5)
     ; MUBUF: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
-    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 28, addrspace 5)
+    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 28, addrspace 5)
     ; MUBUF: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
     ; MUBUF: S_ENDPGM 0
     ; MUBUF-V2A-LABEL: name: test_spill_a8
@@ -1311,36 +2173,36 @@ body:             |
     ; FLATSCR-LABEL: name: test_spill_a8
     ; FLATSCR: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 = IMPLICIT_DEF
     ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
-    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 :: (store 4 into %stack.0, addrspace 5)
+    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 :: (store 4 into %stack.0, addrspace 5)
     ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec
-    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 4, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 :: (store 4 into %stack.0 + 4, addrspace 5)
+    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 4, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 :: (store 4 into %stack.0 + 4, addrspace 5)
     ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr2, implicit $exec
-    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 8, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 :: (store 4 into %stack.0 + 8, addrspace 5)
+    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 8, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 :: (store 4 into %stack.0 + 8, addrspace 5)
     ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr3, implicit $exec
-    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 12, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 :: (store 4 into %stack.0 + 12, addrspace 5)
+    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 12, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 :: (store 4 into %stack.0 + 12, addrspace 5)
     ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr4, implicit $exec
-    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 16, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 :: (store 4 into %stack.0 + 16, addrspace 5)
+    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 16, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 :: (store 4 into %stack.0 + 16, addrspace 5)
     ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr5, implicit $exec
-    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 20, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 :: (store 4 into %stack.0 + 20, addrspace 5)
+    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 20, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 :: (store 4 into %stack.0 + 20, addrspace 5)
     ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr6, implicit $exec
-    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 24, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 :: (store 4 into %stack.0 + 24, addrspace 5)
+    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 24, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 :: (store 4 into %stack.0 + 24, addrspace 5)
     ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr7, implicit $exec
-    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 28, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 :: (store 4 into %stack.0 + 28, addrspace 5)
-    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0, addrspace 5)
+    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 28, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 :: (store 4 into %stack.0 + 28, addrspace 5)
+    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0, addrspace 5)
     ; FLATSCR: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
-    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 4, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 4, addrspace 5)
+    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 4, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 4, addrspace 5)
     ; FLATSCR: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
-    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 8, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 8, addrspace 5)
+    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 8, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 8, addrspace 5)
     ; FLATSCR: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
-    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 12, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 12, addrspace 5)
+    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 12, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 12, addrspace 5)
     ; FLATSCR: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
-    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 16, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 16, addrspace 5)
+    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 16, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 16, addrspace 5)
     ; FLATSCR: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
-    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 20, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 20, addrspace 5)
+    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 20, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 20, addrspace 5)
     ; FLATSCR: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
-    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 24, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 24, addrspace 5)
+    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 24, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 24, addrspace 5)
     ; FLATSCR: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
-    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 28, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 28, addrspace 5)
+    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 28, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 28, addrspace 5)
     ; FLATSCR: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
     ; FLATSCR: S_ENDPGM 0
     ; FLATSCR-V2A-LABEL: name: test_spill_a8
@@ -1363,6 +2225,72 @@ body:             |
     ; FLATSCR-V2A: $agpr6 = V_ACCVGPR_WRITE_B32_e64 $vgpr6, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
     ; FLATSCR-V2A: $agpr7 = V_ACCVGPR_WRITE_B32_e64 $vgpr7, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
     ; FLATSCR-V2A: S_ENDPGM 0
+    ; MUBUF-GFX90A-LABEL: name: test_spill_a8
+    ; MUBUF-GFX90A: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 = IMPLICIT_DEF
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 :: (store 4 into %stack.0, addrspace 5)
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 :: (store 4 into %stack.0 + 4, addrspace 5)
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 :: (store 4 into %stack.0 + 8, addrspace 5)
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr3, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 :: (store 4 into %stack.0 + 12, addrspace 5)
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr4, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 :: (store 4 into %stack.0 + 16, addrspace 5)
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 :: (store 4 into %stack.0 + 20, addrspace 5)
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr6, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 :: (store 4 into %stack.0 + 24, addrspace 5)
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr7, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, 0, 0, 0, 0, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 :: (store 4 into %stack.0 + 28, addrspace 5)
+    ; MUBUF-GFX90A: $agpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 :: (load 4 from %stack.0, addrspace 5)
+    ; MUBUF-GFX90A: $agpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 :: (load 4 from %stack.0 + 4, addrspace 5)
+    ; MUBUF-GFX90A: $agpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 :: (load 4 from %stack.0 + 8, addrspace 5)
+    ; MUBUF-GFX90A: $agpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 :: (load 4 from %stack.0 + 12, addrspace 5)
+    ; MUBUF-GFX90A: $agpr4 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 :: (load 4 from %stack.0 + 16, addrspace 5)
+    ; MUBUF-GFX90A: $agpr5 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 :: (load 4 from %stack.0 + 20, addrspace 5)
+    ; MUBUF-GFX90A: $agpr6 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 :: (load 4 from %stack.0 + 24, addrspace 5)
+    ; MUBUF-GFX90A: $agpr7 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 :: (load 4 from %stack.0 + 28, addrspace 5)
+    ; MUBUF-GFX90A: S_ENDPGM 0
+    ; MUBUF-GFX90A-V2A-LABEL: name: test_spill_a8
+    ; MUBUF-GFX90A-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7
+    ; MUBUF-GFX90A-V2A: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 = IMPLICIT_DEF
+    ; MUBUF-GFX90A-V2A: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
+    ; MUBUF-GFX90A-V2A: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
+    ; MUBUF-GFX90A-V2A: $vgpr2 = V_ACCVGPR_READ_B32_e64 killed $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
+    ; MUBUF-GFX90A-V2A: $vgpr3 = V_ACCVGPR_READ_B32_e64 killed $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
+    ; MUBUF-GFX90A-V2A: $vgpr4 = V_ACCVGPR_READ_B32_e64 killed $agpr4, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
+    ; MUBUF-GFX90A-V2A: $vgpr5 = V_ACCVGPR_READ_B32_e64 killed $agpr5, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
+    ; MUBUF-GFX90A-V2A: $vgpr6 = V_ACCVGPR_READ_B32_e64 killed $agpr6, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
+    ; MUBUF-GFX90A-V2A: $vgpr7 = V_ACCVGPR_READ_B32_e64 killed $agpr7, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
+    ; MUBUF-GFX90A-V2A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
+    ; MUBUF-GFX90A-V2A: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
+    ; MUBUF-GFX90A-V2A: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
+    ; MUBUF-GFX90A-V2A: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr3, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
+    ; MUBUF-GFX90A-V2A: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $vgpr4, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
+    ; MUBUF-GFX90A-V2A: $agpr5 = V_ACCVGPR_WRITE_B32_e64 $vgpr5, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
+    ; MUBUF-GFX90A-V2A: $agpr6 = V_ACCVGPR_WRITE_B32_e64 $vgpr6, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
+    ; MUBUF-GFX90A-V2A: $agpr7 = V_ACCVGPR_WRITE_B32_e64 $vgpr7, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
+    ; MUBUF-GFX90A-V2A: S_ENDPGM 0
+    ; FLATSCR-GFX90A-LABEL: name: test_spill_a8
+    ; FLATSCR-GFX90A: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 = IMPLICIT_DEF
+    ; FLATSCR-GFX90A: SCRATCH_STORE_DWORDX4_SADDR killed $agpr0_agpr1_agpr2_agpr3, $sgpr32, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 :: (store 16 into %stack.0, align 4, addrspace 5)
+    ; FLATSCR-GFX90A: SCRATCH_STORE_DWORDX4_SADDR killed $agpr4_agpr5_agpr6_agpr7, $sgpr32, 16, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 :: (store 16 into %stack.0 + 16, align 4, addrspace 5)
+    ; FLATSCR-GFX90A: $agpr0_agpr1_agpr2_agpr3 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 :: (load 16 from %stack.0, align 4, addrspace 5)
+    ; FLATSCR-GFX90A: $agpr4_agpr5_agpr6_agpr7 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 16, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 :: (load 16 from %stack.0 + 16, align 4, addrspace 5)
+    ; FLATSCR-GFX90A: S_ENDPGM 0
+    ; FLATSCR-GFX90A-V2A-LABEL: name: test_spill_a8
+    ; FLATSCR-GFX90A-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7
+    ; FLATSCR-GFX90A-V2A: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 = IMPLICIT_DEF
+    ; FLATSCR-GFX90A-V2A: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
+    ; FLATSCR-GFX90A-V2A: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
+    ; FLATSCR-GFX90A-V2A: $vgpr2 = V_ACCVGPR_READ_B32_e64 killed $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
+    ; FLATSCR-GFX90A-V2A: $vgpr3 = V_ACCVGPR_READ_B32_e64 killed $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
+    ; FLATSCR-GFX90A-V2A: $vgpr4 = V_ACCVGPR_READ_B32_e64 killed $agpr4, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
+    ; FLATSCR-GFX90A-V2A: $vgpr5 = V_ACCVGPR_READ_B32_e64 killed $agpr5, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
+    ; FLATSCR-GFX90A-V2A: $vgpr6 = V_ACCVGPR_READ_B32_e64 killed $agpr6, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
+    ; FLATSCR-GFX90A-V2A: $vgpr7 = V_ACCVGPR_READ_B32_e64 killed $agpr7, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
+    ; FLATSCR-GFX90A-V2A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
+    ; FLATSCR-GFX90A-V2A: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
+    ; FLATSCR-GFX90A-V2A: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
+    ; FLATSCR-GFX90A-V2A: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr3, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
+    ; FLATSCR-GFX90A-V2A: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $vgpr4, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
+    ; FLATSCR-GFX90A-V2A: $agpr5 = V_ACCVGPR_WRITE_B32_e64 $vgpr5, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
+    ; FLATSCR-GFX90A-V2A: $agpr6 = V_ACCVGPR_WRITE_B32_e64 $vgpr6, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
+    ; FLATSCR-GFX90A-V2A: $agpr7 = V_ACCVGPR_WRITE_B32_e64 $vgpr7, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
+    ; FLATSCR-GFX90A-V2A: S_ENDPGM 0
     $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 = IMPLICIT_DEF
     SI_SPILL_A256_SAVE killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7, %stack.0, $sgpr32, 0, implicit $exec :: (store 32 into %stack.0, align 4, addrspace 5)
     $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 = SI_SPILL_A256_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load 32 from %stack.0, align 4, addrspace 5)
@@ -1384,68 +2312,68 @@ body:             |
     ; MUBUF-LABEL: name: test_spill_a16
     ; MUBUF: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = IMPLICIT_DEF
     ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0, addrspace 5)
     ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 4, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 4, addrspace 5)
     ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr2, implicit $exec
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 8, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 8, addrspace 5)
     ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr3, implicit $exec
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 12, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 12, addrspace 5)
     ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr4, implicit $exec
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 16, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 16, addrspace 5)
     ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr5, implicit $exec
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 20, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 20, addrspace 5)
     ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr6, implicit $exec
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 24, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 24, addrspace 5)
     ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr7, implicit $exec
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 28, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 28, addrspace 5)
     ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr8, implicit $exec
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 32, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 32, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 32, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 32, addrspace 5)
     ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr9, implicit $exec
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 36, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 36, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 36, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 36, addrspace 5)
     ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr10, implicit $exec
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 40, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 40, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 40, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 40, addrspace 5)
     ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr11, implicit $exec
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 44, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 44, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 44, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 44, addrspace 5)
     ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr12, implicit $exec
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 48, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 48, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 48, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 48, addrspace 5)
     ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr13, implicit $exec
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 52, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 52, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 52, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 52, addrspace 5)
     ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr14, implicit $exec
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 56, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 56, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 56, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 56, addrspace 5)
     ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr15, implicit $exec
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 60, 0, 0, 0, 0, 0, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 60, addrspace 5)
-    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 60, 0, 0, 0, 0, 0, 0, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 60, addrspace 5)
+    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0, addrspace 5)
     ; MUBUF: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
-    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 4, addrspace 5)
+    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 4, addrspace 5)
     ; MUBUF: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
-    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 8, addrspace 5)
+    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 8, addrspace 5)
     ; MUBUF: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
-    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 12, addrspace 5)
+    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 12, addrspace 5)
     ; MUBUF: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
-    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 16, addrspace 5)
+    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 16, addrspace 5)
     ; MUBUF: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
-    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 20, addrspace 5)
+    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 20, addrspace 5)
     ; MUBUF: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
-    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 24, addrspace 5)
+    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 24, addrspace 5)
     ; MUBUF: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
-    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 28, addrspace 5)
+    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 28, addrspace 5)
     ; MUBUF: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
-    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 32, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 32, addrspace 5)
+    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 32, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 32, addrspace 5)
     ; MUBUF: $agpr8 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
-    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 36, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 36, addrspace 5)
+    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 36, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 36, addrspace 5)
     ; MUBUF: $agpr9 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
-    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 40, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 40, addrspace 5)
+    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 40, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 40, addrspace 5)
     ; MUBUF: $agpr10 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
-    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 44, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 44, addrspace 5)
+    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 44, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 44, addrspace 5)
     ; MUBUF: $agpr11 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
-    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 48, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 48, addrspace 5)
+    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 48, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 48, addrspace 5)
     ; MUBUF: $agpr12 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
-    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 52, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 52, addrspace 5)
+    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 52, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 52, addrspace 5)
     ; MUBUF: $agpr13 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
-    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 56, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 56, addrspace 5)
+    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 56, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 56, addrspace 5)
     ; MUBUF: $agpr14 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
-    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 60, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 60, addrspace 5)
+    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 60, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 60, addrspace 5)
     ; MUBUF: $agpr15 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
     ; MUBUF: S_ENDPGM 0
     ; MUBUF-V2A-LABEL: name: test_spill_a16
@@ -1487,68 +2415,68 @@ body:             |
     ; FLATSCR-LABEL: name: test_spill_a16
     ; FLATSCR: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = IMPLICIT_DEF
     ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
-    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0, addrspace 5)
+    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0, addrspace 5)
     ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec
-    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 4, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 4, addrspace 5)
+    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 4, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 4, addrspace 5)
     ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr2, implicit $exec
-    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 8, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 8, addrspace 5)
+    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 8, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 8, addrspace 5)
     ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr3, implicit $exec
-    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 12, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 12, addrspace 5)
+    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 12, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 12, addrspace 5)
     ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr4, implicit $exec
-    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 16, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 16, addrspace 5)
+    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 16, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 16, addrspace 5)
     ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr5, implicit $exec
-    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 20, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 20, addrspace 5)
+    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 20, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 20, addrspace 5)
     ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr6, implicit $exec
-    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 24, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 24, addrspace 5)
+    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 24, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 24, addrspace 5)
     ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr7, implicit $exec
-    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 28, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 28, addrspace 5)
+    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 28, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 28, addrspace 5)
     ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr8, implicit $exec
-    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 32, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 32, addrspace 5)
+    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 32, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 32, addrspace 5)
     ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr9, implicit $exec
-    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 36, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 36, addrspace 5)
+    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 36, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 36, addrspace 5)
     ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr10, implicit $exec
-    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 40, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 40, addrspace 5)
+    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 40, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 40, addrspace 5)
     ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr11, implicit $exec
-    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 44, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 44, addrspace 5)
+    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 44, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 44, addrspace 5)
     ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr12, implicit $exec
-    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 48, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 48, addrspace 5)
+    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 48, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 48, addrspace 5)
     ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr13, implicit $exec
-    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 52, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 52, addrspace 5)
+    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 52, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 52, addrspace 5)
     ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr14, implicit $exec
-    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 56, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 56, addrspace 5)
+    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 56, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 56, addrspace 5)
     ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr15, implicit $exec
-    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 60, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 60, addrspace 5)
-    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0, addrspace 5)
+    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 60, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 60, addrspace 5)
+    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0, addrspace 5)
     ; FLATSCR: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
-    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 4, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 4, addrspace 5)
+    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 4, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 4, addrspace 5)
     ; FLATSCR: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
-    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 8, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 8, addrspace 5)
+    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 8, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 8, addrspace 5)
     ; FLATSCR: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
-    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 12, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 12, addrspace 5)
+    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 12, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 12, addrspace 5)
     ; FLATSCR: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
-    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 16, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 16, addrspace 5)
+    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 16, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 16, addrspace 5)
     ; FLATSCR: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
-    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 20, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 20, addrspace 5)
+    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 20, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 20, addrspace 5)
     ; FLATSCR: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
-    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 24, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 24, addrspace 5)
+    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 24, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 24, addrspace 5)
     ; FLATSCR: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
-    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 28, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 28, addrspace 5)
+    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 28, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 28, addrspace 5)
     ; FLATSCR: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
-    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 32, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 32, addrspace 5)
+    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 32, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 32, addrspace 5)
     ; FLATSCR: $agpr8 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
-    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 36, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 36, addrspace 5)
+    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 36, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 36, addrspace 5)
     ; FLATSCR: $agpr9 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
-    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 40, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 40, addrspace 5)
+    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 40, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 40, addrspace 5)
     ; FLATSCR: $agpr10 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
-    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 44, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 44, addrspace 5)
+    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 44, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 44, addrspace 5)
     ; FLATSCR: $agpr11 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
-    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 48, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 48, addrspace 5)
+    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 48, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 48, addrspace 5)
     ; FLATSCR: $agpr12 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
-    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 52, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 52, addrspace 5)
+    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 52, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 52, addrspace 5)
     ; FLATSCR: $agpr13 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
-    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 56, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 56, addrspace 5)
+    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 56, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 56, addrspace 5)
     ; FLATSCR: $agpr14 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
-    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 60, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 60, addrspace 5)
+    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 60, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 60, addrspace 5)
     ; FLATSCR: $agpr15 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
     ; FLATSCR: S_ENDPGM 0
     ; FLATSCR-V2A-LABEL: name: test_spill_a16
@@ -1587,6 +2515,124 @@ body:             |
     ; FLATSCR-V2A: $agpr14 = V_ACCVGPR_WRITE_B32_e64 $vgpr14, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
     ; FLATSCR-V2A: $agpr15 = V_ACCVGPR_WRITE_B32_e64 $vgpr15, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
     ; FLATSCR-V2A: S_ENDPGM 0
+    ; MUBUF-GFX90A-LABEL: name: test_spill_a16
+    ; MUBUF-GFX90A: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = IMPLICIT_DEF
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0, addrspace 5)
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 4, addrspace 5)
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 8, addrspace 5)
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr3, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 12, addrspace 5)
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr4, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 16, addrspace 5)
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 20, addrspace 5)
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr6, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 24, addrspace 5)
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr7, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 28, addrspace 5)
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr8, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 32, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 32, addrspace 5)
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr9, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 36, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 36, addrspace 5)
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr10, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 40, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 40, addrspace 5)
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr11, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 44, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 44, addrspace 5)
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr12, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 48, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 48, addrspace 5)
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr13, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 52, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 52, addrspace 5)
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr14, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 56, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 56, addrspace 5)
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr15, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 60, 0, 0, 0, 0, 0, 0, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 60, addrspace 5)
+    ; MUBUF-GFX90A: $agpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (load 4 from %stack.0, addrspace 5)
+    ; MUBUF-GFX90A: $agpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (load 4 from %stack.0 + 4, addrspace 5)
+    ; MUBUF-GFX90A: $agpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (load 4 from %stack.0 + 8, addrspace 5)
+    ; MUBUF-GFX90A: $agpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (load 4 from %stack.0 + 12, addrspace 5)
+    ; MUBUF-GFX90A: $agpr4 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (load 4 from %stack.0 + 16, addrspace 5)
+    ; MUBUF-GFX90A: $agpr5 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (load 4 from %stack.0 + 20, addrspace 5)
+    ; MUBUF-GFX90A: $agpr6 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (load 4 from %stack.0 + 24, addrspace 5)
+    ; MUBUF-GFX90A: $agpr7 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (load 4 from %stack.0 + 28, addrspace 5)
+    ; MUBUF-GFX90A: $agpr8 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 32, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (load 4 from %stack.0 + 32, addrspace 5)
+    ; MUBUF-GFX90A: $agpr9 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 36, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (load 4 from %stack.0 + 36, addrspace 5)
+    ; MUBUF-GFX90A: $agpr10 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 40, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (load 4 from %stack.0 + 40, addrspace 5)
+    ; MUBUF-GFX90A: $agpr11 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 44, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (load 4 from %stack.0 + 44, addrspace 5)
+    ; MUBUF-GFX90A: $agpr12 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 48, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (load 4 from %stack.0 + 48, addrspace 5)
+    ; MUBUF-GFX90A: $agpr13 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 52, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (load 4 from %stack.0 + 52, addrspace 5)
+    ; MUBUF-GFX90A: $agpr14 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 56, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (load 4 from %stack.0 + 56, addrspace 5)
+    ; MUBUF-GFX90A: $agpr15 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 60, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (load 4 from %stack.0 + 60, addrspace 5)
+    ; MUBUF-GFX90A: S_ENDPGM 0
+    ; MUBUF-GFX90A-V2A-LABEL: name: test_spill_a16
+    ; MUBUF-GFX90A-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15
+    ; MUBUF-GFX90A-V2A: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = IMPLICIT_DEF
+    ; MUBUF-GFX90A-V2A: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; MUBUF-GFX90A-V2A: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; MUBUF-GFX90A-V2A: $vgpr2 = V_ACCVGPR_READ_B32_e64 killed $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; MUBUF-GFX90A-V2A: $vgpr3 = V_ACCVGPR_READ_B32_e64 killed $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; MUBUF-GFX90A-V2A: $vgpr4 = V_ACCVGPR_READ_B32_e64 killed $agpr4, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; MUBUF-GFX90A-V2A: $vgpr5 = V_ACCVGPR_READ_B32_e64 killed $agpr5, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; MUBUF-GFX90A-V2A: $vgpr6 = V_ACCVGPR_READ_B32_e64 killed $agpr6, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; MUBUF-GFX90A-V2A: $vgpr7 = V_ACCVGPR_READ_B32_e64 killed $agpr7, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; MUBUF-GFX90A-V2A: $vgpr8 = V_ACCVGPR_READ_B32_e64 killed $agpr8, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; MUBUF-GFX90A-V2A: $vgpr9 = V_ACCVGPR_READ_B32_e64 killed $agpr9, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; MUBUF-GFX90A-V2A: $vgpr10 = V_ACCVGPR_READ_B32_e64 killed $agpr10, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; MUBUF-GFX90A-V2A: $vgpr11 = V_ACCVGPR_READ_B32_e64 killed $agpr11, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; MUBUF-GFX90A-V2A: $vgpr12 = V_ACCVGPR_READ_B32_e64 killed $agpr12, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; MUBUF-GFX90A-V2A: $vgpr13 = V_ACCVGPR_READ_B32_e64 killed $agpr13, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; MUBUF-GFX90A-V2A: $vgpr14 = V_ACCVGPR_READ_B32_e64 killed $agpr14, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; MUBUF-GFX90A-V2A: $vgpr15 = V_ACCVGPR_READ_B32_e64 killed $agpr15, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; MUBUF-GFX90A-V2A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; MUBUF-GFX90A-V2A: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; MUBUF-GFX90A-V2A: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; MUBUF-GFX90A-V2A: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr3, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; MUBUF-GFX90A-V2A: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $vgpr4, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; MUBUF-GFX90A-V2A: $agpr5 = V_ACCVGPR_WRITE_B32_e64 $vgpr5, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; MUBUF-GFX90A-V2A: $agpr6 = V_ACCVGPR_WRITE_B32_e64 $vgpr6, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; MUBUF-GFX90A-V2A: $agpr7 = V_ACCVGPR_WRITE_B32_e64 $vgpr7, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; MUBUF-GFX90A-V2A: $agpr8 = V_ACCVGPR_WRITE_B32_e64 $vgpr8, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; MUBUF-GFX90A-V2A: $agpr9 = V_ACCVGPR_WRITE_B32_e64 $vgpr9, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; MUBUF-GFX90A-V2A: $agpr10 = V_ACCVGPR_WRITE_B32_e64 $vgpr10, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; MUBUF-GFX90A-V2A: $agpr11 = V_ACCVGPR_WRITE_B32_e64 $vgpr11, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; MUBUF-GFX90A-V2A: $agpr12 = V_ACCVGPR_WRITE_B32_e64 $vgpr12, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; MUBUF-GFX90A-V2A: $agpr13 = V_ACCVGPR_WRITE_B32_e64 $vgpr13, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; MUBUF-GFX90A-V2A: $agpr14 = V_ACCVGPR_WRITE_B32_e64 $vgpr14, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; MUBUF-GFX90A-V2A: $agpr15 = V_ACCVGPR_WRITE_B32_e64 $vgpr15, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; MUBUF-GFX90A-V2A: S_ENDPGM 0
+    ; FLATSCR-GFX90A-LABEL: name: test_spill_a16
+    ; FLATSCR-GFX90A: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = IMPLICIT_DEF
+    ; FLATSCR-GFX90A: SCRATCH_STORE_DWORDX4_SADDR killed $agpr0_agpr1_agpr2_agpr3, $sgpr32, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 16 into %stack.0, align 4, addrspace 5)
+    ; FLATSCR-GFX90A: SCRATCH_STORE_DWORDX4_SADDR killed $agpr4_agpr5_agpr6_agpr7, $sgpr32, 16, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 16 into %stack.0 + 16, align 4, addrspace 5)
+    ; FLATSCR-GFX90A: SCRATCH_STORE_DWORDX4_SADDR killed $agpr8_agpr9_agpr10_agpr11, $sgpr32, 32, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 16 into %stack.0 + 32, align 4, addrspace 5)
+    ; FLATSCR-GFX90A: SCRATCH_STORE_DWORDX4_SADDR killed $agpr12_agpr13_agpr14_agpr15, $sgpr32, 48, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 16 into %stack.0 + 48, align 4, addrspace 5)
+    ; FLATSCR-GFX90A: $agpr0_agpr1_agpr2_agpr3 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (load 16 from %stack.0, align 4, addrspace 5)
+    ; FLATSCR-GFX90A: $agpr4_agpr5_agpr6_agpr7 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 16, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (load 16 from %stack.0 + 16, align 4, addrspace 5)
+    ; FLATSCR-GFX90A: $agpr8_agpr9_agpr10_agpr11 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 32, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (load 16 from %stack.0 + 32, align 4, addrspace 5)
+    ; FLATSCR-GFX90A: $agpr12_agpr13_agpr14_agpr15 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 48, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (load 16 from %stack.0 + 48, align 4, addrspace 5)
+    ; FLATSCR-GFX90A: S_ENDPGM 0
+    ; FLATSCR-GFX90A-V2A-LABEL: name: test_spill_a16
+    ; FLATSCR-GFX90A-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15
+    ; FLATSCR-GFX90A-V2A: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = IMPLICIT_DEF
+    ; FLATSCR-GFX90A-V2A: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; FLATSCR-GFX90A-V2A: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; FLATSCR-GFX90A-V2A: $vgpr2 = V_ACCVGPR_READ_B32_e64 killed $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; FLATSCR-GFX90A-V2A: $vgpr3 = V_ACCVGPR_READ_B32_e64 killed $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; FLATSCR-GFX90A-V2A: $vgpr4 = V_ACCVGPR_READ_B32_e64 killed $agpr4, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; FLATSCR-GFX90A-V2A: $vgpr5 = V_ACCVGPR_READ_B32_e64 killed $agpr5, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; FLATSCR-GFX90A-V2A: $vgpr6 = V_ACCVGPR_READ_B32_e64 killed $agpr6, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; FLATSCR-GFX90A-V2A: $vgpr7 = V_ACCVGPR_READ_B32_e64 killed $agpr7, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; FLATSCR-GFX90A-V2A: $vgpr8 = V_ACCVGPR_READ_B32_e64 killed $agpr8, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; FLATSCR-GFX90A-V2A: $vgpr9 = V_ACCVGPR_READ_B32_e64 killed $agpr9, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; FLATSCR-GFX90A-V2A: $vgpr10 = V_ACCVGPR_READ_B32_e64 killed $agpr10, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; FLATSCR-GFX90A-V2A: $vgpr11 = V_ACCVGPR_READ_B32_e64 killed $agpr11, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; FLATSCR-GFX90A-V2A: $vgpr12 = V_ACCVGPR_READ_B32_e64 killed $agpr12, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; FLATSCR-GFX90A-V2A: $vgpr13 = V_ACCVGPR_READ_B32_e64 killed $agpr13, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; FLATSCR-GFX90A-V2A: $vgpr14 = V_ACCVGPR_READ_B32_e64 killed $agpr14, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; FLATSCR-GFX90A-V2A: $vgpr15 = V_ACCVGPR_READ_B32_e64 killed $agpr15, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; FLATSCR-GFX90A-V2A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; FLATSCR-GFX90A-V2A: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; FLATSCR-GFX90A-V2A: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; FLATSCR-GFX90A-V2A: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr3, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; FLATSCR-GFX90A-V2A: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $vgpr4, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; FLATSCR-GFX90A-V2A: $agpr5 = V_ACCVGPR_WRITE_B32_e64 $vgpr5, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; FLATSCR-GFX90A-V2A: $agpr6 = V_ACCVGPR_WRITE_B32_e64 $vgpr6, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; FLATSCR-GFX90A-V2A: $agpr7 = V_ACCVGPR_WRITE_B32_e64 $vgpr7, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; FLATSCR-GFX90A-V2A: $agpr8 = V_ACCVGPR_WRITE_B32_e64 $vgpr8, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; FLATSCR-GFX90A-V2A: $agpr9 = V_ACCVGPR_WRITE_B32_e64 $vgpr9, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; FLATSCR-GFX90A-V2A: $agpr10 = V_ACCVGPR_WRITE_B32_e64 $vgpr10, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; FLATSCR-GFX90A-V2A: $agpr11 = V_ACCVGPR_WRITE_B32_e64 $vgpr11, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; FLATSCR-GFX90A-V2A: $agpr12 = V_ACCVGPR_WRITE_B32_e64 $vgpr12, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; FLATSCR-GFX90A-V2A: $agpr13 = V_ACCVGPR_WRITE_B32_e64 $vgpr13, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; FLATSCR-GFX90A-V2A: $agpr14 = V_ACCVGPR_WRITE_B32_e64 $vgpr14, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; FLATSCR-GFX90A-V2A: $agpr15 = V_ACCVGPR_WRITE_B32_e64 $vgpr15, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; FLATSCR-GFX90A-V2A: S_ENDPGM 0
     $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = IMPLICIT_DEF
     SI_SPILL_A512_SAVE killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, %stack.0, $sgpr32, 0, implicit $exec :: (store 64 into %stack.0, align 4, addrspace 5)
     $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = SI_SPILL_A512_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load 64 from %stack.0, align 4, addrspace 5)
@@ -1608,132 +2654,132 @@ body:             |
     ; MUBUF-LABEL: name: test_spill_a32
     ; MUBUF: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 = IMPLICIT_DEF
     ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0, addrspace 5)
     ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 4, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 4, addrspace 5)
     ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr2, implicit $exec
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 8, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 8, addrspace 5)
     ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr3, implicit $exec
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 12, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 12, addrspace 5)
     ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr4, implicit $exec
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 16, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 16, addrspace 5)
     ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr5, implicit $exec
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 20, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 20, addrspace 5)
     ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr6, implicit $exec
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 24, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 24, addrspace 5)
     ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr7, implicit $exec
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 28, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 28, addrspace 5)
     ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr8, implicit $exec
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 32, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 32, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 32, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 32, addrspace 5)
     ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr9, implicit $exec
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 36, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 36, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 36, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 36, addrspace 5)
     ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr10, implicit $exec
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 40, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 40, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 40, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 40, addrspace 5)
     ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr11, implicit $exec
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 44, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 44, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 44, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 44, addrspace 5)
     ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr12, implicit $exec
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 48, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 48, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 48, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 48, addrspace 5)
     ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr13, implicit $exec
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 52, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 52, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 52, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 52, addrspace 5)
     ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr14, implicit $exec
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 56, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 56, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 56, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 56, addrspace 5)
     ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr15, implicit $exec
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 60, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 60, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 60, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 60, addrspace 5)
     ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr16, implicit $exec
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 64, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 64, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 64, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 64, addrspace 5)
     ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr17, implicit $exec
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 68, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 68, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 68, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 68, addrspace 5)
     ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr18, implicit $exec
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 72, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 72, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 72, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 72, addrspace 5)
     ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr19, implicit $exec
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 76, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 76, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 76, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 76, addrspace 5)
     ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr20, implicit $exec
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 80, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 80, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 80, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 80, addrspace 5)
     ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr21, implicit $exec
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 84, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 84, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 84, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 84, addrspace 5)
     ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr22, implicit $exec
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 88, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 88, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 88, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 88, addrspace 5)
     ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr23, implicit $exec
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 92, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 92, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 92, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 92, addrspace 5)
     ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr24, implicit $exec
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 96, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 96, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 96, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 96, addrspace 5)
     ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr25, implicit $exec
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 100, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 100, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 100, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 100, addrspace 5)
     ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr26, implicit $exec
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 104, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 104, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 104, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 104, addrspace 5)
     ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr27, implicit $exec
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 108, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 108, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 108, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 108, addrspace 5)
     ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr28, implicit $exec
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 112, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 112, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 112, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 112, addrspace 5)
     ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr29, implicit $exec
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 116, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 116, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 116, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 116, addrspace 5)
     ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr30, implicit $exec
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 120, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 120, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 120, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 120, addrspace 5)
     ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr31, implicit $exec
-    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 124, 0, 0, 0, 0, 0, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 124, addrspace 5)
-    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0, addrspace 5)
+    ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 124, 0, 0, 0, 0, 0, 0, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 124, addrspace 5)
+    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0, addrspace 5)
     ; MUBUF: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
-    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 4, addrspace 5)
+    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 4, addrspace 5)
     ; MUBUF: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
-    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 8, addrspace 5)
+    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 8, addrspace 5)
     ; MUBUF: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
-    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 12, addrspace 5)
+    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 12, addrspace 5)
     ; MUBUF: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
-    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 16, addrspace 5)
+    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 16, addrspace 5)
     ; MUBUF: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
-    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 20, addrspace 5)
+    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 20, addrspace 5)
     ; MUBUF: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
-    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 24, addrspace 5)
+    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 24, addrspace 5)
     ; MUBUF: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
-    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 28, addrspace 5)
+    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 28, addrspace 5)
     ; MUBUF: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
-    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 32, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 32, addrspace 5)
+    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 32, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 32, addrspace 5)
     ; MUBUF: $agpr8 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
-    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 36, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 36, addrspace 5)
+    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 36, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 36, addrspace 5)
     ; MUBUF: $agpr9 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
-    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 40, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 40, addrspace 5)
+    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 40, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 40, addrspace 5)
     ; MUBUF: $agpr10 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
-    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 44, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 44, addrspace 5)
+    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 44, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 44, addrspace 5)
     ; MUBUF: $agpr11 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
-    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 48, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 48, addrspace 5)
+    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 48, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 48, addrspace 5)
     ; MUBUF: $agpr12 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
-    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 52, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 52, addrspace 5)
+    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 52, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 52, addrspace 5)
     ; MUBUF: $agpr13 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
-    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 56, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 56, addrspace 5)
+    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 56, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 56, addrspace 5)
     ; MUBUF: $agpr14 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
-    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 60, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 60, addrspace 5)
+    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 60, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 60, addrspace 5)
     ; MUBUF: $agpr15 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
-    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 64, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 64, addrspace 5)
+    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 64, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 64, addrspace 5)
     ; MUBUF: $agpr16 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
-    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 68, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 68, addrspace 5)
+    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 68, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 68, addrspace 5)
     ; MUBUF: $agpr17 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
-    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 72, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 72, addrspace 5)
+    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 72, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 72, addrspace 5)
     ; MUBUF: $agpr18 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
-    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 76, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 76, addrspace 5)
+    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 76, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 76, addrspace 5)
     ; MUBUF: $agpr19 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
-    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 80, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 80, addrspace 5)
+    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 80, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 80, addrspace 5)
     ; MUBUF: $agpr20 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
-    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 84, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 84, addrspace 5)
+    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 84, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 84, addrspace 5)
     ; MUBUF: $agpr21 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
-    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 88, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 88, addrspace 5)
+    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 88, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 88, addrspace 5)
     ; MUBUF: $agpr22 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
-    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 92, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 92, addrspace 5)
+    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 92, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 92, addrspace 5)
     ; MUBUF: $agpr23 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
-    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 96, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 96, addrspace 5)
+    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 96, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 96, addrspace 5)
     ; MUBUF: $agpr24 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
-    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 100, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 100, addrspace 5)
+    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 100, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 100, addrspace 5)
     ; MUBUF: $agpr25 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
-    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 104, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 104, addrspace 5)
+    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 104, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 104, addrspace 5)
     ; MUBUF: $agpr26 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
-    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 108, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 108, addrspace 5)
+    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 108, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 108, addrspace 5)
     ; MUBUF: $agpr27 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
-    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 112, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 112, addrspace 5)
+    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 112, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 112, addrspace 5)
     ; MUBUF: $agpr28 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
-    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 116, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 116, addrspace 5)
+    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 116, addrspace 5)
     ; MUBUF: $agpr29 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
-    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 120, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 120, addrspace 5)
+    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 120, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 120, addrspace 5)
     ; MUBUF: $agpr30 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
-    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 124, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 124, addrspace 5)
+    ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 124, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 124, addrspace 5)
     ; MUBUF: $agpr31 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
     ; MUBUF: S_ENDPGM 0
     ; MUBUF-V2A-LABEL: name: test_spill_a32
@@ -1807,132 +2853,132 @@ body:             |
     ; FLATSCR-LABEL: name: test_spill_a32
     ; FLATSCR: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 = IMPLICIT_DEF
     ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
-    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0, addrspace 5)
+    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0, addrspace 5)
     ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec
-    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 4, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 4, addrspace 5)
+    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 4, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 4, addrspace 5)
     ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr2, implicit $exec
-    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 8, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 8, addrspace 5)
+    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 8, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 8, addrspace 5)
     ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr3, implicit $exec
-    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 12, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 12, addrspace 5)
+    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 12, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 12, addrspace 5)
     ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr4, implicit $exec
-    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 16, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 16, addrspace 5)
+    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 16, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 16, addrspace 5)
     ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr5, implicit $exec
-    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 20, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 20, addrspace 5)
+    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 20, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 20, addrspace 5)
     ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr6, implicit $exec
-    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 24, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 24, addrspace 5)
+    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 24, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 24, addrspace 5)
     ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr7, implicit $exec
-    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 28, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 28, addrspace 5)
+    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 28, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 28, addrspace 5)
     ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr8, implicit $exec
-    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 32, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 32, addrspace 5)
+    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 32, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 32, addrspace 5)
     ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr9, implicit $exec
-    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 36, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 36, addrspace 5)
+    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 36, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 36, addrspace 5)
     ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr10, implicit $exec
-    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 40, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 40, addrspace 5)
+    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 40, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 40, addrspace 5)
     ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr11, implicit $exec
-    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 44, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 44, addrspace 5)
+    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 44, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 44, addrspace 5)
     ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr12, implicit $exec
-    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 48, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 48, addrspace 5)
+    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 48, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 48, addrspace 5)
     ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr13, implicit $exec
-    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 52, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 52, addrspace 5)
+    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 52, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 52, addrspace 5)
     ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr14, implicit $exec
-    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 56, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 56, addrspace 5)
+    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 56, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 56, addrspace 5)
     ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr15, implicit $exec
-    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 60, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 60, addrspace 5)
+    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 60, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 60, addrspace 5)
     ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr16, implicit $exec
-    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 64, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 64, addrspace 5)
+    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 64, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 64, addrspace 5)
     ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr17, implicit $exec
-    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 68, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 68, addrspace 5)
+    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 68, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 68, addrspace 5)
     ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr18, implicit $exec
-    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 72, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 72, addrspace 5)
+    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 72, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 72, addrspace 5)
     ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr19, implicit $exec
-    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 76, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 76, addrspace 5)
+    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 76, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 76, addrspace 5)
     ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr20, implicit $exec
-    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 80, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 80, addrspace 5)
+    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 80, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 80, addrspace 5)
     ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr21, implicit $exec
-    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 84, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 84, addrspace 5)
+    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 84, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 84, addrspace 5)
     ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr22, implicit $exec
-    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 88, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 88, addrspace 5)
+    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 88, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 88, addrspace 5)
     ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr23, implicit $exec
-    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 92, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 92, addrspace 5)
+    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 92, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 92, addrspace 5)
     ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr24, implicit $exec
-    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 96, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 96, addrspace 5)
+    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 96, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 96, addrspace 5)
     ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr25, implicit $exec
-    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 100, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 100, addrspace 5)
+    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 100, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 100, addrspace 5)
     ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr26, implicit $exec
-    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 104, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 104, addrspace 5)
+    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 104, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 104, addrspace 5)
     ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr27, implicit $exec
-    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 108, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 108, addrspace 5)
+    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 108, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 108, addrspace 5)
     ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr28, implicit $exec
-    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 112, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 112, addrspace 5)
+    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 112, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 112, addrspace 5)
     ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr29, implicit $exec
-    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 116, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 116, addrspace 5)
+    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 116, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 116, addrspace 5)
     ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr30, implicit $exec
-    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 120, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 120, addrspace 5)
+    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 120, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 120, addrspace 5)
     ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr31, implicit $exec
-    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 124, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 124, addrspace 5)
-    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0, addrspace 5)
+    ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 124, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 124, addrspace 5)
+    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0, addrspace 5)
     ; FLATSCR: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
-    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 4, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 4, addrspace 5)
+    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 4, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 4, addrspace 5)
     ; FLATSCR: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
-    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 8, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 8, addrspace 5)
+    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 8, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 8, addrspace 5)
     ; FLATSCR: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
-    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 12, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 12, addrspace 5)
+    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 12, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 12, addrspace 5)
     ; FLATSCR: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
-    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 16, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 16, addrspace 5)
+    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 16, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 16, addrspace 5)
     ; FLATSCR: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
-    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 20, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 20, addrspace 5)
+    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 20, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 20, addrspace 5)
     ; FLATSCR: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
-    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 24, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 24, addrspace 5)
+    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 24, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 24, addrspace 5)
     ; FLATSCR: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
-    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 28, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 28, addrspace 5)
+    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 28, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 28, addrspace 5)
     ; FLATSCR: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
-    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 32, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 32, addrspace 5)
+    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 32, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 32, addrspace 5)
     ; FLATSCR: $agpr8 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
-    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 36, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 36, addrspace 5)
+    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 36, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 36, addrspace 5)
     ; FLATSCR: $agpr9 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
-    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 40, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 40, addrspace 5)
+    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 40, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 40, addrspace 5)
     ; FLATSCR: $agpr10 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
-    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 44, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 44, addrspace 5)
+    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 44, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 44, addrspace 5)
     ; FLATSCR: $agpr11 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
-    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 48, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 48, addrspace 5)
+    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 48, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 48, addrspace 5)
     ; FLATSCR: $agpr12 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
-    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 52, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 52, addrspace 5)
+    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 52, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 52, addrspace 5)
     ; FLATSCR: $agpr13 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
-    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 56, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 56, addrspace 5)
+    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 56, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 56, addrspace 5)
     ; FLATSCR: $agpr14 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
-    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 60, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 60, addrspace 5)
+    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 60, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 60, addrspace 5)
     ; FLATSCR: $agpr15 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
-    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 64, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 64, addrspace 5)
+    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 64, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 64, addrspace 5)
     ; FLATSCR: $agpr16 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
-    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 68, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 68, addrspace 5)
+    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 68, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 68, addrspace 5)
     ; FLATSCR: $agpr17 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
-    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 72, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 72, addrspace 5)
+    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 72, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 72, addrspace 5)
     ; FLATSCR: $agpr18 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
-    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 76, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 76, addrspace 5)
+    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 76, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 76, addrspace 5)
     ; FLATSCR: $agpr19 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
-    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 80, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 80, addrspace 5)
+    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 80, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 80, addrspace 5)
     ; FLATSCR: $agpr20 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
-    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 84, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 84, addrspace 5)
+    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 84, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 84, addrspace 5)
     ; FLATSCR: $agpr21 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
-    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 88, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 88, addrspace 5)
+    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 88, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 88, addrspace 5)
     ; FLATSCR: $agpr22 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
-    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 92, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 92, addrspace 5)
+    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 92, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 92, addrspace 5)
     ; FLATSCR: $agpr23 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
-    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 96, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 96, addrspace 5)
+    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 96, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 96, addrspace 5)
     ; FLATSCR: $agpr24 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
-    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 100, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 100, addrspace 5)
+    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 100, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 100, addrspace 5)
     ; FLATSCR: $agpr25 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
-    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 104, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 104, addrspace 5)
+    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 104, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 104, addrspace 5)
     ; FLATSCR: $agpr26 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
-    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 108, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 108, addrspace 5)
+    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 108, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 108, addrspace 5)
     ; FLATSCR: $agpr27 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
-    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 112, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 112, addrspace 5)
+    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 112, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 112, addrspace 5)
     ; FLATSCR: $agpr28 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
-    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 116, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 116, addrspace 5)
+    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 116, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 116, addrspace 5)
     ; FLATSCR: $agpr29 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
-    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 120, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 120, addrspace 5)
+    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 120, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 120, addrspace 5)
     ; FLATSCR: $agpr30 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
-    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 124, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 124, addrspace 5)
+    ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 124, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 124, addrspace 5)
     ; FLATSCR: $agpr31 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
     ; FLATSCR: S_ENDPGM 0
     ; FLATSCR-V2A-LABEL: name: test_spill_a32
@@ -2003,6 +3049,228 @@ body:             |
     ; FLATSCR-V2A: $agpr30 = V_ACCVGPR_WRITE_B32_e64 $vgpr30, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
     ; FLATSCR-V2A: $agpr31 = V_ACCVGPR_WRITE_B32_e64 $vgpr31, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
     ; FLATSCR-V2A: S_ENDPGM 0
+    ; MUBUF-GFX90A-LABEL: name: test_spill_a32
+    ; MUBUF-GFX90A: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 = IMPLICIT_DEF
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0, addrspace 5)
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 4, addrspace 5)
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 8, addrspace 5)
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr3, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 12, addrspace 5)
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr4, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 16, addrspace 5)
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 20, addrspace 5)
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr6, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 24, addrspace 5)
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr7, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 28, addrspace 5)
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr8, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 32, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 32, addrspace 5)
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr9, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 36, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 36, addrspace 5)
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr10, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 40, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 40, addrspace 5)
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr11, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 44, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 44, addrspace 5)
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr12, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 48, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 48, addrspace 5)
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr13, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 52, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 52, addrspace 5)
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr14, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 56, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 56, addrspace 5)
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr15, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 60, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 60, addrspace 5)
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr16, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 64, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 64, addrspace 5)
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr17, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 68, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 68, addrspace 5)
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr18, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 72, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 72, addrspace 5)
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr19, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 76, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 76, addrspace 5)
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr20, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 80, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 80, addrspace 5)
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr21, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 84, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 84, addrspace 5)
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr22, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 88, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 88, addrspace 5)
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr23, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 92, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 92, addrspace 5)
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr24, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 96, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 96, addrspace 5)
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr25, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 100, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 100, addrspace 5)
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr26, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 104, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 104, addrspace 5)
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr27, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 108, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 108, addrspace 5)
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr28, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 112, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 112, addrspace 5)
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr29, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 116, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 116, addrspace 5)
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr30, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 120, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 120, addrspace 5)
+    ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr31, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 124, 0, 0, 0, 0, 0, 0, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 124, addrspace 5)
+    ; MUBUF-GFX90A: $agpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (load 4 from %stack.0, addrspace 5)
+    ; MUBUF-GFX90A: $agpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (load 4 from %stack.0 + 4, addrspace 5)
+    ; MUBUF-GFX90A: $agpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (load 4 from %stack.0 + 8, addrspace 5)
+    ; MUBUF-GFX90A: $agpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (load 4 from %stack.0 + 12, addrspace 5)
+    ; MUBUF-GFX90A: $agpr4 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (load 4 from %stack.0 + 16, addrspace 5)
+    ; MUBUF-GFX90A: $agpr5 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (load 4 from %stack.0 + 20, addrspace 5)
+    ; MUBUF-GFX90A: $agpr6 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (load 4 from %stack.0 + 24, addrspace 5)
+    ; MUBUF-GFX90A: $agpr7 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (load 4 from %stack.0 + 28, addrspace 5)
+    ; MUBUF-GFX90A: $agpr8 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 32, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (load 4 from %stack.0 + 32, addrspace 5)
+    ; MUBUF-GFX90A: $agpr9 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 36, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (load 4 from %stack.0 + 36, addrspace 5)
+    ; MUBUF-GFX90A: $agpr10 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 40, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (load 4 from %stack.0 + 40, addrspace 5)
+    ; MUBUF-GFX90A: $agpr11 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 44, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (load 4 from %stack.0 + 44, addrspace 5)
+    ; MUBUF-GFX90A: $agpr12 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 48, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (load 4 from %stack.0 + 48, addrspace 5)
+    ; MUBUF-GFX90A: $agpr13 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 52, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (load 4 from %stack.0 + 52, addrspace 5)
+    ; MUBUF-GFX90A: $agpr14 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 56, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (load 4 from %stack.0 + 56, addrspace 5)
+    ; MUBUF-GFX90A: $agpr15 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 60, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (load 4 from %stack.0 + 60, addrspace 5)
+    ; MUBUF-GFX90A: $agpr16 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 64, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (load 4 from %stack.0 + 64, addrspace 5)
+    ; MUBUF-GFX90A: $agpr17 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 68, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (load 4 from %stack.0 + 68, addrspace 5)
+    ; MUBUF-GFX90A: $agpr18 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 72, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (load 4 from %stack.0 + 72, addrspace 5)
+    ; MUBUF-GFX90A: $agpr19 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 76, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (load 4 from %stack.0 + 76, addrspace 5)
+    ; MUBUF-GFX90A: $agpr20 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 80, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (load 4 from %stack.0 + 80, addrspace 5)
+    ; MUBUF-GFX90A: $agpr21 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 84, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (load 4 from %stack.0 + 84, addrspace 5)
+    ; MUBUF-GFX90A: $agpr22 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 88, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (load 4 from %stack.0 + 88, addrspace 5)
+    ; MUBUF-GFX90A: $agpr23 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 92, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (load 4 from %stack.0 + 92, addrspace 5)
+    ; MUBUF-GFX90A: $agpr24 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 96, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (load 4 from %stack.0 + 96, addrspace 5)
+    ; MUBUF-GFX90A: $agpr25 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 100, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (load 4 from %stack.0 + 100, addrspace 5)
+    ; MUBUF-GFX90A: $agpr26 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 104, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (load 4 from %stack.0 + 104, addrspace 5)
+    ; MUBUF-GFX90A: $agpr27 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 108, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (load 4 from %stack.0 + 108, addrspace 5)
+    ; MUBUF-GFX90A: $agpr28 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 112, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (load 4 from %stack.0 + 112, addrspace 5)
+    ; MUBUF-GFX90A: $agpr29 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 116, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (load 4 from %stack.0 + 116, addrspace 5)
+    ; MUBUF-GFX90A: $agpr30 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 120, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (load 4 from %stack.0 + 120, addrspace 5)
+    ; MUBUF-GFX90A: $agpr31 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 124, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (load 4 from %stack.0 + 124, addrspace 5)
+    ; MUBUF-GFX90A: S_ENDPGM 0
+    ; MUBUF-GFX90A-V2A-LABEL: name: test_spill_a32
+    ; MUBUF-GFX90A-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31
+    ; MUBUF-GFX90A-V2A: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 = IMPLICIT_DEF
+    ; MUBUF-GFX90A-V2A: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+    ; MUBUF-GFX90A-V2A: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+    ; MUBUF-GFX90A-V2A: $vgpr2 = V_ACCVGPR_READ_B32_e64 killed $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+    ; MUBUF-GFX90A-V2A: $vgpr3 = V_ACCVGPR_READ_B32_e64 killed $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+    ; MUBUF-GFX90A-V2A: $vgpr4 = V_ACCVGPR_READ_B32_e64 killed $agpr4, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+    ; MUBUF-GFX90A-V2A: $vgpr5 = V_ACCVGPR_READ_B32_e64 killed $agpr5, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+    ; MUBUF-GFX90A-V2A: $vgpr6 = V_ACCVGPR_READ_B32_e64 killed $agpr6, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+    ; MUBUF-GFX90A-V2A: $vgpr7 = V_ACCVGPR_READ_B32_e64 killed $agpr7, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+    ; MUBUF-GFX90A-V2A: $vgpr8 = V_ACCVGPR_READ_B32_e64 killed $agpr8, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+    ; MUBUF-GFX90A-V2A: $vgpr9 = V_ACCVGPR_READ_B32_e64 killed $agpr9, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+    ; MUBUF-GFX90A-V2A: $vgpr10 = V_ACCVGPR_READ_B32_e64 killed $agpr10, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+    ; MUBUF-GFX90A-V2A: $vgpr11 = V_ACCVGPR_READ_B32_e64 killed $agpr11, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+    ; MUBUF-GFX90A-V2A: $vgpr12 = V_ACCVGPR_READ_B32_e64 killed $agpr12, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+    ; MUBUF-GFX90A-V2A: $vgpr13 = V_ACCVGPR_READ_B32_e64 killed $agpr13, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+    ; MUBUF-GFX90A-V2A: $vgpr14 = V_ACCVGPR_READ_B32_e64 killed $agpr14, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+    ; MUBUF-GFX90A-V2A: $vgpr15 = V_ACCVGPR_READ_B32_e64 killed $agpr15, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+    ; MUBUF-GFX90A-V2A: $vgpr16 = V_ACCVGPR_READ_B32_e64 killed $agpr16, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+    ; MUBUF-GFX90A-V2A: $vgpr17 = V_ACCVGPR_READ_B32_e64 killed $agpr17, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+    ; MUBUF-GFX90A-V2A: $vgpr18 = V_ACCVGPR_READ_B32_e64 killed $agpr18, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+    ; MUBUF-GFX90A-V2A: $vgpr19 = V_ACCVGPR_READ_B32_e64 killed $agpr19, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+    ; MUBUF-GFX90A-V2A: $vgpr20 = V_ACCVGPR_READ_B32_e64 killed $agpr20, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+    ; MUBUF-GFX90A-V2A: $vgpr21 = V_ACCVGPR_READ_B32_e64 killed $agpr21, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+    ; MUBUF-GFX90A-V2A: $vgpr22 = V_ACCVGPR_READ_B32_e64 killed $agpr22, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+    ; MUBUF-GFX90A-V2A: $vgpr23 = V_ACCVGPR_READ_B32_e64 killed $agpr23, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+    ; MUBUF-GFX90A-V2A: $vgpr24 = V_ACCVGPR_READ_B32_e64 killed $agpr24, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+    ; MUBUF-GFX90A-V2A: $vgpr25 = V_ACCVGPR_READ_B32_e64 killed $agpr25, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+    ; MUBUF-GFX90A-V2A: $vgpr26 = V_ACCVGPR_READ_B32_e64 killed $agpr26, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+    ; MUBUF-GFX90A-V2A: $vgpr27 = V_ACCVGPR_READ_B32_e64 killed $agpr27, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+    ; MUBUF-GFX90A-V2A: $vgpr28 = V_ACCVGPR_READ_B32_e64 killed $agpr28, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+    ; MUBUF-GFX90A-V2A: $vgpr29 = V_ACCVGPR_READ_B32_e64 killed $agpr29, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+    ; MUBUF-GFX90A-V2A: $vgpr30 = V_ACCVGPR_READ_B32_e64 killed $agpr30, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+    ; MUBUF-GFX90A-V2A: $vgpr31 = V_ACCVGPR_READ_B32_e64 killed $agpr31, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+    ; MUBUF-GFX90A-V2A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+    ; MUBUF-GFX90A-V2A: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+    ; MUBUF-GFX90A-V2A: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+    ; MUBUF-GFX90A-V2A: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr3, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+    ; MUBUF-GFX90A-V2A: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $vgpr4, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+    ; MUBUF-GFX90A-V2A: $agpr5 = V_ACCVGPR_WRITE_B32_e64 $vgpr5, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+    ; MUBUF-GFX90A-V2A: $agpr6 = V_ACCVGPR_WRITE_B32_e64 $vgpr6, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+    ; MUBUF-GFX90A-V2A: $agpr7 = V_ACCVGPR_WRITE_B32_e64 $vgpr7, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+    ; MUBUF-GFX90A-V2A: $agpr8 = V_ACCVGPR_WRITE_B32_e64 $vgpr8, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+    ; MUBUF-GFX90A-V2A: $agpr9 = V_ACCVGPR_WRITE_B32_e64 $vgpr9, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+    ; MUBUF-GFX90A-V2A: $agpr10 = V_ACCVGPR_WRITE_B32_e64 $vgpr10, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+    ; MUBUF-GFX90A-V2A: $agpr11 = V_ACCVGPR_WRITE_B32_e64 $vgpr11, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+    ; MUBUF-GFX90A-V2A: $agpr12 = V_ACCVGPR_WRITE_B32_e64 $vgpr12, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+    ; MUBUF-GFX90A-V2A: $agpr13 = V_ACCVGPR_WRITE_B32_e64 $vgpr13, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+    ; MUBUF-GFX90A-V2A: $agpr14 = V_ACCVGPR_WRITE_B32_e64 $vgpr14, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+    ; MUBUF-GFX90A-V2A: $agpr15 = V_ACCVGPR_WRITE_B32_e64 $vgpr15, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+    ; MUBUF-GFX90A-V2A: $agpr16 = V_ACCVGPR_WRITE_B32_e64 $vgpr16, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+    ; MUBUF-GFX90A-V2A: $agpr17 = V_ACCVGPR_WRITE_B32_e64 $vgpr17, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+    ; MUBUF-GFX90A-V2A: $agpr18 = V_ACCVGPR_WRITE_B32_e64 $vgpr18, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+    ; MUBUF-GFX90A-V2A: $agpr19 = V_ACCVGPR_WRITE_B32_e64 $vgpr19, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+    ; MUBUF-GFX90A-V2A: $agpr20 = V_ACCVGPR_WRITE_B32_e64 $vgpr20, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+    ; MUBUF-GFX90A-V2A: $agpr21 = V_ACCVGPR_WRITE_B32_e64 $vgpr21, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+    ; MUBUF-GFX90A-V2A: $agpr22 = V_ACCVGPR_WRITE_B32_e64 $vgpr22, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+    ; MUBUF-GFX90A-V2A: $agpr23 = V_ACCVGPR_WRITE_B32_e64 $vgpr23, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+    ; MUBUF-GFX90A-V2A: $agpr24 = V_ACCVGPR_WRITE_B32_e64 $vgpr24, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+    ; MUBUF-GFX90A-V2A: $agpr25 = V_ACCVGPR_WRITE_B32_e64 $vgpr25, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+    ; MUBUF-GFX90A-V2A: $agpr26 = V_ACCVGPR_WRITE_B32_e64 $vgpr26, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+    ; MUBUF-GFX90A-V2A: $agpr27 = V_ACCVGPR_WRITE_B32_e64 $vgpr27, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+    ; MUBUF-GFX90A-V2A: $agpr28 = V_ACCVGPR_WRITE_B32_e64 $vgpr28, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+    ; MUBUF-GFX90A-V2A: $agpr29 = V_ACCVGPR_WRITE_B32_e64 $vgpr29, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+    ; MUBUF-GFX90A-V2A: $agpr30 = V_ACCVGPR_WRITE_B32_e64 $vgpr30, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+    ; MUBUF-GFX90A-V2A: $agpr31 = V_ACCVGPR_WRITE_B32_e64 $vgpr31, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+    ; MUBUF-GFX90A-V2A: S_ENDPGM 0
+    ; FLATSCR-GFX90A-LABEL: name: test_spill_a32
+    ; FLATSCR-GFX90A: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 = IMPLICIT_DEF
+    ; FLATSCR-GFX90A: SCRATCH_STORE_DWORDX4_SADDR killed $agpr0_agpr1_agpr2_agpr3, $sgpr32, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 16 into %stack.0, align 4, addrspace 5)
+    ; FLATSCR-GFX90A: SCRATCH_STORE_DWORDX4_SADDR killed $agpr4_agpr5_agpr6_agpr7, $sgpr32, 16, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 16 into %stack.0 + 16, align 4, addrspace 5)
+    ; FLATSCR-GFX90A: SCRATCH_STORE_DWORDX4_SADDR killed $agpr8_agpr9_agpr10_agpr11, $sgpr32, 32, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 16 into %stack.0 + 32, align 4, addrspace 5)
+    ; FLATSCR-GFX90A: SCRATCH_STORE_DWORDX4_SADDR killed $agpr12_agpr13_agpr14_agpr15, $sgpr32, 48, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 16 into %stack.0 + 48, align 4, addrspace 5)
+    ; FLATSCR-GFX90A: SCRATCH_STORE_DWORDX4_SADDR killed $agpr16_agpr17_agpr18_agpr19, $sgpr32, 64, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 16 into %stack.0 + 64, align 4, addrspace 5)
+    ; FLATSCR-GFX90A: SCRATCH_STORE_DWORDX4_SADDR killed $agpr20_agpr21_agpr22_agpr23, $sgpr32, 80, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 16 into %stack.0 + 80, align 4, addrspace 5)
+    ; FLATSCR-GFX90A: SCRATCH_STORE_DWORDX4_SADDR killed $agpr24_agpr25_agpr26_agpr27, $sgpr32, 96, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 16 into %stack.0 + 96, align 4, addrspace 5)
+    ; FLATSCR-GFX90A: SCRATCH_STORE_DWORDX4_SADDR killed $agpr28_agpr29_agpr30_agpr31, $sgpr32, 112, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 16 into %stack.0 + 112, align 4, addrspace 5)
+    ; FLATSCR-GFX90A: $agpr0_agpr1_agpr2_agpr3 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (load 16 from %stack.0, align 4, addrspace 5)
+    ; FLATSCR-GFX90A: $agpr4_agpr5_agpr6_agpr7 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 16, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (load 16 from %stack.0 + 16, align 4, addrspace 5)
+    ; FLATSCR-GFX90A: $agpr8_agpr9_agpr10_agpr11 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 32, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (load 16 from %stack.0 + 32, align 4, addrspace 5)
+    ; FLATSCR-GFX90A: $agpr12_agpr13_agpr14_agpr15 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 48, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (load 16 from %stack.0 + 48, align 4, addrspace 5)
+    ; FLATSCR-GFX90A: $agpr16_agpr17_agpr18_agpr19 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 64, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (load 16 from %stack.0 + 64, align 4, addrspace 5)
+    ; FLATSCR-GFX90A: $agpr20_agpr21_agpr22_agpr23 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 80, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (load 16 from %stack.0 + 80, align 4, addrspace 5)
+    ; FLATSCR-GFX90A: $agpr24_agpr25_agpr26_agpr27 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 96, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (load 16 from %stack.0 + 96, align 4, addrspace 5)
+    ; FLATSCR-GFX90A: $agpr28_agpr29_agpr30_agpr31 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 112, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (load 16 from %stack.0 + 112, align 4, addrspace 5)
+    ; FLATSCR-GFX90A: S_ENDPGM 0
+    ; FLATSCR-GFX90A-V2A-LABEL: name: test_spill_a32
+    ; FLATSCR-GFX90A-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31
+    ; FLATSCR-GFX90A-V2A: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 = IMPLICIT_DEF
+    ; FLATSCR-GFX90A-V2A: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+    ; FLATSCR-GFX90A-V2A: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+    ; FLATSCR-GFX90A-V2A: $vgpr2 = V_ACCVGPR_READ_B32_e64 killed $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+    ; FLATSCR-GFX90A-V2A: $vgpr3 = V_ACCVGPR_READ_B32_e64 killed $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+    ; FLATSCR-GFX90A-V2A: $vgpr4 = V_ACCVGPR_READ_B32_e64 killed $agpr4, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+    ; FLATSCR-GFX90A-V2A: $vgpr5 = V_ACCVGPR_READ_B32_e64 killed $agpr5, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+    ; FLATSCR-GFX90A-V2A: $vgpr6 = V_ACCVGPR_READ_B32_e64 killed $agpr6, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+    ; FLATSCR-GFX90A-V2A: $vgpr7 = V_ACCVGPR_READ_B32_e64 killed $agpr7, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+    ; FLATSCR-GFX90A-V2A: $vgpr8 = V_ACCVGPR_READ_B32_e64 killed $agpr8, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+    ; FLATSCR-GFX90A-V2A: $vgpr9 = V_ACCVGPR_READ_B32_e64 killed $agpr9, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+    ; FLATSCR-GFX90A-V2A: $vgpr10 = V_ACCVGPR_READ_B32_e64 killed $agpr10, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+    ; FLATSCR-GFX90A-V2A: $vgpr11 = V_ACCVGPR_READ_B32_e64 killed $agpr11, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+    ; FLATSCR-GFX90A-V2A: $vgpr12 = V_ACCVGPR_READ_B32_e64 killed $agpr12, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+    ; FLATSCR-GFX90A-V2A: $vgpr13 = V_ACCVGPR_READ_B32_e64 killed $agpr13, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+    ; FLATSCR-GFX90A-V2A: $vgpr14 = V_ACCVGPR_READ_B32_e64 killed $agpr14, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+    ; FLATSCR-GFX90A-V2A: $vgpr15 = V_ACCVGPR_READ_B32_e64 killed $agpr15, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+    ; FLATSCR-GFX90A-V2A: $vgpr16 = V_ACCVGPR_READ_B32_e64 killed $agpr16, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+    ; FLATSCR-GFX90A-V2A: $vgpr17 = V_ACCVGPR_READ_B32_e64 killed $agpr17, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+    ; FLATSCR-GFX90A-V2A: $vgpr18 = V_ACCVGPR_READ_B32_e64 killed $agpr18, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+    ; FLATSCR-GFX90A-V2A: $vgpr19 = V_ACCVGPR_READ_B32_e64 killed $agpr19, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+    ; FLATSCR-GFX90A-V2A: $vgpr20 = V_ACCVGPR_READ_B32_e64 killed $agpr20, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+    ; FLATSCR-GFX90A-V2A: $vgpr21 = V_ACCVGPR_READ_B32_e64 killed $agpr21, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+    ; FLATSCR-GFX90A-V2A: $vgpr22 = V_ACCVGPR_READ_B32_e64 killed $agpr22, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+    ; FLATSCR-GFX90A-V2A: $vgpr23 = V_ACCVGPR_READ_B32_e64 killed $agpr23, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+    ; FLATSCR-GFX90A-V2A: $vgpr24 = V_ACCVGPR_READ_B32_e64 killed $agpr24, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+    ; FLATSCR-GFX90A-V2A: $vgpr25 = V_ACCVGPR_READ_B32_e64 killed $agpr25, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+    ; FLATSCR-GFX90A-V2A: $vgpr26 = V_ACCVGPR_READ_B32_e64 killed $agpr26, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+    ; FLATSCR-GFX90A-V2A: $vgpr27 = V_ACCVGPR_READ_B32_e64 killed $agpr27, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+    ; FLATSCR-GFX90A-V2A: $vgpr28 = V_ACCVGPR_READ_B32_e64 killed $agpr28, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+    ; FLATSCR-GFX90A-V2A: $vgpr29 = V_ACCVGPR_READ_B32_e64 killed $agpr29, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+    ; FLATSCR-GFX90A-V2A: $vgpr30 = V_ACCVGPR_READ_B32_e64 killed $agpr30, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+    ; FLATSCR-GFX90A-V2A: $vgpr31 = V_ACCVGPR_READ_B32_e64 killed $agpr31, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+    ; FLATSCR-GFX90A-V2A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+    ; FLATSCR-GFX90A-V2A: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+    ; FLATSCR-GFX90A-V2A: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+    ; FLATSCR-GFX90A-V2A: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr3, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+    ; FLATSCR-GFX90A-V2A: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $vgpr4, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+    ; FLATSCR-GFX90A-V2A: $agpr5 = V_ACCVGPR_WRITE_B32_e64 $vgpr5, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+    ; FLATSCR-GFX90A-V2A: $agpr6 = V_ACCVGPR_WRITE_B32_e64 $vgpr6, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+    ; FLATSCR-GFX90A-V2A: $agpr7 = V_ACCVGPR_WRITE_B32_e64 $vgpr7, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+    ; FLATSCR-GFX90A-V2A: $agpr8 = V_ACCVGPR_WRITE_B32_e64 $vgpr8, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+    ; FLATSCR-GFX90A-V2A: $agpr9 = V_ACCVGPR_WRITE_B32_e64 $vgpr9, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+    ; FLATSCR-GFX90A-V2A: $agpr10 = V_ACCVGPR_WRITE_B32_e64 $vgpr10, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+    ; FLATSCR-GFX90A-V2A: $agpr11 = V_ACCVGPR_WRITE_B32_e64 $vgpr11, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+    ; FLATSCR-GFX90A-V2A: $agpr12 = V_ACCVGPR_WRITE_B32_e64 $vgpr12, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+    ; FLATSCR-GFX90A-V2A: $agpr13 = V_ACCVGPR_WRITE_B32_e64 $vgpr13, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+    ; FLATSCR-GFX90A-V2A: $agpr14 = V_ACCVGPR_WRITE_B32_e64 $vgpr14, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+    ; FLATSCR-GFX90A-V2A: $agpr15 = V_ACCVGPR_WRITE_B32_e64 $vgpr15, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+    ; FLATSCR-GFX90A-V2A: $agpr16 = V_ACCVGPR_WRITE_B32_e64 $vgpr16, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+    ; FLATSCR-GFX90A-V2A: $agpr17 = V_ACCVGPR_WRITE_B32_e64 $vgpr17, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+    ; FLATSCR-GFX90A-V2A: $agpr18 = V_ACCVGPR_WRITE_B32_e64 $vgpr18, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+    ; FLATSCR-GFX90A-V2A: $agpr19 = V_ACCVGPR_WRITE_B32_e64 $vgpr19, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+    ; FLATSCR-GFX90A-V2A: $agpr20 = V_ACCVGPR_WRITE_B32_e64 $vgpr20, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+    ; FLATSCR-GFX90A-V2A: $agpr21 = V_ACCVGPR_WRITE_B32_e64 $vgpr21, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+    ; FLATSCR-GFX90A-V2A: $agpr22 = V_ACCVGPR_WRITE_B32_e64 $vgpr22, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+    ; FLATSCR-GFX90A-V2A: $agpr23 = V_ACCVGPR_WRITE_B32_e64 $vgpr23, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+    ; FLATSCR-GFX90A-V2A: $agpr24 = V_ACCVGPR_WRITE_B32_e64 $vgpr24, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+    ; FLATSCR-GFX90A-V2A: $agpr25 = V_ACCVGPR_WRITE_B32_e64 $vgpr25, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+    ; FLATSCR-GFX90A-V2A: $agpr26 = V_ACCVGPR_WRITE_B32_e64 $vgpr26, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+    ; FLATSCR-GFX90A-V2A: $agpr27 = V_ACCVGPR_WRITE_B32_e64 $vgpr27, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+    ; FLATSCR-GFX90A-V2A: $agpr28 = V_ACCVGPR_WRITE_B32_e64 $vgpr28, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+    ; FLATSCR-GFX90A-V2A: $agpr29 = V_ACCVGPR_WRITE_B32_e64 $vgpr29, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+    ; FLATSCR-GFX90A-V2A: $agpr30 = V_ACCVGPR_WRITE_B32_e64 $vgpr30, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+    ; FLATSCR-GFX90A-V2A: $agpr31 = V_ACCVGPR_WRITE_B32_e64 $vgpr31, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+    ; FLATSCR-GFX90A-V2A: S_ENDPGM 0
     $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 = IMPLICIT_DEF
     SI_SPILL_A1024_SAVE killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, %stack.0, $sgpr32, 0, implicit $exec :: (store 128 into %stack.0, align 4, addrspace 5)
     $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 = SI_SPILL_A1024_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load 128 from %stack.0, align 4, addrspace 5)

diff  --git a/llvm/test/CodeGen/AMDGPU/pei-reg-scavenger-position.mir b/llvm/test/CodeGen/AMDGPU/pei-reg-scavenger-position.mir
index 51ac3653b61c..1fedbdc2cd93 100644
--- a/llvm/test/CodeGen/AMDGPU/pei-reg-scavenger-position.mir
+++ b/llvm/test/CodeGen/AMDGPU/pei-reg-scavenger-position.mir
@@ -29,12 +29,12 @@ body:             |
   ; CHECK:   $sgpr0 = S_ADD_U32 $sgpr0, $sgpr4, implicit-def $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3
   ; CHECK:   $sgpr1 = S_ADDC_U32 $sgpr1, 0, implicit-def $scc, implicit $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3
   ; CHECK:   $sgpr5 = S_MOV_B32 524288
-  ; CHECK:   $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr5, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0, align 8192, addrspace 5)
+  ; CHECK:   $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr5, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0, align 8192, addrspace 5)
   ; CHECK:   S_BRANCH %bb.1
   ; CHECK: bb.1:
   ; CHECK:   liveins: $sgpr0_sgpr1_sgpr2_sgpr3
   ; CHECK:   $sgpr4 = S_MOV_B32 524288
-  ; CHECK:   $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0, align 8192, addrspace 5)
+  ; CHECK:   $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr4, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0, align 8192, addrspace 5)
   ; CHECK:   S_ENDPGM 0, implicit $vgpr0
   bb.0:
     $vgpr0 = SI_SPILL_V32_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load 4 from %stack.0, addrspace 5)

diff  --git a/llvm/test/CodeGen/AMDGPU/pei-scavenge-vgpr-spill.mir b/llvm/test/CodeGen/AMDGPU/pei-scavenge-vgpr-spill.mir
index 1b03b8c8f9e6..f824c532d0b1 100644
--- a/llvm/test/CodeGen/AMDGPU/pei-scavenge-vgpr-spill.mir
+++ b/llvm/test/CodeGen/AMDGPU/pei-scavenge-vgpr-spill.mir
@@ -27,7 +27,7 @@ body:             |
     ; GFX8: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr2
     ; GFX8: $sgpr4_sgpr5 = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def $scc, implicit $exec
     ; GFX8: $sgpr32 = S_ADD_U32 $sgpr32, 8196, implicit-def $scc
-    ; GFX8: BUFFER_STORE_DWORD_OFFSET killed $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %stack.3, addrspace 5)
+    ; GFX8: BUFFER_STORE_DWORD_OFFSET killed $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %stack.3, addrspace 5)
     ; GFX8: $sgpr32 = S_SUB_U32 $sgpr32, 8196, implicit-def $scc
     ; GFX8: $exec = S_MOV_B64 killed $sgpr4_sgpr5
     ; GFX8: $vgpr2 = V_WRITELANE_B32 $sgpr33, 0, undef $vgpr2
@@ -35,7 +35,7 @@ body:             |
     ; GFX8: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294443008, implicit-def $scc
     ; GFX8: $sgpr32 = frame-setup S_ADD_U32 $sgpr32, 1572864, implicit-def $scc
     ; GFX8: $vgpr0 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec
-    ; GFX8: BUFFER_STORE_DWORD_OFFSET killed $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr33, 20, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %stack.4, addrspace 5)
+    ; GFX8: BUFFER_STORE_DWORD_OFFSET killed $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr33, 20, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %stack.4, addrspace 5)
     ; GFX8: $vgpr3 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec
     ; GFX8: $vcc_lo = S_MOV_B32 8192
     ; GFX8: $vgpr3, dead $vcc = V_ADD_CO_U32_e64 killed $vcc_lo, killed $vgpr3, 0, implicit $exec
@@ -44,15 +44,15 @@ body:             |
     ; GFX8: $sgpr33 = V_READLANE_B32 $vgpr2, 0
     ; GFX8: $sgpr4_sgpr5 = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def $scc, implicit $exec
     ; GFX8: $vgpr0 = V_MOV_B32_e32 8196, implicit $exec
-    ; GFX8: $vgpr2 = BUFFER_LOAD_DWORD_OFFEN killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.3, addrspace 5)
+    ; GFX8: $vgpr2 = BUFFER_LOAD_DWORD_OFFEN killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.3, addrspace 5)
     ; GFX8: $exec = S_MOV_B64 killed $sgpr4_sgpr5
-    ; GFX8: $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr33, 20, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.4, addrspace 5)
+    ; GFX8: $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr33, 20, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.4, addrspace 5)
     ; GFX8: S_ENDPGM 0, csr_amdgpu_allvgprs
     ; GFX9-LABEL: name: pei_scavenge_vgpr_spill
     ; GFX9: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr2
     ; GFX9: $sgpr4_sgpr5 = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def $scc, implicit $exec
     ; GFX9: $sgpr32 = S_ADD_U32 $sgpr32, 8196, implicit-def $scc
-    ; GFX9: BUFFER_STORE_DWORD_OFFSET killed $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %stack.3, addrspace 5)
+    ; GFX9: BUFFER_STORE_DWORD_OFFSET killed $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %stack.3, addrspace 5)
     ; GFX9: $sgpr32 = S_SUB_U32 $sgpr32, 8196, implicit-def $scc
     ; GFX9: $exec = S_MOV_B64 killed $sgpr4_sgpr5
     ; GFX9: $vgpr2 = V_WRITELANE_B32 $sgpr33, 0, undef $vgpr2
@@ -60,7 +60,7 @@ body:             |
     ; GFX9: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294443008, implicit-def $scc
     ; GFX9: $sgpr32 = frame-setup S_ADD_U32 $sgpr32, 1572864, implicit-def $scc
     ; GFX9: $vgpr0 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec
-    ; GFX9: BUFFER_STORE_DWORD_OFFSET killed $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr33, 20, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %stack.4, addrspace 5)
+    ; GFX9: BUFFER_STORE_DWORD_OFFSET killed $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr33, 20, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %stack.4, addrspace 5)
     ; GFX9: $vgpr3 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec
     ; GFX9: $vgpr3 = V_ADD_U32_e32 8192, killed $vgpr3, implicit $exec
     ; GFX9: $vgpr0 = V_OR_B32_e32 killed $vgpr3, $vgpr1, implicit $exec
@@ -68,15 +68,15 @@ body:             |
     ; GFX9: $sgpr33 = V_READLANE_B32 $vgpr2, 0
     ; GFX9: $sgpr4_sgpr5 = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def $scc, implicit $exec
     ; GFX9: $vgpr0 = V_MOV_B32_e32 8196, implicit $exec
-    ; GFX9: $vgpr2 = BUFFER_LOAD_DWORD_OFFEN killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.3, addrspace 5)
+    ; GFX9: $vgpr2 = BUFFER_LOAD_DWORD_OFFEN killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.3, addrspace 5)
     ; GFX9: $exec = S_MOV_B64 killed $sgpr4_sgpr5
-    ; GFX9: $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr33, 20, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.4, addrspace 5)
+    ; GFX9: $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr33, 20, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.4, addrspace 5)
     ; GFX9: S_ENDPGM 0, csr_amdgpu_allvgprs
     ; GFX9-FLATSCR-LABEL: name: pei_scavenge_vgpr_spill
     ; GFX9-FLATSCR: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr2
     ; GFX9-FLATSCR: $sgpr4_sgpr5 = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def $scc, implicit $exec
     ; GFX9-FLATSCR: $sgpr4 = S_ADD_U32 $sgpr32, 8196, implicit-def $scc
-    ; GFX9-FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr2, killed $sgpr4, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into %stack.3, addrspace 5)
+    ; GFX9-FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr2, killed $sgpr4, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into %stack.3, addrspace 5)
     ; GFX9-FLATSCR: $exec = S_MOV_B64 killed $sgpr4_sgpr5
     ; GFX9-FLATSCR: $vgpr2 = V_WRITELANE_B32 $sgpr33, 0, undef $vgpr2
     ; GFX9-FLATSCR: $sgpr33 = frame-setup S_ADD_U32 $sgpr32, 8191, implicit-def $scc
@@ -89,7 +89,7 @@ body:             |
     ; GFX9-FLATSCR: $sgpr33 = V_READLANE_B32 $vgpr2, 0
     ; GFX9-FLATSCR: $sgpr4_sgpr5 = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def $scc, implicit $exec
     ; GFX9-FLATSCR: $sgpr4 = S_ADD_U32 $sgpr32, 8196, implicit-def $scc
-    ; GFX9-FLATSCR: $vgpr2 = SCRATCH_LOAD_DWORD_SADDR killed $sgpr4, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.3, addrspace 5)
+    ; GFX9-FLATSCR: $vgpr2 = SCRATCH_LOAD_DWORD_SADDR killed $sgpr4, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.3, addrspace 5)
     ; GFX9-FLATSCR: $exec = S_MOV_B64 killed $sgpr4_sgpr5
     ; GFX9-FLATSCR: S_ENDPGM 0, csr_amdgpu_allvgprs
     $vgpr0 = V_MOV_B32_e32 %stack.0, implicit $exec

diff  --git a/llvm/test/CodeGen/AMDGPU/phi-elimination-end-cf.mir b/llvm/test/CodeGen/AMDGPU/phi-elimination-end-cf.mir
index 1a2d187bed82..b72b94ecf85c 100644
--- a/llvm/test/CodeGen/AMDGPU/phi-elimination-end-cf.mir
+++ b/llvm/test/CodeGen/AMDGPU/phi-elimination-end-cf.mir
@@ -46,7 +46,7 @@ body:             |
     %15:sreg_32_xm0 = S_MOV_B32 61440
     %16:sreg_32_xm0 = S_MOV_B32 -1
     %17:sgpr_128 = REG_SEQUENCE undef %14:sreg_32_xm0, %subreg.sub0, undef %12:sreg_32_xm0, %subreg.sub1, %16, %subreg.sub2, %15, %subreg.sub3
-    BUFFER_STORE_DWORD_OFFSET %4, %17, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
+    BUFFER_STORE_DWORD_OFFSET %4, %17, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
     %19:vgpr_32 = COPY %4
     %20:sreg_64 = SI_IF %0, %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
     S_BRANCH %bb.3

diff  --git a/llvm/test/CodeGen/AMDGPU/postra-bundle-memops.mir b/llvm/test/CodeGen/AMDGPU/postra-bundle-memops.mir
index c135a77163ef..bc08fb098bd8 100644
--- a/llvm/test/CodeGen/AMDGPU/postra-bundle-memops.mir
+++ b/llvm/test/CodeGen/AMDGPU/postra-bundle-memops.mir
@@ -7,33 +7,33 @@ tracksRegLiveness: true
 body:             |
   bb.0:
     ; GCN-LABEL: name: bundle_memops
-    ; GCN: $vgpr0 = GLOBAL_LOAD_DWORD undef $vgpr3_vgpr4, 0, 0, 0, 0, implicit $exec
+    ; GCN: $vgpr0 = GLOBAL_LOAD_DWORD undef $vgpr3_vgpr4, 0, 0, 0, 0, 0, implicit $exec
     ; GCN: S_NOP 0
     ; GCN: BUNDLE implicit-def $vgpr0, implicit-def $vgpr0_lo16, implicit-def $vgpr0_hi16, implicit-def $vgpr1, implicit-def $vgpr1_lo16, implicit-def $vgpr1_hi16, implicit undef $vgpr3_vgpr4, implicit $exec {
-    ; GCN:   $vgpr0 = GLOBAL_LOAD_DWORD undef $vgpr3_vgpr4, 0, 0, 0, 0, implicit $exec
-    ; GCN:   $vgpr1 = GLOBAL_LOAD_DWORD undef $vgpr3_vgpr4, 4, 0, 0, 0, implicit $exec
+    ; GCN:   $vgpr0 = GLOBAL_LOAD_DWORD undef $vgpr3_vgpr4, 0, 0, 0, 0, 0, implicit $exec
+    ; GCN:   $vgpr1 = GLOBAL_LOAD_DWORD undef $vgpr3_vgpr4, 4, 0, 0, 0, 0, implicit $exec
     ; GCN: }
     ; GCN: S_NOP 0
-    ; GCN: $vgpr0 = GLOBAL_LOAD_DWORD undef $vgpr3_vgpr4, 0, 0, 0, 0, implicit $exec
+    ; GCN: $vgpr0 = GLOBAL_LOAD_DWORD undef $vgpr3_vgpr4, 0, 0, 0, 0, 0, implicit $exec
     ; GCN: BUNDLE implicit-def $vgpr1, implicit-def $vgpr1_lo16, implicit-def $vgpr1_hi16, implicit-def $vgpr2, implicit-def $vgpr2_lo16, implicit-def $vgpr2_hi16, implicit-def $vgpr5, implicit-def $vgpr5_lo16, implicit-def $vgpr5_hi16, implicit undef $vgpr0_vgpr1, implicit $exec, implicit undef $vgpr3_vgpr4 {
-    ; GCN:   $vgpr1 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 4, 0, 0, 0, implicit $exec
-    ; GCN:   $vgpr2 = GLOBAL_LOAD_DWORD undef $vgpr3_vgpr4, 4, 0, 0, 0, implicit $exec
-    ; GCN:   $vgpr5 = GLOBAL_LOAD_DWORD undef $vgpr3_vgpr4, 0, 0, 0, 0, implicit $exec
+    ; GCN:   $vgpr1 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 4, 0, 0, 0, 0, implicit $exec
+    ; GCN:   $vgpr2 = GLOBAL_LOAD_DWORD undef $vgpr3_vgpr4, 4, 0, 0, 0, 0, implicit $exec
+    ; GCN:   $vgpr5 = GLOBAL_LOAD_DWORD undef $vgpr3_vgpr4, 0, 0, 0, 0, 0, implicit $exec
     ; GCN: }
     ; GCN: BUNDLE implicit undef $vgpr3_vgpr4, implicit $vgpr1, implicit $exec, implicit $vgpr0 {
-    ; GCN:   GLOBAL_STORE_DWORD undef $vgpr3_vgpr4, $vgpr1, 0, 0, 0, 0, implicit $exec
-    ; GCN:   GLOBAL_STORE_DWORD undef $vgpr3_vgpr4, $vgpr0, 4, 0, 0, 0, implicit $exec
+    ; GCN:   GLOBAL_STORE_DWORD undef $vgpr3_vgpr4, $vgpr1, 0, 0, 0, 0, 0, implicit $exec
+    ; GCN:   GLOBAL_STORE_DWORD undef $vgpr3_vgpr4, $vgpr0, 4, 0, 0, 0, 0, implicit $exec
     ; GCN: }
     ; GCN: S_NOP 0
     ; GCN: BUNDLE implicit undef $vgpr3_vgpr4, implicit $vgpr1, implicit $exec, implicit $vgpr0 {
-    ; GCN:   GLOBAL_STORE_DWORD undef $vgpr3_vgpr4, $vgpr1, 0, 0, 0, 0, implicit $exec
-    ; GCN:   GLOBAL_STORE_DWORD undef $vgpr3_vgpr4, $vgpr0, 4, 0, 0, 0, implicit $exec
-    ; GCN:   GLOBAL_STORE_DWORD undef $vgpr3_vgpr4, $vgpr1, 0, 0, 0, 0, implicit $exec
+    ; GCN:   GLOBAL_STORE_DWORD undef $vgpr3_vgpr4, $vgpr1, 0, 0, 0, 0, 0, implicit $exec
+    ; GCN:   GLOBAL_STORE_DWORD undef $vgpr3_vgpr4, $vgpr0, 4, 0, 0, 0, 0, implicit $exec
+    ; GCN:   GLOBAL_STORE_DWORD undef $vgpr3_vgpr4, $vgpr1, 0, 0, 0, 0, 0, implicit $exec
     ; GCN: }
     ; GCN: S_NOP 0
-    ; GCN: $vgpr0 = GLOBAL_LOAD_DWORD undef $vgpr3_vgpr4, 0, 0, 0, 0, implicit $exec
+    ; GCN: $vgpr0 = GLOBAL_LOAD_DWORD undef $vgpr3_vgpr4, 0, 0, 0, 0, 0, implicit $exec
     ; GCN: S_NOP 0
-    ; GCN: GLOBAL_STORE_DWORD undef $vgpr3_vgpr4, $vgpr0, 4, 0, 0, 0, implicit $exec
+    ; GCN: GLOBAL_STORE_DWORD undef $vgpr3_vgpr4, $vgpr0, 4, 0, 0, 0, 0, implicit $exec
     ; GCN: BUNDLE implicit-def $vgpr2, implicit-def $vgpr2_lo16, implicit-def $vgpr2_hi16, implicit-def $vgpr3, implicit-def $vgpr3_lo16, implicit-def $vgpr3_hi16, implicit $vgpr0, implicit $exec, implicit $vgpr1 {
     ; GCN:   $vgpr2 = DS_READ_B32_gfx9 $vgpr0, 0, 0, implicit $exec
     ; GCN:   $vgpr3 = DS_READ_B32_gfx9 $vgpr1, 0, 0, implicit $exec
@@ -48,20 +48,20 @@ body:             |
     ; GCN:   $sgpr3 = S_LOAD_DWORD_SGPR undef $sgpr0_sgpr1, undef $sgpr10, 0, 0
     ; GCN: }
     ; GCN: BUNDLE implicit-def $vgpr2, implicit-def $vgpr2_lo16, implicit-def $vgpr2_hi16, implicit-def $vgpr3, implicit-def $vgpr3_lo16, implicit-def $vgpr3_hi16, implicit $vgpr0, implicit undef $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr2, implicit $exec, implicit $vgpr1 {
-    ; GCN:   $vgpr2 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr2, 0, 0, 0, 0, 0, 0, implicit $exec
-    ; GCN:   $vgpr3 = BUFFER_LOAD_DWORD_OFFEN $vgpr1, undef $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr2, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GCN:   $vgpr2 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr2, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GCN:   $vgpr3 = BUFFER_LOAD_DWORD_OFFEN $vgpr1, undef $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr2, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     ; GCN: }
     ; GCN: BUNDLE implicit $vgpr0, implicit $vgpr2_vgpr3, implicit undef $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec {
-    ; GCN:   BUFFER_STORE_DWORD_ADDR64 $vgpr0, $vgpr2_vgpr3, undef $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec
-    ; GCN:   BUFFER_STORE_DWORD_ADDR64 $vgpr0, $vgpr2_vgpr3, undef $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GCN:   BUFFER_STORE_DWORD_ADDR64 $vgpr0, $vgpr2_vgpr3, undef $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GCN:   BUFFER_STORE_DWORD_ADDR64 $vgpr0, $vgpr2_vgpr3, undef $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     ; GCN: }
     ; GCN: BUNDLE implicit-def $vgpr2, implicit-def $vgpr2_lo16, implicit-def $vgpr2_hi16, implicit-def $vgpr3, implicit-def $vgpr3_lo16, implicit-def $vgpr3_hi16, implicit undef $vgpr4_vgpr5_vgpr6_vgpr7, implicit undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $exec {
-    ; GCN:   $vgpr2 = IMAGE_LOAD_V1_V4 undef $vgpr4_vgpr5_vgpr6_vgpr7, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 2, -1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4)
-    ; GCN:   $vgpr3 = IMAGE_LOAD_V1_V4 undef $vgpr4_vgpr5_vgpr6_vgpr7, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 2, -1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4)
+    ; GCN:   $vgpr2 = IMAGE_LOAD_V1_V4 undef $vgpr4_vgpr5_vgpr6_vgpr7, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 2, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4)
+    ; GCN:   $vgpr3 = IMAGE_LOAD_V1_V4 undef $vgpr4_vgpr5_vgpr6_vgpr7, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 2, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4)
     ; GCN: }
     ; GCN: BUNDLE implicit undef $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr0_vgpr1, implicit undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $exec {
-    ; GCN:   IMAGE_STORE_V4_V2 undef $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr0_vgpr1, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 15, -1, 1, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16)
-    ; GCN:   IMAGE_STORE_V4_V2 undef $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr0_vgpr1, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 15, -1, 1, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16)
+    ; GCN:   IMAGE_STORE_V4_V2 undef $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr0_vgpr1, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 15, -1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16)
+    ; GCN:   IMAGE_STORE_V4_V2 undef $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr0_vgpr1, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 15, -1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16)
     ; GCN: }
     ; GCN: S_NOP 0
     ; GCN: $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71 = S_LOAD_DWORDX8_IMM undef $sgpr10_sgpr11, 464, 0, 0
@@ -71,25 +71,25 @@ body:             |
     ; GCN:   $vgpr2 = DS_READ_B32_gfx9 $vgpr0, 0, 0, implicit $exec
     ; GCN:   $vgpr3 = DS_READ_B32_gfx9 $vgpr1, 0, 0, implicit $exec
     ; GCN: }
-    $vgpr0 = GLOBAL_LOAD_DWORD undef $vgpr3_vgpr4, 0, 0, 0, 0, implicit $exec
+    $vgpr0 = GLOBAL_LOAD_DWORD undef $vgpr3_vgpr4, 0, 0, 0, 0, 0, implicit $exec
     S_NOP 0
-    $vgpr0 = GLOBAL_LOAD_DWORD undef $vgpr3_vgpr4, 0, 0, 0, 0, implicit $exec
-    $vgpr1 = GLOBAL_LOAD_DWORD undef $vgpr3_vgpr4, 4, 0, 0, 0, implicit $exec
+    $vgpr0 = GLOBAL_LOAD_DWORD undef $vgpr3_vgpr4, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr1 = GLOBAL_LOAD_DWORD undef $vgpr3_vgpr4, 4, 0, 0, 0, 0, implicit $exec
     S_NOP 0
-    $vgpr0 = GLOBAL_LOAD_DWORD undef $vgpr3_vgpr4, 0, 0, 0, 0, implicit $exec
-    $vgpr1 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 4, 0, 0, 0, implicit $exec
-    $vgpr2 = GLOBAL_LOAD_DWORD undef $vgpr3_vgpr4, 4, 0, 0, 0, implicit $exec
-    $vgpr5 = GLOBAL_LOAD_DWORD undef $vgpr3_vgpr4, 0, 0, 0, 0, implicit $exec
-    GLOBAL_STORE_DWORD undef $vgpr3_vgpr4, $vgpr1, 0, 0, 0, 0, implicit $exec
-    GLOBAL_STORE_DWORD undef $vgpr3_vgpr4, $vgpr0, 4, 0, 0, 0, implicit $exec
+    $vgpr0 = GLOBAL_LOAD_DWORD undef $vgpr3_vgpr4, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr1 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 4, 0, 0, 0, 0, implicit $exec
+    $vgpr2 = GLOBAL_LOAD_DWORD undef $vgpr3_vgpr4, 4, 0, 0, 0, 0, implicit $exec
+    $vgpr5 = GLOBAL_LOAD_DWORD undef $vgpr3_vgpr4, 0, 0, 0, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD undef $vgpr3_vgpr4, $vgpr1, 0, 0, 0, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD undef $vgpr3_vgpr4, $vgpr0, 4, 0, 0, 0, 0, implicit $exec
     S_NOP 0
-    GLOBAL_STORE_DWORD undef $vgpr3_vgpr4, $vgpr1, 0, 0, 0, 0, implicit $exec
-    GLOBAL_STORE_DWORD undef $vgpr3_vgpr4, $vgpr0, 4, 0, 0, 0, implicit $exec
-    GLOBAL_STORE_DWORD undef $vgpr3_vgpr4, $vgpr1, 0, 0, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD undef $vgpr3_vgpr4, $vgpr1, 0, 0, 0, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD undef $vgpr3_vgpr4, $vgpr0, 4, 0, 0, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD undef $vgpr3_vgpr4, $vgpr1, 0, 0, 0, 0, 0, implicit $exec
     S_NOP 0
-    $vgpr0 = GLOBAL_LOAD_DWORD undef $vgpr3_vgpr4, 0, 0, 0, 0, implicit $exec
+    $vgpr0 = GLOBAL_LOAD_DWORD undef $vgpr3_vgpr4, 0, 0, 0, 0, 0, implicit $exec
     S_NOP 0
-    GLOBAL_STORE_DWORD undef $vgpr3_vgpr4, $vgpr0, 4, 0, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD undef $vgpr3_vgpr4, $vgpr0, 4, 0, 0, 0, 0, implicit $exec
     $vgpr2 = DS_READ_B32_gfx9 $vgpr0, 0, 0, implicit $exec
     $vgpr3 = DS_READ_B32_gfx9 $vgpr1, 0, 0, implicit $exec
     DS_WRITE_B32_gfx9 $vgpr0, $vgpr2, 0, 0, implicit killed $m0, implicit $exec
@@ -97,14 +97,14 @@ body:             |
     S_NOP 0
     $sgpr2 = S_LOAD_DWORD_IMM undef $sgpr0_sgpr1, 0, 0, 0
     $sgpr3 = S_LOAD_DWORD_SGPR undef $sgpr0_sgpr1, undef $sgpr10, 0, 0
-    $vgpr2 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr2, 0, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr3 = BUFFER_LOAD_DWORD_OFFEN $vgpr1, undef $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr2, 0, 0, 0, 0, 0, 0, implicit $exec
-    BUFFER_STORE_DWORD_ADDR64 $vgpr0, $vgpr2_vgpr3,  undef $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec
-    BUFFER_STORE_DWORD_ADDR64 $vgpr0, $vgpr2_vgpr3,  undef $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr2 = IMAGE_LOAD_V1_V4 undef $vgpr4_vgpr5_vgpr6_vgpr7, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 2, -1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4)
-    $vgpr3 = IMAGE_LOAD_V1_V4 undef $vgpr4_vgpr5_vgpr6_vgpr7, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 2, -1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4)
-    IMAGE_STORE_V4_V2 undef $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr0_vgpr1, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 15, -1, 1, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16)
-    IMAGE_STORE_V4_V2 undef $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr0_vgpr1, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 15, -1, 1, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16)
+    $vgpr2 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr2, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr3 = BUFFER_LOAD_DWORD_OFFEN $vgpr1, undef $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr2, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    BUFFER_STORE_DWORD_ADDR64 $vgpr0, $vgpr2_vgpr3,  undef $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    BUFFER_STORE_DWORD_ADDR64 $vgpr0, $vgpr2_vgpr3,  undef $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr2 = IMAGE_LOAD_V1_V4 undef $vgpr4_vgpr5_vgpr6_vgpr7, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 2, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4)
+    $vgpr3 = IMAGE_LOAD_V1_V4 undef $vgpr4_vgpr5_vgpr6_vgpr7, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 2, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4)
+    IMAGE_STORE_V4_V2 undef $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr0_vgpr1, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 15, -1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16)
+    IMAGE_STORE_V4_V2 undef $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr0_vgpr1, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 15, -1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16)
     S_NOP 0
     $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71 = S_LOAD_DWORDX8_IMM undef $sgpr10_sgpr11, 464, 0, 0
     $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75 = S_LOAD_DWORDX8_IMM undef $sgpr10_sgpr11, 128, 0, 0
@@ -123,13 +123,13 @@ body:             |
     ; GCN-LABEL: name: bundle_dbg_value_0
     ; GCN: liveins: $vgpr3_vgpr4, $vgpr5_vgpr6
     ; GCN: BUNDLE implicit-def $vgpr0, implicit-def $vgpr0_lo16, implicit-def $vgpr0_hi16, implicit-def $vgpr1, implicit-def $vgpr1_lo16, implicit-def $vgpr1_hi16, implicit $vgpr3_vgpr4, implicit $exec, implicit $vgpr5_vgpr6 {
-    ; GCN:   $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, 0, 0, implicit $exec
+    ; GCN:   $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, 0, 0, 0, implicit $exec
     ; GCN:   DBG_VALUE internal $vgpr0, 0, 0
-    ; GCN:   $vgpr1 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, 0, 0, implicit $exec
+    ; GCN:   $vgpr1 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, 0, 0, 0, implicit $exec
     ; GCN: }
-    $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, 0, 0, implicit $exec
+    $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, 0, 0, 0, implicit $exec
     DBG_VALUE $vgpr0, 0, 0
-    $vgpr1 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, 0, 0, implicit $exec
+    $vgpr1 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, 0, 0, 0, implicit $exec
 
 ...
 
@@ -143,16 +143,16 @@ body:             |
     ; GCN-LABEL: name: bundle_dbg_value_1
     ; GCN: liveins: $vgpr3_vgpr4, $vgpr5_vgpr6, $vgpr1
     ; GCN: BUNDLE implicit-def $vgpr0, implicit-def $vgpr0_lo16, implicit-def $vgpr0_hi16, implicit-def $vgpr2, implicit-def $vgpr2_lo16, implicit-def $vgpr2_hi16, implicit $vgpr3_vgpr4, implicit $exec, implicit $vgpr1, implicit $vgpr5_vgpr6 {
-    ; GCN:   $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, 0, 0, implicit $exec
+    ; GCN:   $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, 0, 0, 0, implicit $exec
     ; GCN:   DBG_VALUE internal $vgpr0, 0, 0
     ; GCN:   DBG_VALUE $vgpr1, 0, 0
-    ; GCN:   $vgpr2 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, 0, 0, implicit $exec
+    ; GCN:   $vgpr2 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, 0, 0, 0, implicit $exec
     ; GCN: }
     ; GCN: DBG_VALUE $vgpr2, 0, 0
-    $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, 0, 0, implicit $exec
+    $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, 0, 0, 0, implicit $exec
     DBG_VALUE $vgpr0, 0, 0
     DBG_VALUE $vgpr1, 0, 0
-    $vgpr2 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, 0, 0, implicit $exec
+    $vgpr2 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, 0, 0, 0, implicit $exec
     DBG_VALUE $vgpr2, 0, 0
 ...
 
@@ -167,15 +167,15 @@ body:             |
     ; GCN: liveins: $vgpr3_vgpr4, $vgpr5_vgpr6, $vgpr1
     ; GCN: DBG_VALUE $vgpr1, 0, 0
     ; GCN: BUNDLE implicit-def $vgpr0, implicit-def $vgpr0_lo16, implicit-def $vgpr0_hi16, implicit-def $vgpr2, implicit-def $vgpr2_lo16, implicit-def $vgpr2_hi16, implicit $vgpr3_vgpr4, implicit $exec, implicit $vgpr5_vgpr6 {
-    ; GCN:   $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, 0, 0, implicit $exec
+    ; GCN:   $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, 0, 0, 0, implicit $exec
     ; GCN:   DBG_VALUE internal $vgpr0, 0, 0
-    ; GCN:   $vgpr2 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, 0, 0, implicit $exec
+    ; GCN:   $vgpr2 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, 0, 0, 0, implicit $exec
     ; GCN: }
     ; GCN: DBG_VALUE $vgpr2, 0, 0
     DBG_VALUE $vgpr1, 0, 0
-    $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, 0, 0, implicit $exec
+    $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, 0, 0, 0, implicit $exec
     DBG_VALUE $vgpr0, 0, 0
-    $vgpr2 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, 0, 0, implicit $exec
+    $vgpr2 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, 0, 0, 0, implicit $exec
     DBG_VALUE $vgpr2, 0, 0
 ...
 
@@ -189,14 +189,14 @@ body:             |
     ; GCN: liveins: $vgpr3_vgpr4, $vgpr5_vgpr6
     ; GCN: $vgpr1 = V_MOV_B32_e32 0, implicit $exec
     ; GCN: BUNDLE implicit-def $vgpr0, implicit-def $vgpr0_lo16, implicit-def $vgpr0_hi16, implicit-def $vgpr2, implicit-def $vgpr2_lo16, implicit-def $vgpr2_hi16, implicit $vgpr3_vgpr4, implicit $exec, implicit $vgpr1, implicit $vgpr5_vgpr6 {
-    ; GCN:   $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, 0, 0, implicit $exec
+    ; GCN:   $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, 0, 0, 0, implicit $exec
     ; GCN:   KILL $vgpr1
-    ; GCN:   $vgpr2 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, 0, 0, implicit $exec
+    ; GCN:   $vgpr2 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, 0, 0, 0, implicit $exec
     ; GCN: }
     $vgpr1 = V_MOV_B32_e32 0, implicit $exec
-    $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, 0, 0, implicit $exec
+    $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, 0, 0, 0, implicit $exec
     KILL $vgpr1
-    $vgpr2 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, 0, 0, implicit $exec
+    $vgpr2 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, 0, 0, 0, implicit $exec
 
 ...
 
@@ -210,14 +210,14 @@ body:             |
     ; GCN: liveins: $vgpr3_vgpr4, $vgpr5_vgpr6
     ; GCN: $vgpr1 = V_MOV_B32_e32 0, implicit $exec
     ; GCN: BUNDLE implicit-def $vgpr0, implicit-def $vgpr0_lo16, implicit-def $vgpr0_hi16, implicit-def $vgpr2, implicit-def $vgpr2_lo16, implicit-def $vgpr2_hi16, implicit $vgpr3_vgpr4, implicit $exec, implicit $vgpr5_vgpr6 {
-    ; GCN:   $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, 0, 0, implicit $exec
+    ; GCN:   $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, 0, 0, 0, implicit $exec
     ; GCN:   KILL internal $vgpr0
-    ; GCN:   $vgpr2 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, 0, 0, implicit $exec
+    ; GCN:   $vgpr2 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, 0, 0, 0, implicit $exec
     ; GCN: }
     $vgpr1 = V_MOV_B32_e32 0, implicit $exec
-    $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, 0, 0, implicit $exec
+    $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, 0, 0, 0, implicit $exec
     KILL $vgpr0
-    $vgpr2 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, 0, 0, implicit $exec
+    $vgpr2 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, 0, 0, 0, implicit $exec
 
 ...
 
@@ -232,11 +232,11 @@ body:             |
 
     ; GCN-LABEL: name: post_bundle_kill
     ; GCN: BUNDLE implicit-def $vgpr0, implicit-def $vgpr0_lo16, implicit-def $vgpr0_hi16, implicit-def $vgpr1, implicit-def $vgpr1_lo16, implicit-def $vgpr1_hi16, implicit $vgpr3_vgpr4, implicit $exec, implicit $vgpr5_vgpr6 {
-    ; GCN:   $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, 0, 0, implicit $exec
-    ; GCN:   $vgpr1 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, 0, 0, implicit $exec
+    ; GCN:   $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, 0, 0, 0, implicit $exec
+    ; GCN:   $vgpr1 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, 0, 0, 0, implicit $exec
     ; GCN: }
-    $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, 0, 0, implicit $exec
-    $vgpr1 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, 0, 0, implicit $exec
+    $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr1 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, 0, 0, 0, implicit $exec
     KILL killed $vgpr3_vgpr4, killed $vgpr5_vgpr6
 ...
 
@@ -249,13 +249,13 @@ body:             |
     ; GCN-LABEL: name: post_bundle_kill_other
     ; GCN: $vgpr7 = V_MOV_B32_e32 0, implicit $exec
     ; GCN: BUNDLE implicit-def $vgpr0, implicit-def $vgpr0_lo16, implicit-def $vgpr0_hi16, implicit-def $vgpr1, implicit-def $vgpr1_lo16, implicit-def $vgpr1_hi16, implicit $vgpr3_vgpr4, implicit $exec, implicit $vgpr5_vgpr6 {
-    ; GCN:   $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, 0, 0, implicit $exec
-    ; GCN:   $vgpr1 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, 0, 0, implicit $exec
+    ; GCN:   $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, 0, 0, 0, implicit $exec
+    ; GCN:   $vgpr1 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, 0, 0, 0, implicit $exec
     ; GCN: }
     ; GCN: KILL killed $vgpr7
     $vgpr7 = V_MOV_B32_e32 0, implicit $exec
-    $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, 0, 0, implicit $exec
-    $vgpr1 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, 0, 0, implicit $exec
+    $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr1 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, 0, 0, 0, implicit $exec
     KILL killed $vgpr7
 ...
 
@@ -269,12 +269,12 @@ body:             |
     ; GCN-LABEL: name: post_bundle_kill_plus_other
     ; GCN: $vgpr7 = V_MOV_B32_e32 0, implicit $exec
     ; GCN: BUNDLE implicit-def $vgpr0, implicit-def $vgpr0_lo16, implicit-def $vgpr0_hi16, implicit-def $vgpr1, implicit-def $vgpr1_lo16, implicit-def $vgpr1_hi16, implicit $vgpr3_vgpr4, implicit $exec, implicit $vgpr5_vgpr6 {
-    ; GCN:   $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, 0, 0, implicit $exec
-    ; GCN:   $vgpr1 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, 0, 0, implicit $exec
+    ; GCN:   $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, 0, 0, 0, implicit $exec
+    ; GCN:   $vgpr1 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, 0, 0, 0, implicit $exec
     ; GCN: }
     ; GCN: KILL killed $vgpr7, killed $vgpr3
     $vgpr7 = V_MOV_B32_e32 0, implicit $exec
-    $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, 0, 0, implicit $exec
-    $vgpr1 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, 0, 0, implicit $exec
+    $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr1 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, 0, 0, 0, implicit $exec
     KILL killed $vgpr7, killed $vgpr3
 ...

diff  --git a/llvm/test/CodeGen/AMDGPU/power-sched-no-instr-sunit.mir b/llvm/test/CodeGen/AMDGPU/power-sched-no-instr-sunit.mir
index 72a2998890d1..d43d8a96601d 100644
--- a/llvm/test/CodeGen/AMDGPU/power-sched-no-instr-sunit.mir
+++ b/llvm/test/CodeGen/AMDGPU/power-sched-no-instr-sunit.mir
@@ -16,6 +16,6 @@ body:             |
     S_BARRIER
     $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 = V_MFMA_F32_32X32X1F32_e64 undef $vgpr0, undef $vgpr0, 0, 0, 0, 2, implicit $mode, implicit $exec
     $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr31, implicit $exec
-    BUFFER_STORE_DWORD_OFFEN killed $vgpr0, undef $vgpr0, $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr6, 0, 0, 0, 0, 0, 0, implicit $exec
+    BUFFER_STORE_DWORD_OFFEN killed $vgpr0, undef $vgpr0, $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr6, 0, 0, 0, 0, 0, 0, 0, implicit $exec
 
 ...

diff  --git a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm-gfx10.mir b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm-gfx10.mir
index 56a4d8c239c7..850435312b74 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm-gfx10.mir
+++ b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm-gfx10.mir
@@ -34,12 +34,12 @@ body:             |
     %26:vgpr_32, %27:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 %25, %21, 0, implicit $exec
     %28:vgpr_32, dead %29:sreg_32_xm0_xexec = V_ADDC_U32_e64 %23, 0, killed %27, 0, implicit $exec
     %30:vreg_64 = REG_SEQUENCE %26, %subreg.sub0, %28, %subreg.sub1
-    %31:vreg_64 = GLOBAL_LOAD_DWORDX2 %30, 0, 0, 0, 0, implicit $exec
+    %31:vreg_64 = GLOBAL_LOAD_DWORDX2 %30, 0, 0, 0, 0, 0, implicit $exec
     %32:sgpr_32 = S_MOV_B32 6144
     %33:vgpr_32, %34:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 %21, %32, 0, implicit $exec
     %35:vgpr_32, dead %36:sreg_32_xm0_xexec = V_ADDC_U32_e64 %23, 0, killed %34, 0, implicit $exec
     %37:vreg_64 = REG_SEQUENCE %33, %subreg.sub0, %35, %subreg.sub1
-    %38:vreg_64 = GLOBAL_LOAD_DWORDX2 %37, 0, 0, 0, 0, implicit $exec
+    %38:vreg_64 = GLOBAL_LOAD_DWORDX2 %37, 0, 0, 0, 0, 0, implicit $exec
 ...
 ---
 
@@ -87,17 +87,17 @@ body:             |
     %26:vgpr_32, %27:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 %21, %25, 0, implicit $exec
     %28:vgpr_32, dead %29:sreg_32_xm0_xexec = V_ADDC_U32_e64 %23, 0, killed %27, 0, implicit $exec
     %30:vreg_64 = REG_SEQUENCE %26, %subreg.sub0, %28, %subreg.sub1
-    %31:vreg_64 = GLOBAL_LOAD_DWORDX2 %30, 0, 0, 0, 0, implicit $exec
+    %31:vreg_64 = GLOBAL_LOAD_DWORDX2 %30, 0, 0, 0, 0, 0, implicit $exec
     %32:sgpr_32 = S_MOV_B32 6400
     %33:vgpr_32, %34:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 %21, %32, 0, implicit $exec
     %35:vgpr_32, dead %36:sreg_32_xm0_xexec = V_ADDC_U32_e64 %23, 0, killed %34, 0, implicit $exec
     %37:vreg_64 = REG_SEQUENCE %33, %subreg.sub0, %35, %subreg.sub1
-    %38:vreg_64 = GLOBAL_LOAD_DWORDX2 %37, 0, 0, 0, 0, implicit $exec
+    %38:vreg_64 = GLOBAL_LOAD_DWORDX2 %37, 0, 0, 0, 0, 0, implicit $exec
     %39:sgpr_32 = S_MOV_B32 11200
     %40:vgpr_32, %41:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 %21, %39, 0, implicit $exec
     %42:vgpr_32, dead %43:sreg_32_xm0_xexec = V_ADDC_U32_e64 %23, 0, killed %41, 0, implicit $exec
     %44:vreg_64 = REG_SEQUENCE %40, %subreg.sub0, %42, %subreg.sub1
-    %45:vreg_64 = GLOBAL_LOAD_DWORDX2 %44, 0, 0, 0, 0, implicit $exec
+    %45:vreg_64 = GLOBAL_LOAD_DWORDX2 %44, 0, 0, 0, 0, 0, implicit $exec
 ...
 ---
 
@@ -144,17 +144,17 @@ body:             |
     %26:vgpr_32, %27:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 %21, %25, 0, implicit $exec
     %28:vgpr_32, dead %29:sreg_32_xm0_xexec = V_ADDC_U32_e64 %23, 0, killed %27, 0, implicit $exec
     %30:vreg_64 = REG_SEQUENCE %26, %subreg.sub0, %28, %subreg.sub1
-    %31:vreg_64 = GLOBAL_LOAD_DWORDX2 %30, 0, 0, 0, 0, implicit $exec
+    %31:vreg_64 = GLOBAL_LOAD_DWORDX2 %30, 0, 0, 0, 0, 0, implicit $exec
     %32:sgpr_32 = S_MOV_B32 8192
     %33:vgpr_32, %34:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 %21, %32, 0, implicit $exec
     %35:vgpr_32, dead %36:sreg_32_xm0_xexec = V_ADDC_U32_e64 %23, 0, killed %34, 0, implicit $exec
     %37:vreg_64 = REG_SEQUENCE %33, %subreg.sub0, %35, %subreg.sub1
-    %38:vreg_64 = GLOBAL_LOAD_DWORDX2 %37, 0, 0, 0, 0, implicit $exec
+    %38:vreg_64 = GLOBAL_LOAD_DWORDX2 %37, 0, 0, 0, 0, 0, implicit $exec
     %39:sgpr_32 = S_MOV_B32 10240
     %40:vgpr_32, %41:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 %21, %39, 0, implicit $exec
     %42:vgpr_32, dead %43:sreg_32_xm0_xexec = V_ADDC_U32_e64 %23, 0, killed %41, 0, implicit $exec
     %44:vreg_64 = REG_SEQUENCE %40, %subreg.sub0, %42, %subreg.sub1
-    %45:vreg_64 = GLOBAL_LOAD_DWORDX2 %44, 0, 0, 0, 0, implicit $exec
+    %45:vreg_64 = GLOBAL_LOAD_DWORDX2 %44, 0, 0, 0, 0, 0, implicit $exec
 ...
 ---
 
@@ -190,7 +190,7 @@ body:             |
     %26:vgpr_32, %27:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 %21, %25, 0, implicit $exec
     %28:vgpr_32, dead %29:sreg_32_xm0_xexec = V_ADDC_U32_e64 %23, 4294967295, killed %27, 0, implicit $exec
     %30:vreg_64 = REG_SEQUENCE %26, %subreg.sub0, %28, %subreg.sub1
-    %31:vreg_64 = GLOBAL_LOAD_DWORDX2 %30, 0, 0, 0, 0, implicit $exec
+    %31:vreg_64 = GLOBAL_LOAD_DWORDX2 %30, 0, 0, 0, 0, 0, implicit $exec
 ...
 ---
 
@@ -208,11 +208,11 @@ body:             |
     %2:vgpr_32, %3:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 %0.sub0, %1, 0, implicit $exec
     %4:vgpr_32, dead %5:sreg_32_xm0_xexec = V_ADDC_U32_e64 %0.sub1, 0, %3, 0, implicit $exec
     %6:vreg_64 = REG_SEQUENCE %2, %subreg.sub0, %4, %subreg.sub1
-    GLOBAL_STORE_DWORD %6, %0.sub0, 0, 0, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD %6, %0.sub0, 0, 0, 0, 0, 0, implicit $exec
 
     %8:sgpr_32 = S_MOV_B32 3000
     %9:vgpr_32, %10:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 %0.sub0, %8, 0, implicit $exec
     %11:vgpr_32, dead %12:sreg_32_xm0_xexec = V_ADDC_U32_e64 %0.sub1, 0, %10, 0, implicit $exec
     %13:vreg_64 = REG_SEQUENCE %9, %subreg.sub0, %11, %subreg.sub1
-    GLOBAL_STORE_DWORD %13, %0.sub1, 0, 0, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD %13, %0.sub1, 0, 0, 0, 0, 0, implicit $exec
 ...

diff  --git a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.mir b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.mir
index c0edddb93cac..c16ca8edb371 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.mir
+++ b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.mir
@@ -34,12 +34,12 @@ body:             |
     %26:vgpr_32, %27:sreg_64_xexec = V_ADD_CO_U32_e64 %25, %21, 0, implicit $exec
     %28:vgpr_32, dead %29:sreg_64_xexec = V_ADDC_U32_e64 %23, 0, killed %27, 0, implicit $exec
     %30:vreg_64 = REG_SEQUENCE %26, %subreg.sub0, %28, %subreg.sub1
-    %31:vreg_64 = GLOBAL_LOAD_DWORDX2 %30, 0, 0, 0, 0, implicit $exec
+    %31:vreg_64 = GLOBAL_LOAD_DWORDX2 %30, 0, 0, 0, 0, 0, implicit $exec
     %32:sgpr_32 = S_MOV_B32 6144
     %33:vgpr_32, %34:sreg_64_xexec = V_ADD_CO_U32_e64 %21, %32, 0, implicit $exec
     %35:vgpr_32, dead %36:sreg_64_xexec = V_ADDC_U32_e64 %23, 0, killed %34, 0, implicit $exec
     %37:vreg_64 = REG_SEQUENCE %33, %subreg.sub0, %35, %subreg.sub1
-    %38:vreg_64 = GLOBAL_LOAD_DWORDX2 %37, 0, 0, 0, 0, implicit $exec
+    %38:vreg_64 = GLOBAL_LOAD_DWORDX2 %37, 0, 0, 0, 0, 0, implicit $exec
 ...
 ---
 
@@ -87,17 +87,17 @@ body:             |
     %26:vgpr_32, %27:sreg_64_xexec = V_ADD_CO_U32_e64 %21, %25, 0, implicit $exec
     %28:vgpr_32, dead %29:sreg_64_xexec = V_ADDC_U32_e64 %23, 0, killed %27, 0, implicit $exec
     %30:vreg_64 = REG_SEQUENCE %26, %subreg.sub0, %28, %subreg.sub1
-    %31:vreg_64 = GLOBAL_LOAD_DWORDX2 %30, 0, 0, 0, 0, implicit $exec
+    %31:vreg_64 = GLOBAL_LOAD_DWORDX2 %30, 0, 0, 0, 0, 0, implicit $exec
     %32:sgpr_32 = S_MOV_B32 6400
     %33:vgpr_32, %34:sreg_64_xexec = V_ADD_CO_U32_e64 %21, %32, 0, implicit $exec
     %35:vgpr_32, dead %36:sreg_64_xexec = V_ADDC_U32_e64 %23, 0, killed %34, 0, implicit $exec
     %37:vreg_64 = REG_SEQUENCE %33, %subreg.sub0, %35, %subreg.sub1
-    %38:vreg_64 = GLOBAL_LOAD_DWORDX2 %37, 0, 0, 0, 0, implicit $exec
+    %38:vreg_64 = GLOBAL_LOAD_DWORDX2 %37, 0, 0, 0, 0, 0, implicit $exec
     %39:sgpr_32 = S_MOV_B32 11200
     %40:vgpr_32, %41:sreg_64_xexec = V_ADD_CO_U32_e64 %21, %39, 0, implicit $exec
     %42:vgpr_32, dead %43:sreg_64_xexec = V_ADDC_U32_e64 %23, 0, killed %41, 0, implicit $exec
     %44:vreg_64 = REG_SEQUENCE %40, %subreg.sub0, %42, %subreg.sub1
-    %45:vreg_64 = GLOBAL_LOAD_DWORDX2 %44, 0, 0, 0, 0, implicit $exec
+    %45:vreg_64 = GLOBAL_LOAD_DWORDX2 %44, 0, 0, 0, 0, 0, implicit $exec
 ...
 ---
 
@@ -140,17 +140,17 @@ body:             |
     %26:vgpr_32, %27:sreg_64_xexec = V_ADD_CO_U32_e64 %21, %25, 0, implicit $exec
     %28:vgpr_32, dead %29:sreg_64_xexec = V_ADDC_U32_e64 %23, 0, killed %27, 0, implicit $exec
     %30:vreg_64 = REG_SEQUENCE %26, %subreg.sub0, %28, %subreg.sub1
-    %31:vreg_64 = GLOBAL_LOAD_DWORDX2 %30, 0, 0, 0, 0, implicit $exec
+    %31:vreg_64 = GLOBAL_LOAD_DWORDX2 %30, 0, 0, 0, 0, 0, implicit $exec
     %32:sgpr_32 = S_MOV_B32 8192
     %33:vgpr_32, %34:sreg_64_xexec = V_ADD_CO_U32_e64 %21, %32, 0, implicit $exec
     %35:vgpr_32, dead %36:sreg_64_xexec = V_ADDC_U32_e64 %23, 0, killed %34, 0, implicit $exec
     %37:vreg_64 = REG_SEQUENCE %33, %subreg.sub0, %35, %subreg.sub1
-    %38:vreg_64 = GLOBAL_LOAD_DWORDX2 %37, 0, 0, 0, 0, implicit $exec
+    %38:vreg_64 = GLOBAL_LOAD_DWORDX2 %37, 0, 0, 0, 0, 0, implicit $exec
     %39:sgpr_32 = S_MOV_B32 10240
     %40:vgpr_32, %41:sreg_64_xexec = V_ADD_CO_U32_e64 %21, %39, 0, implicit $exec
     %42:vgpr_32, dead %43:sreg_64_xexec = V_ADDC_U32_e64 %23, 0, killed %41, 0, implicit $exec
     %44:vreg_64 = REG_SEQUENCE %40, %subreg.sub0, %42, %subreg.sub1
-    %45:vreg_64 = GLOBAL_LOAD_DWORDX2 %44, 0, 0, 0, 0, implicit $exec
+    %45:vreg_64 = GLOBAL_LOAD_DWORDX2 %44, 0, 0, 0, 0, 0, implicit $exec
 ...
 ---
 
@@ -186,13 +186,13 @@ body:             |
     %26:vgpr_32, %27:sreg_64_xexec = V_ADD_CO_U32_e64 %21, %25, 0, implicit $exec
     %28:vgpr_32, dead %29:sreg_64_xexec = V_ADDC_U32_e64 %23, 4294967295, killed %27, 0, implicit $exec
     %30:vreg_64 = REG_SEQUENCE %26, %subreg.sub0, %28, %subreg.sub1
-    %31:vreg_64 = GLOBAL_LOAD_DWORDX2 %30, 0, 0, 0, 0, implicit $exec
+    %31:vreg_64 = GLOBAL_LOAD_DWORDX2 %30, 0, 0, 0, 0, 0, implicit $exec
 ...
 ---
 
 # GFX9-LABEL: name: 
diff oporder_add_store
-# GFX9: GLOBAL_STORE_DWORD %{{[0-9]+}}, %0.sub0, 1000, 0, 0, 0
-# GFX9: GLOBAL_STORE_DWORD %{{[0-9]+}}, %0.sub1, 0, 0, 0, 0
+# GFX9: GLOBAL_STORE_DWORD %{{[0-9]+}}, %0.sub0, 1000, 0, 0, 0, 0
+# GFX9: GLOBAL_STORE_DWORD %{{[0-9]+}}, %0.sub1, 0, 0, 0, 0, 0
 
 name: 
diff oporder_add_store
 body:             |
@@ -204,11 +204,11 @@ body:             |
     %2:vgpr_32, %3:sreg_64_xexec = V_ADD_CO_U32_e64 %0.sub0, %1, 0, implicit $exec
     %4:vgpr_32, dead %5:sreg_64_xexec = V_ADDC_U32_e64 %0.sub1, 0, %3, 0, implicit $exec
     %6:vreg_64 = REG_SEQUENCE %2, %subreg.sub0, %4, %subreg.sub1
-    GLOBAL_STORE_DWORD %6, %0.sub0, 0, 0, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD %6, %0.sub0, 0, 0, 0, 0, 0, implicit $exec
 
     %8:sgpr_32 = S_MOV_B32 3000
     %9:vgpr_32, %10:sreg_64_xexec = V_ADD_CO_U32_e64 %0.sub0, %8, 0, implicit $exec
     %11:vgpr_32, dead %12:sreg_64_xexec = V_ADDC_U32_e64 %0.sub1, 0, %10, 0, implicit $exec
     %13:vreg_64 = REG_SEQUENCE %9, %subreg.sub0, %11, %subreg.sub1
-    GLOBAL_STORE_DWORD %13, %0.sub1, 0, 0, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD %13, %0.sub1, 0, 0, 0, 0, 0, implicit $exec
 ...

diff  --git a/llvm/test/CodeGen/AMDGPU/regbank-reassign.mir b/llvm/test/CodeGen/AMDGPU/regbank-reassign.mir
index 26deaf9666de..3b465c5a10fa 100644
--- a/llvm/test/CodeGen/AMDGPU/regbank-reassign.mir
+++ b/llvm/test/CodeGen/AMDGPU/regbank-reassign.mir
@@ -29,7 +29,7 @@ body: |
   bb.0:
     %0 = IMPLICIT_DEF
     %1 = IMPLICIT_DEF
-    GLOBAL_STORE_DWORD %1, %0, 0, 0, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD %1, %0, 0, 0, 0, 0, 0, implicit $exec
     S_ENDPGM 0
 ...
 
@@ -45,7 +45,7 @@ body: |
   bb.0:
     %0 = IMPLICIT_DEF
     %1 = IMPLICIT_DEF
-    GLOBAL_STORE_DWORDX2 %1, %0, 0, 0, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORDX2 %1, %0, 0, 0, 0, 0, 0, implicit $exec
     S_ENDPGM 0
 ...
 
@@ -241,15 +241,15 @@ body: |
     %14 = IMPLICIT_DEF
     %15 = IMPLICIT_DEF
     %2 = V_AND_B32_e32 %1, %0, implicit $exec
-    GLOBAL_STORE_DWORD %3, %0, 0, 0, 0, 0, implicit $exec
-    GLOBAL_STORE_DWORD %3, %1, 0, 0, 0, 0, implicit $exec
-    GLOBAL_STORE_DWORD %3, %2, 0, 0, 0, 0, implicit $exec
-    GLOBAL_STORE_DWORD %3, %4, 0, 0, 0, 0, implicit $exec
-    GLOBAL_STORE_DWORDX2 %3, %5, 0, 0, 0, 0, implicit $exec
-    GLOBAL_STORE_DWORDX4 %3, %6, 0, 0, 0, 0, implicit $exec
-    GLOBAL_STORE_DWORDX4 %3, %7, 0, 0, 0, 0, implicit $exec
-    GLOBAL_STORE_DWORDX4 %3, %8, 0, 0, 0, 0, implicit $exec
-    GLOBAL_STORE_DWORDX4 %3, %9, 0, 0, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD %3, %0, 0, 0, 0, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD %3, %1, 0, 0, 0, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD %3, %2, 0, 0, 0, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD %3, %4, 0, 0, 0, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORDX2 %3, %5, 0, 0, 0, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORDX4 %3, %6, 0, 0, 0, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORDX4 %3, %7, 0, 0, 0, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORDX4 %3, %8, 0, 0, 0, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORDX4 %3, %9, 0, 0, 0, 0, 0, implicit $exec
     S_ENDPGM 0
 ...
 
@@ -287,18 +287,18 @@ body: |
     %11 = IMPLICIT_DEF
     %12 = IMPLICIT_DEF
     %2 = V_AND_B32_e32 %1, %0, implicit $exec
-    GLOBAL_STORE_DWORD %3, %0, 0, 0, 0, 0, implicit $exec
-    GLOBAL_STORE_DWORD %3, %1, 0, 0, 0, 0, implicit $exec
-    GLOBAL_STORE_DWORD %3, %2, 0, 0, 0, 0, implicit $exec
-    GLOBAL_STORE_DWORD %3, %4, 0, 0, 0, 0, implicit $exec
-    GLOBAL_STORE_DWORDX2 %3, %5, 0, 0, 0, 0, implicit $exec
-    GLOBAL_STORE_DWORDX4 %3, %6, 0, 0, 0, 0, implicit $exec
-    GLOBAL_STORE_DWORDX4 %3, %7, 0, 0, 0, 0, implicit $exec
-    GLOBAL_STORE_DWORDX4 %3, %8, 0, 0, 0, 0, implicit $exec
-    GLOBAL_STORE_DWORDX4 %3, %9, 0, 0, 0, 0, implicit $exec
-    GLOBAL_STORE_DWORDX4 %3, %10, 0, 0, 0, 0, implicit $exec
-    GLOBAL_STORE_DWORDX4 %3, %11, 0, 0, 0, 0, implicit $exec
-    GLOBAL_STORE_DWORD %3, %12, 0, 0, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD %3, %0, 0, 0, 0, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD %3, %1, 0, 0, 0, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD %3, %2, 0, 0, 0, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD %3, %4, 0, 0, 0, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORDX2 %3, %5, 0, 0, 0, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORDX4 %3, %6, 0, 0, 0, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORDX4 %3, %7, 0, 0, 0, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORDX4 %3, %8, 0, 0, 0, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORDX4 %3, %9, 0, 0, 0, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORDX4 %3, %10, 0, 0, 0, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORDX4 %3, %11, 0, 0, 0, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD %3, %12, 0, 0, 0, 0, 0, implicit $exec
     S_ENDPGM 0
 ...
 

diff  --git a/llvm/test/CodeGen/AMDGPU/regcoal-subrange-join-seg.mir b/llvm/test/CodeGen/AMDGPU/regcoal-subrange-join-seg.mir
index 30d6c4f298f3..746581529402 100644
--- a/llvm/test/CodeGen/AMDGPU/regcoal-subrange-join-seg.mir
+++ b/llvm/test/CodeGen/AMDGPU/regcoal-subrange-join-seg.mir
@@ -185,7 +185,7 @@ body:             |
   bb.28:
     %9 = S_FF1_I32_B32 undef %10
     %13 = V_MAD_U32_U24_e64 killed %9, 48, 32, 0, implicit $exec
-    %45 = BUFFER_LOAD_DWORD_OFFEN killed %13, undef %15, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4)
+    %45 = BUFFER_LOAD_DWORD_OFFEN killed %13, undef %15, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4)
     %46 = V_AND_B32_e32 1, killed %45, implicit $exec
     %21 = S_BUFFER_LOAD_DWORD_SGPR undef %22, undef %23, 0, 0 :: (dereferenceable invariant load 4)
     %25 = nofpexcept V_CMP_GE_F32_e64 0, 0, 0, killed %21, 0, implicit $mode, implicit $exec

diff  --git a/llvm/test/CodeGen/AMDGPU/regcoalesce-dbg.mir b/llvm/test/CodeGen/AMDGPU/regcoalesce-dbg.mir
index 46d015dacfc3..6f114a1d881a 100644
--- a/llvm/test/CodeGen/AMDGPU/regcoalesce-dbg.mir
+++ b/llvm/test/CodeGen/AMDGPU/regcoalesce-dbg.mir
@@ -70,7 +70,7 @@ body:             |
     %13.sub2_sub3 = COPY killed %12
     %20 = V_LSHL_B64_e64 killed %19, 2, implicit $exec
     %16 = COPY killed %5
-    BUFFER_STORE_DWORD_ADDR64 killed %16, killed %20, killed %13, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.out)
+    BUFFER_STORE_DWORD_ADDR64 killed %16, killed %20, killed %13, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.out)
     S_ENDPGM 0
 
 ...

diff  --git a/llvm/test/CodeGen/AMDGPU/rename-independent-subregs-mac-operands.mir b/llvm/test/CodeGen/AMDGPU/rename-independent-subregs-mac-operands.mir
index d03f60cc6835..97d6a0c89370 100644
--- a/llvm/test/CodeGen/AMDGPU/rename-independent-subregs-mac-operands.mir
+++ b/llvm/test/CodeGen/AMDGPU/rename-independent-subregs-mac-operands.mir
@@ -62,12 +62,12 @@ body:             |
 
   bb.3:
     %1 = COPY killed %17
-    FLAT_STORE_DWORD undef %10, %1.sub2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    FLAT_STORE_DWORD undef %10, %1.sub2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
     %14 = COPY %1.sub1
     %16 = COPY killed %1.sub0
     undef %15.sub0 = COPY killed %16
     %15.sub1 = COPY killed %14
-    FLAT_STORE_DWORDX2 undef %11, killed %15, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    FLAT_STORE_DWORDX2 undef %11, killed %15, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
     S_ENDPGM 0
 
 ...
@@ -132,10 +132,10 @@ body:             |
     %6.sub2 = COPY %6.sub0
 
   bb.2:
-    BUFFER_STORE_DWORD_OFFEN %6.sub3, %0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 12, 0, 0, 0, 0, 0, implicit $exec
-    BUFFER_STORE_DWORD_OFFEN %6.sub2, %0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 8, 0, 0, 0, 0, 0, implicit $exec
-    BUFFER_STORE_DWORD_OFFEN %6.sub1, %0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, 0, 0, 0, implicit $exec
-    BUFFER_STORE_DWORD_OFFEN %6.sub0, %0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec
+    BUFFER_STORE_DWORD_OFFEN %6.sub3, %0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 12, 0, 0, 0, 0, 0, 0, implicit $exec
+    BUFFER_STORE_DWORD_OFFEN %6.sub2, %0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 8, 0, 0, 0, 0, 0, 0, implicit $exec
+    BUFFER_STORE_DWORD_OFFEN %6.sub1, %0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, 0, 0, 0, 0, implicit $exec
+    BUFFER_STORE_DWORD_OFFEN %6.sub0, %0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     $sgpr30_sgpr31 = COPY %5
     S_SETPC_B64_return $sgpr30_sgpr31
 

diff  --git a/llvm/test/CodeGen/AMDGPU/reserved-reg-in-clause.mir b/llvm/test/CodeGen/AMDGPU/reserved-reg-in-clause.mir
new file mode 100644
index 000000000000..b5a26408ddca
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/reserved-reg-in-clause.mir
@@ -0,0 +1,28 @@
+# RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs -run-pass si-form-memory-clauses %s -o - | FileCheck -check-prefix=GCN %s
+
+# Make sure we do not produce early-clobber list with odd subregs.
+# Odd vector subregs are reserved on gfx90a and verifier complaints after RA.
+
+# GCN-LABEL: name: long_reg_clause
+# GCN: dead early-clobber %2.sub0_sub1_sub2_sub3:areg_512, undef early-clobber %4.sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15:areg_512, dead early-clobber %3:areg_512 = BUNDLE %0, implicit $exec {
+---
+name: long_reg_clause
+body:             |
+  bb.0.entry:
+    %0:vreg_64 = IMPLICIT_DEF
+    undef %1.sub12_sub13_sub14_sub15:areg_512 = GLOBAL_LOAD_DWORDX4 %0, -208, 0, 0, 0, 0, implicit $exec
+    %1.sub8_sub9_sub10_sub11:areg_512 = GLOBAL_LOAD_DWORDX4 %0, -224, 0, 0, 0, 0, implicit $exec
+    %1.sub4_sub5_sub6_sub7:areg_512 = GLOBAL_LOAD_DWORDX4 %0, -240, 0, 0, 0, 0, implicit $exec
+    dead %1.sub0_sub1_sub2_sub3:areg_512 = GLOBAL_LOAD_DWORDX4 %0, -256, 0, 0, 0, 0, implicit $exec
+    undef %2.sub12_sub13_sub14_sub15:areg_512 = GLOBAL_LOAD_DWORDX4 %0, -80, 0, 0, 0, 0, implicit $exec
+    %2.sub8_sub9_sub10_sub11:areg_512 = GLOBAL_LOAD_DWORDX4 %0, -96, 0, 0, 0, 0, implicit $exec
+    %2.sub4_sub5_sub6_sub7:areg_512 = GLOBAL_LOAD_DWORDX4 %0, -112, 0, 0, 0, 0, implicit $exec
+    dead %2.sub0_sub1_sub2_sub3:areg_512 = GLOBAL_LOAD_DWORDX4 %0, -128, 0, 0, 0, 0, implicit $exec
+    undef %3.sub12_sub13_sub14_sub15:areg_512 = GLOBAL_LOAD_DWORDX4 %0, 48, 0, 0, 0, 0, implicit $exec
+    %3.sub8_sub9_sub10_sub11:areg_512 = GLOBAL_LOAD_DWORDX4 %0, 32, 0, 0, 0, 0, implicit $exec
+    %3.sub4_sub5_sub6_sub7:areg_512 = GLOBAL_LOAD_DWORDX4 %0, 16, 0, 0, 0, 0, implicit $exec
+    dead %3.sub0_sub1_sub2_sub3:areg_512 = GLOBAL_LOAD_DWORDX4 %0, 0, 0, 0, 0, 0, implicit $exec
+    undef %4.sub12_sub13_sub14_sub15:areg_512 = GLOBAL_LOAD_DWORDX4 %0, 176, 0, 0, 0, 0, implicit $exec
+    %4.sub8_sub9_sub10_sub11:areg_512 = GLOBAL_LOAD_DWORDX4 %0, 160, 0, 0, 0, 0, implicit $exec
+    %4.sub4_sub5_sub6_sub7:areg_512 = GLOBAL_LOAD_DWORDX4 %0, 144, 0, 0, 0, 0, implicit $exec
+...

diff  --git a/llvm/test/CodeGen/AMDGPU/reserved-vgpr-tuples.mir b/llvm/test/CodeGen/AMDGPU/reserved-vgpr-tuples.mir
new file mode 100644
index 000000000000..61f54a49fa86
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/reserved-vgpr-tuples.mir
@@ -0,0 +1,248 @@
+# RUN: llc -march=amdgcn -mcpu=gfx908 -start-before=greedy -stop-after=virtregrewriter -verify-machineinstrs -o - %s | FileCheck --check-prefixes=GCN,GFX908 %s
+# RUN: llc -march=amdgcn -mcpu=gfx90a -start-before=greedy -stop-after=virtregrewriter -verify-machineinstrs -o - %s | FileCheck --check-prefixes=GCN,GFX90A %s
+
+---
+# GCN-LABEL: name: alloc_vgpr_64
+# GFX908: $vgpr3_vgpr4 = GLOBAL_LOAD
+# GFX90A: $vgpr4_vgpr5 = GLOBAL_LOAD
+name:            alloc_vgpr_64
+tracksRegLiveness: true
+liveins:
+  - { reg: '$vgpr0_vgpr1' }
+  - { reg: '$vgpr2' }
+body:             |
+  bb.0:
+    liveins: $vgpr0_vgpr1, $vgpr2
+
+    %0:vreg_64 = COPY $vgpr0_vgpr1
+    %1:vgpr_32 = COPY $vgpr2
+    %2:vreg_64 = GLOBAL_LOAD_DWORDX2 %0, 0, 0, 0, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORDX2 %0, %2, 0, 0, 0, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD %0, %1, 0, 0, 0, 0, 0, implicit $exec
+...
+
+---
+# GCN-LABEL: name: alloc_vgpr_96
+# GFX908: $vgpr3_vgpr4_vgpr5 = GLOBAL_LOAD
+# GFX90A: $vgpr4_vgpr5_vgpr6 = GLOBAL_LOAD
+name:            alloc_vgpr_96
+tracksRegLiveness: true
+liveins:
+  - { reg: '$vgpr0_vgpr1' }
+  - { reg: '$vgpr2' }
+body:             |
+  bb.0:
+    liveins: $vgpr0_vgpr1, $vgpr2
+
+    %0:vreg_64 = COPY $vgpr0_vgpr1
+    %1:vgpr_32 = COPY $vgpr2
+    %2:vreg_96 = GLOBAL_LOAD_DWORDX3 %0, 0, 0, 0, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORDX3 %0, %2, 0, 0, 0, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD %0, %1, 0, 0, 0, 0, 0, implicit $exec
+...
+
+---
+# GCN-LABEL: name: alloc_vgpr_128
+# GFX908: $vgpr3_vgpr4_vgpr5_vgpr6 = GLOBAL_LOAD
+# GFX90A: $vgpr4_vgpr5_vgpr6_vgpr7 = GLOBAL_LOAD
+name:            alloc_vgpr_128
+tracksRegLiveness: true
+liveins:
+  - { reg: '$vgpr0_vgpr1' }
+  - { reg: '$vgpr2' }
+body:             |
+  bb.0:
+    liveins: $vgpr0_vgpr1, $vgpr2
+
+    %0:vreg_64 = COPY $vgpr0_vgpr1
+    %1:vgpr_32 = COPY $vgpr2
+    %2:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 0, 0, 0, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORDX4 %0, %2, 0, 0, 0, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD %0, %1, 0, 0, 0, 0, 0, implicit $exec
+...
+
+---
+# GCN-LABEL: name: alloc_vgpr_160
+# GFX908: $vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = IMAGE_LOAD
+# GFX90A: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 = IMAGE_LOAD
+name:            alloc_vgpr_160
+tracksRegLiveness: true
+liveins:
+  - { reg: '$vgpr0_vgpr1' }
+  - { reg: '$vgpr2' }
+body:             |
+  bb.0:
+    liveins: $vgpr0_vgpr1, $vgpr2
+
+    %0:vreg_64 = COPY $vgpr0_vgpr1
+    %1:vgpr_32 = COPY $vgpr2
+    %2:vreg_160 = IMAGE_LOAD_V5_V1 %1, undef %3:sgpr_256, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
+    GLOBAL_STORE_DWORDX4 %0, %2.sub0_sub1_sub2_sub3, 0, 0, 0, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD %0, %1, 0, 0, 0, 0, 0, implicit $exec
+...
+
+---
+# GCN-LABEL: name: alloc_vgpr_256
+# GFX908: $vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10 = COPY
+# GFX90A: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 = COPY
+name:            alloc_vgpr_256
+tracksRegLiveness: true
+liveins:
+  - { reg: '$vgpr0_vgpr1' }
+  - { reg: '$vgpr2' }
+body:             |
+  bb.0:
+    liveins: $vgpr0_vgpr1, $vgpr2
+
+    %0:vreg_64 = COPY $vgpr0_vgpr1
+    %1:vgpr_32 = COPY $vgpr2
+    %3:sgpr_256 = IMPLICIT_DEF
+    %2:vreg_256 = COPY %3:sgpr_256
+    %4:vreg_128 = IMAGE_SAMPLE_C_CL_O_V4_V8 %2, %3:sgpr_256, undef %5:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
+    GLOBAL_STORE_DWORDX4 %0, %2.sub0_sub1_sub2_sub3, 0, 0, 0, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD %0, %1, 0, 0, 0, 0, 0, implicit $exec
+...
+
+---
+# GCN-LABEL: name: alloc_vgpr_512
+# GFX908: $vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18 = IMPLICIT_DEF
+# GFX90A: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19 = IMPLICIT_DEF
+name:            alloc_vgpr_512
+tracksRegLiveness: true
+liveins:
+  - { reg: '$vgpr0_vgpr1' }
+  - { reg: '$vgpr2' }
+body:             |
+  bb.0:
+    liveins: $vgpr0_vgpr1, $vgpr2
+
+    %0:vreg_64 = COPY $vgpr0_vgpr1
+    %1:vgpr_32 = COPY $vgpr2
+    %2:vreg_512 = IMPLICIT_DEF
+    GLOBAL_STORE_DWORDX4 %0, %2.sub0_sub1_sub2_sub3, 0, 0, 0, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORDX4 %0, %2.sub4_sub5_sub6_sub7, 0, 0, 0, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORDX4 %0, %2.sub8_sub9_sub10_sub11, 0, 0, 0, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORDX4 %0, %2.sub12_sub13_sub14_sub15, 0, 0, 0, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD %0, %1, 0, 0, 0, 0, 0, implicit $exec
+...
+
+---
+# GCN-LABEL: name: alloc_vgpr_1024
+# GFX908: $vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34 = IMPLICIT_DEF
+# GFX90A: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35 = IMPLICIT_DEF
+name:            alloc_vgpr_1024
+tracksRegLiveness: true
+liveins:
+  - { reg: '$vgpr0_vgpr1' }
+  - { reg: '$vgpr2' }
+body:             |
+  bb.0:
+    liveins: $vgpr0_vgpr1, $vgpr2
+
+    %0:vreg_64 = COPY $vgpr0_vgpr1
+    %1:vgpr_32 = COPY $vgpr2
+    %2:vreg_1024 = IMPLICIT_DEF
+    GLOBAL_STORE_DWORDX4 %0, %2.sub0_sub1_sub2_sub3, 0, 0, 0, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORDX4 %0, %2.sub4_sub5_sub6_sub7, 0, 0, 0, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORDX4 %0, %2.sub8_sub9_sub10_sub11, 0, 0, 0, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORDX4 %0, %2.sub12_sub13_sub14_sub15, 0, 0, 0, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORDX4 %0, %2.sub16_sub17_sub18_sub19, 0, 0, 0, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORDX4 %0, %2.sub20_sub21_sub22_sub23, 0, 0, 0, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORDX4 %0, %2.sub24_sub25_sub26_sub27, 0, 0, 0, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORDX4 %0, %2.sub28_sub29_sub30_sub31, 0, 0, 0, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD %0, %1, 0, 0, 0, 0, 0, implicit $exec
+...
+
+---
+# GCN-LABEL: name: alloc_agpr_64
+# GFX908: $agpr1_agpr2 = IMPLICIT_DEF
+# GFX90A: $agpr2_agpr3 = IMPLICIT_DEF
+name:            alloc_agpr_64
+tracksRegLiveness: true
+liveins:
+  - { reg: '$vgpr0_vgpr1' }
+  - { reg: '$agpr0' }
+body:             |
+  bb.0:
+    liveins: $vgpr0_vgpr1, $agpr0
+
+    %0:vreg_64 = COPY $vgpr0_vgpr1
+    %3:areg_64 = IMPLICIT_DEF
+    %2:vreg_64 = COPY %3:areg_64
+    GLOBAL_STORE_DWORDX2 %0, %2, 0, 0, 0, 0, 0, implicit $exec
+    %1:vgpr_32 = COPY $agpr0
+    GLOBAL_STORE_DWORD %0, %1, 0, 0, 0, 0, 0, implicit $exec
+...
+
+---
+# GCN-LABEL: name: alloc_agpr_128
+# GFX908: $agpr1_agpr2_agpr3_agpr4 = IMPLICIT_DEF
+# GFX90A: $agpr2_agpr3_agpr4_agpr5 = IMPLICIT_DEF
+name:            alloc_agpr_128
+tracksRegLiveness: true
+liveins:
+  - { reg: '$vgpr0_vgpr1' }
+  - { reg: '$agpr0' }
+body:             |
+  bb.0:
+    liveins: $vgpr0_vgpr1, $agpr0
+
+    %0:vreg_64 = COPY $vgpr0_vgpr1
+    %3:areg_128 = IMPLICIT_DEF
+    %2:vreg_128 = COPY %3:areg_128
+    GLOBAL_STORE_DWORDX4 %0, %2, 0, 0, 0, 0, 0, implicit $exec
+    %1:vgpr_32 = COPY $agpr0
+    GLOBAL_STORE_DWORD %0, %1, 0, 0, 0, 0, 0, implicit $exec
+...
+
+---
+# GCN-LABEL: name: alloc_agpr_512
+# GFX908: $agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16 = IMPLICIT_DEF
+# GFX90A: $agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17 = IMPLICIT_DEF
+name:            alloc_agpr_512
+tracksRegLiveness: true
+liveins:
+  - { reg: '$vgpr0_vgpr1' }
+  - { reg: '$agpr0' }
+body:             |
+  bb.0:
+    liveins: $vgpr0_vgpr1, $agpr0
+
+    %0:vreg_64 = COPY $vgpr0_vgpr1
+    %3:areg_512 = IMPLICIT_DEF
+    %2:vreg_512 = COPY %3:areg_512
+    GLOBAL_STORE_DWORDX4 %0, %2.sub0_sub1_sub2_sub3, 0, 0, 0, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORDX4 %0, %2.sub4_sub5_sub6_sub7, 0, 0, 0, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORDX4 %0, %2.sub8_sub9_sub10_sub11, 0, 0, 0, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORDX4 %0, %2.sub12_sub13_sub14_sub15, 0, 0, 0, 0, 0, implicit $exec
+    %1:vgpr_32 = COPY $agpr0
+    GLOBAL_STORE_DWORD %0, %1, 0, 0, 0, 0, 0, implicit $exec
+...
+
+---
+# GCN-LABEL: name: alloc_agpr_1024
+# GFX908: $agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31_agpr32 = IMPLICIT_DEF
+# GFX90A: $agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31_agpr32_agpr33 = IMPLICIT_DEF
+name:            alloc_agpr_1024
+tracksRegLiveness: true
+liveins:
+  - { reg: '$vgpr0_vgpr1' }
+  - { reg: '$agpr0' }
+body:             |
+  bb.0:
+    liveins: $vgpr0_vgpr1, $agpr0
+
+    %0:vreg_64 = COPY $vgpr0_vgpr1
+    %3:areg_1024 = IMPLICIT_DEF
+    %2:vreg_1024 = COPY %3:areg_1024
+    GLOBAL_STORE_DWORDX4 %0, %2.sub0_sub1_sub2_sub3, 0, 0, 0, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORDX4 %0, %2.sub4_sub5_sub6_sub7, 0, 0, 0, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORDX4 %0, %2.sub8_sub9_sub10_sub11, 0, 0, 0, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORDX4 %0, %2.sub12_sub13_sub14_sub15, 0, 0, 0, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORDX4 %0, %2.sub16_sub17_sub18_sub19, 0, 0, 0, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORDX4 %0, %2.sub20_sub21_sub22_sub23, 0, 0, 0, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORDX4 %0, %2.sub24_sub25_sub26_sub27, 0, 0, 0, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORDX4 %0, %2.sub28_sub29_sub30_sub31, 0, 0, 0, 0, 0, implicit $exec
+    %1:vgpr_32 = COPY $agpr0
+    GLOBAL_STORE_DWORD %0, %1, 0, 0, 0, 0, 0, implicit $exec
+...

diff  --git a/llvm/test/CodeGen/AMDGPU/s_code_end.ll b/llvm/test/CodeGen/AMDGPU/s_code_end.ll
index 2c7ce12f8f30..362d6b8497b4 100644
--- a/llvm/test/CodeGen/AMDGPU/s_code_end.ll
+++ b/llvm/test/CodeGen/AMDGPU/s_code_end.ll
@@ -4,6 +4,9 @@
 ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -asm-verbose=0 < %s | FileCheck -check-prefixes=GCN,GCN-ASM,GFX10NOEND %s
 ; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1010 -filetype=obj < %s | llvm-objdump --arch=amdgcn --mcpu=gfx1010 -d - | FileCheck --check-prefixes=GCN,GCN-OBJ,GFX10NOEND,GFX10NOEND-OBJ %s
 
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -asm-verbose=0 < %s | FileCheck -check-prefixes=GCN,GCN-ASM,GFX90AEND-ASM %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -filetype=obj < %s | llvm-objdump --arch=amdgcn --mcpu=gfx90a --disassemble - | FileCheck -check-prefixes=GCN,GCN-OBJ,GFX90AEND-OBJ %s
+
 ; GCN:            a_kernel1{{>?}}:
 ; GCN:                    s_endpgm
 ; GCN-ASM:        [[END_LABEL1:\.Lfunc_end.*]]:
@@ -11,7 +14,7 @@
 
 ; GCN-OBJ-NEXT:           s_nop 0
 
-define amdgpu_kernel void @a_kernel1() {
+define amdgpu_kernel void @a_kernel1() #0 {
   ret void
 }
 
@@ -22,7 +25,7 @@ define amdgpu_kernel void @a_kernel1() {
 
 ; GCN-OBJ:   {{^$}}
 
-define amdgpu_kernel void @a_kernel2() {
+define amdgpu_kernel void @a_kernel2() #0 {
   ret void
 }
 
@@ -35,15 +38,22 @@ define amdgpu_kernel void @a_kernel2() {
 ; GCN-ASM-NEXT:   [[END_LABEL3:\.Lfunc_end.*]]:
 ; GCN-ASM-NEXT:           .size   a_function, [[END_LABEL3]]-a_function
 ; GFX10END-ASM:           .p2alignl 6, 3214868480
+; GFX90AEND-ASM:          .p2alignl 6, 3212836864
 ; GFX10END-ASM-NEXT:      .fill 48, 4, 3214868480
+; GFX90AEND-ASM-NEXT:     .fill 256, 4, 3212836864
 ; GFX10NOEND-NOT:         .fill
 
 ; GFX10NOEND-OBJ-NOT:     s_code_end
 ; GFX10END-OBJ-NEXT:      s_code_end
+; GFX90AEND-OBJ-NEXT:     s_nop 0
 
 ; GFX10END-OBJ:           s_code_end // 000000000140:
 ; GFX10END-OBJ-COUNT-47:  s_code_end
+; GFX90AEND-OBJ:           s_nop 0 // 000000000140:
+; GFX90AEND-OBJ-COUNT-255: s_nop 0
 
-define void @a_function() {
+define void @a_function() #0 {
   ret void
 }
+
+attributes #0 = { "amdgpu-flat-work-group-size"="1,512" }

diff  --git a/llvm/test/CodeGen/AMDGPU/sched-assert-dead-def-subreg-use-other-subreg.mir b/llvm/test/CodeGen/AMDGPU/sched-assert-dead-def-subreg-use-other-subreg.mir
index d10192d8f098..f5fdfbf18113 100644
--- a/llvm/test/CodeGen/AMDGPU/sched-assert-dead-def-subreg-use-other-subreg.mir
+++ b/llvm/test/CodeGen/AMDGPU/sched-assert-dead-def-subreg-use-other-subreg.mir
@@ -27,7 +27,7 @@ body:             |
   ; CHECK:   [[COPY:%[0-9]+]]:vreg_512 = COPY %0
   ; CHECK: bb.1:
   ; CHECK:   successors: %bb.1(0x80000000)
-  ; CHECK:   BUFFER_STORE_DWORD_OFFEN %0.sub3, undef %5:vgpr_32, $sgpr24_sgpr25_sgpr26_sgpr27, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, align 8, addrspace 5)
+  ; CHECK:   BUFFER_STORE_DWORD_OFFEN %0.sub3, undef %5:vgpr_32, $sgpr24_sgpr25_sgpr26_sgpr27, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, align 8, addrspace 5)
   ; CHECK:   dead %6:vgpr_32 = DS_READ_B32_gfx9 undef %7:vgpr_32, 0, 0, implicit $exec
   ; CHECK:   dead %8:vreg_64 = DS_READ_B64_gfx9 [[V_MOV_B32_e32_]], 0, 0, implicit $exec
   ; CHECK:   dead %9:vreg_128 = DS_READ_B128_gfx9 [[V_ADD_U32_e32_]], 0, 0, implicit $exec
@@ -51,7 +51,7 @@ body:             |
     %4:vreg_512 = COPY %0
 
   bb.1:
-    BUFFER_STORE_DWORD_OFFEN %0.sub3, undef %5:vgpr_32, $sgpr24_sgpr25_sgpr26_sgpr27, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, align 8, addrspace 5)
+    BUFFER_STORE_DWORD_OFFEN %0.sub3, undef %5:vgpr_32, $sgpr24_sgpr25_sgpr26_sgpr27, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, align 8, addrspace 5)
     %6:vgpr_32 = DS_READ_B32_gfx9 undef %7:vgpr_32, 0, 0, implicit $exec
     %8:vreg_64 = DS_READ_B64_gfx9 %1, 0, 0, implicit $exec
     %9:vreg_128 = DS_READ_B128_gfx9 %2, 0, 0, implicit $exec

diff  --git a/llvm/test/CodeGen/AMDGPU/sched-assert-onlydbg-value-empty-region.mir b/llvm/test/CodeGen/AMDGPU/sched-assert-onlydbg-value-empty-region.mir
index e2c31bbd2db9..dffe920aacef 100644
--- a/llvm/test/CodeGen/AMDGPU/sched-assert-onlydbg-value-empty-region.mir
+++ b/llvm/test/CodeGen/AMDGPU/sched-assert-onlydbg-value-empty-region.mir
@@ -23,12 +23,12 @@ body:             |
   ; CHECK:   liveins: $vgpr0
   ; CHECK:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[DEF:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
-  ; CHECK:   [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[DEF]], 0, 0, 0, 0, implicit $exec
-  ; CHECK:   [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[DEF]], 8, 0, 0, 0, implicit $exec
+  ; CHECK:   [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[DEF]], 0, 0, 0, 0, 0, implicit $exec
+  ; CHECK:   [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[DEF]], 8, 0, 0, 0, 0, implicit $exec
   ; CHECK:   [[COPY1:%[0-9]+]]:vreg_64 = COPY [[GLOBAL_LOAD_DWORDX2_]]
   ; CHECK:   undef %6.sub0:vreg_64 = V_ADD_F32_e32 [[DEF]].sub0, [[COPY1]].sub0, implicit $mode, implicit $exec
   ; CHECK:   dead undef %6.sub1:vreg_64 = V_ADD_F32_e32 [[DEF]].sub1, [[COPY1]].sub0, implicit $mode, implicit $exec
-  ; CHECK:   [[GLOBAL_LOAD_DWORD1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY1]], 0, 0, 0, 0, implicit $exec
+  ; CHECK:   [[GLOBAL_LOAD_DWORD1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY1]], 0, 0, 0, 0, 0, implicit $exec
   ; CHECK:   undef %4.sub0:vreg_64 = V_MOV_B32_e32 111, implicit $exec
   ; CHECK:   [[DEF1:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
   ; CHECK:   [[DEF2:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
@@ -42,17 +42,17 @@ body:             |
   ; CHECK:   %19.sub1:vreg_64 = V_ADD_F32_e32 [[GLOBAL_LOAD_DWORD]], [[GLOBAL_LOAD_DWORD]], implicit $mode, implicit $exec
   ; CHECK:   [[DEF8:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
   ; CHECK:   %4.sub1:vreg_64 = V_ADD_U32_e32 [[COPY]], [[COPY]], implicit $exec
-  ; CHECK:   GLOBAL_STORE_DWORDX2 %19, %4, 32, 0, 0, 0, implicit $exec
+  ; CHECK:   GLOBAL_STORE_DWORDX2 %19, %4, 32, 0, 0, 0, 0, implicit $exec
   ; CHECK:   [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
   ; CHECK:   [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-  ; CHECK:   %11.sub0:vreg_64 = GLOBAL_LOAD_DWORD [[DEF2]], 0, 0, 0, 0, implicit $exec
-  ; CHECK:   [[DEF1]].sub0:vreg_64 = GLOBAL_LOAD_DWORD [[DEF3]], 0, 0, 0, 0, implicit $exec
-  ; CHECK:   dead %20:vgpr_32 = GLOBAL_LOAD_DWORD %11, 0, 0, 0, 0, implicit $exec
-  ; CHECK:   dead %21:vgpr_32 = GLOBAL_LOAD_DWORD [[DEF6]], 0, 0, 0, 0, implicit $exec
+  ; CHECK:   %11.sub0:vreg_64 = GLOBAL_LOAD_DWORD [[DEF2]], 0, 0, 0, 0, 0, implicit $exec
+  ; CHECK:   [[DEF1]].sub0:vreg_64 = GLOBAL_LOAD_DWORD [[DEF3]], 0, 0, 0, 0, 0, implicit $exec
+  ; CHECK:   dead %20:vgpr_32 = GLOBAL_LOAD_DWORD %11, 0, 0, 0, 0, 0, implicit $exec
+  ; CHECK:   dead %21:vgpr_32 = GLOBAL_LOAD_DWORD [[DEF6]], 0, 0, 0, 0, 0, implicit $exec
   ; CHECK:   [[V_LSHLREV_B64_e64_:%[0-9]+]]:vreg_64 = V_LSHLREV_B64_e64 2, [[DEF1]], implicit $exec
-  ; CHECK:   dead %22:vgpr_32 = GLOBAL_LOAD_DWORD [[DEF7]], 0, 0, 0, 0, implicit $exec
+  ; CHECK:   dead %22:vgpr_32 = GLOBAL_LOAD_DWORD [[DEF7]], 0, 0, 0, 0, 0, implicit $exec
   ; CHECK:   S_NOP 0, implicit [[DEF5]], implicit [[V_LSHLREV_B64_e64_]].sub0, implicit [[DEF4]], implicit [[V_MOV_B32_e32_]]
-  ; CHECK:   GLOBAL_STORE_DWORD [[DEF7]], [[V_MOV_B32_e32_1]], 0, 0, 0, 0, implicit $exec
+  ; CHECK:   GLOBAL_STORE_DWORD [[DEF7]], [[V_MOV_B32_e32_1]], 0, 0, 0, 0, 0, implicit $exec
   ; CHECK: bb.1:
   ; CHECK:   successors: %bb.2(0x80000000)
   ; CHECK:   S_SETREG_IMM32_B32 0, 1, implicit-def $mode, implicit $mode
@@ -69,14 +69,14 @@ body:             |
 
     %0:vgpr_32 = COPY $vgpr0
     %1:vreg_64 = IMPLICIT_DEF
-    %2:vreg_64 = GLOBAL_LOAD_DWORDX2 %1, 0, 0, 0, 0, implicit $exec
-    %3:vgpr_32 = GLOBAL_LOAD_DWORD %1, 8, 0, 0, 0, implicit $exec
+    %2:vreg_64 = GLOBAL_LOAD_DWORDX2 %1, 0, 0, 0, 0, 0, implicit $exec
+    %3:vgpr_32 = GLOBAL_LOAD_DWORD %1, 8, 0, 0, 0, 0, implicit $exec
     undef %4.sub1:vreg_64 = V_ADD_U32_e32 %0, %0, implicit $exec
     %4.sub0:vreg_64 = V_MOV_B32_e32 111, implicit $exec
     %5:vreg_64 = COPY %2
     undef %6.sub0:vreg_64 = V_ADD_F32_e32 %1.sub0, %5.sub0, implicit $mode, implicit $exec
     %6.sub1:vreg_64 = V_ADD_F32_e32 %1.sub1, %5.sub0, implicit $mode, implicit $exec
-    %7:vgpr_32 = GLOBAL_LOAD_DWORD %5, 0, 0, 0, 0, implicit $exec
+    %7:vgpr_32 = GLOBAL_LOAD_DWORD %5, 0, 0, 0, 0, 0, implicit $exec
     %8:vreg_64 = IMPLICIT_DEF
     %9:vreg_64 = IMPLICIT_DEF
     %10:vreg_64 = IMPLICIT_DEF
@@ -90,15 +90,15 @@ body:             |
     %18:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
     undef %19.sub0:vreg_64 = V_ADD_F32_e32 %7, %2.sub0, implicit $mode, implicit $exec
     %19.sub1:vreg_64 = V_ADD_F32_e32 %3, %3, implicit $mode, implicit $exec
-    GLOBAL_STORE_DWORDX2 %19, %4, 32, 0, 0, 0, implicit $exec
-    %11.sub0:vreg_64 = GLOBAL_LOAD_DWORD %9, 0, 0, 0, 0, implicit $exec
-    %8.sub0:vreg_64 = GLOBAL_LOAD_DWORD %10, 0, 0, 0, 0, implicit $exec
-    %20:vgpr_32 = GLOBAL_LOAD_DWORD %11, 0, 0, 0, 0, implicit $exec
-    %21:vgpr_32 = GLOBAL_LOAD_DWORD %14, 0, 0, 0, 0, implicit $exec
-    %22:vgpr_32 = GLOBAL_LOAD_DWORD %15, 0, 0, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORDX2 %19, %4, 32, 0, 0, 0, 0, implicit $exec
+    %11.sub0:vreg_64 = GLOBAL_LOAD_DWORD %9, 0, 0, 0, 0, 0, implicit $exec
+    %8.sub0:vreg_64 = GLOBAL_LOAD_DWORD %10, 0, 0, 0, 0, 0, implicit $exec
+    %20:vgpr_32 = GLOBAL_LOAD_DWORD %11, 0, 0, 0, 0, 0, implicit $exec
+    %21:vgpr_32 = GLOBAL_LOAD_DWORD %14, 0, 0, 0, 0, 0, implicit $exec
+    %22:vgpr_32 = GLOBAL_LOAD_DWORD %15, 0, 0, 0, 0, 0, implicit $exec
     %23:vreg_64 = V_LSHLREV_B64_e64 2, %8, implicit $exec
     S_NOP 0, implicit %13, implicit %23.sub0, implicit %12, implicit %17
-    GLOBAL_STORE_DWORD %15, %18, 0, 0, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD %15, %18, 0, 0, 0, 0, 0, implicit $exec
 
   bb.1:
     S_SETREG_IMM32_B32 0, 1, implicit-def $mode, implicit $mode

diff  --git a/llvm/test/CodeGen/AMDGPU/sched-crash-dbg-value.mir b/llvm/test/CodeGen/AMDGPU/sched-crash-dbg-value.mir
index dc2f8276fc00..06bbc9450fa7 100644
--- a/llvm/test/CodeGen/AMDGPU/sched-crash-dbg-value.mir
+++ b/llvm/test/CodeGen/AMDGPU/sched-crash-dbg-value.mir
@@ -217,7 +217,7 @@ body:             |
     %19:sreg_32_xm0_xexec = IMPLICIT_DEF
     %20:vgpr_32 = V_ADD_CO_U32_e32 %19, %0, implicit-def dead $vcc, implicit $exec
     %21:vreg_64, dead %22:sreg_64 = V_MAD_I64_I32_e64 %20, 12, %7, 0, implicit $exec
-    %23:vgpr_32 = GLOBAL_LOAD_DWORD %21, 4, 0, 0, 0, implicit $exec
+    %23:vgpr_32 = GLOBAL_LOAD_DWORD %21, 4, 0, 0, 0, 0, implicit $exec
     %24:vreg_64, dead %25:sreg_64 = V_MAD_I64_I32_e64 %20, 48, %8, 0, implicit $exec
     %26:vreg_128 = IMPLICIT_DEF
     undef %27.sub0:sreg_64_xexec = S_LOAD_DWORD_IMM %6, 0, 0, 0
@@ -232,14 +232,14 @@ body:             |
     %33:sgpr_32 = S_ADDC_U32 %5.sub1, %31.sub1, implicit-def dead $scc, implicit killed $scc
     %34:vgpr_32 = IMPLICIT_DEF
     %35:vreg_64, dead %36:sreg_64 = V_MAD_I64_I32_e64 %23, %34, 0, 0, implicit $exec
-    %37:vreg_64 = GLOBAL_LOAD_DWORDX2 %35, 32, 0, 0, 0, implicit $exec
+    %37:vreg_64 = GLOBAL_LOAD_DWORDX2 %35, 32, 0, 0, 0, 0, implicit $exec
     undef %38.sub1:vreg_64 = V_ASHRREV_I32_e32 31, %37.sub0, implicit $exec
     %38.sub0:vreg_64 = COPY %37.sub0
     %39:vreg_64 = V_LSHLREV_B64_e64 3, %38, implicit $exec
     undef %40.sub0:vreg_64, %41:sreg_64_xexec = V_ADD_CO_U32_e64 0, %39.sub0, 0, implicit $exec
     %42:vgpr_32 = COPY %33
     %40.sub1:vreg_64, dead %43:sreg_64_xexec = V_ADDC_U32_e64 %42, %39.sub1, %41, 0, implicit $exec
-    %44:vreg_64 = GLOBAL_LOAD_DWORDX2 %40, 0, 0, 0, 0, implicit $exec :: (load 8 from %ir.tmp34)
+    %44:vreg_64 = GLOBAL_LOAD_DWORDX2 %40, 0, 0, 0, 0, 0, implicit $exec :: (load 8 from %ir.tmp34)
     undef %45.sub1:vreg_64 = IMPLICIT_DEF
     %45.sub0:vreg_64 = COPY %37.sub1
     %46:vreg_64 = V_LSHLREV_B64_e64 3, %45, implicit $exec
@@ -247,7 +247,7 @@ body:             |
     %49:vgpr_32 = COPY %33
     %47.sub1:vreg_64, dead %50:sreg_64_xexec = V_ADDC_U32_e64 %49, %46.sub1, %48, 0, implicit $exec
     %51:vreg_64 = IMPLICIT_DEF
-    undef %52.sub0:vreg_64 = GLOBAL_LOAD_DWORD %35, 40, 0, 0, 0, implicit $exec :: (load 4 from %ir.18 + 8)
+    undef %52.sub0:vreg_64 = GLOBAL_LOAD_DWORD %35, 40, 0, 0, 0, 0, implicit $exec :: (load 4 from %ir.18 + 8)
     %52.sub1:vreg_64 = IMPLICIT_DEF
     %53:vreg_64 = V_LSHLREV_B64_e64 3, %52, implicit $exec
     undef %54.sub0:vreg_64, %55:sreg_64_xexec = V_ADD_CO_U32_e64 0, %53.sub0, 0, implicit $exec
@@ -258,31 +258,31 @@ body:             |
     %59:sreg_64 = IMPLICIT_DEF
     %60:sreg_32_xm0 = S_ADD_U32 %5.sub0, %59.sub0, implicit-def $scc
     %61:sgpr_32 = S_ADDC_U32 %5.sub1, %59.sub1, implicit-def dead $scc, implicit killed $scc
-    %62:vreg_64 = GLOBAL_LOAD_DWORDX2 %35, 0, 0, 0, 0, implicit $exec :: (load 8 from %ir.20, align 4)
+    %62:vreg_64 = GLOBAL_LOAD_DWORDX2 %35, 0, 0, 0, 0, 0, implicit $exec :: (load 8 from %ir.20, align 4)
     undef %63.sub1:vreg_64 = V_ASHRREV_I32_e32 31, %62.sub0, implicit $exec
     %63.sub0:vreg_64 = COPY %62.sub0
     %64:vreg_64 = IMPLICIT_DEF
     undef %65.sub0:vreg_64, %66:sreg_64_xexec = V_ADD_CO_U32_e64 %60, %64.sub0, 0, implicit $exec
     %67:vgpr_32 = COPY %61
     %65.sub1:vreg_64, dead %68:sreg_64_xexec = V_ADDC_U32_e64 %67, %64.sub1, %66, 0, implicit $exec
-    %69:vreg_128 = GLOBAL_LOAD_DWORDX4 %65, 0, 0, 0, 0, implicit $exec :: (load 16 from %ir.tmp58)
+    %69:vreg_128 = GLOBAL_LOAD_DWORDX4 %65, 0, 0, 0, 0, 0, implicit $exec :: (load 16 from %ir.tmp58)
     undef %70.sub1:vreg_64 = IMPLICIT_DEF
     %70.sub0:vreg_64 = IMPLICIT_DEF
     %71:vreg_64 = IMPLICIT_DEF
     undef %72.sub0:vreg_64, %73:sreg_64_xexec = V_ADD_CO_U32_e64 %60, %71.sub0, 0, implicit $exec
     %74:vgpr_32 = COPY %61
     %72.sub1:vreg_64, dead %75:sreg_64_xexec = V_ADDC_U32_e64 0, %71.sub1, %73, 0, implicit $exec
-    %76:vreg_128 = GLOBAL_LOAD_DWORDX4 %72, 0, 0, 0, 0, implicit $exec
+    %76:vreg_128 = GLOBAL_LOAD_DWORDX4 %72, 0, 0, 0, 0, 0, implicit $exec
     %77:vgpr_32 = IMPLICIT_DEF
     %78:vgpr_32 = IMPLICIT_DEF
     %79:vgpr_32 = nofpexcept V_MUL_F32_e32 0, %77, implicit $mode, implicit $exec
     %80:vgpr_32 = IMPLICIT_DEF
     %81:vgpr_32 = IMPLICIT_DEF
     %84:vgpr_32 = IMPLICIT_DEF
-    BUFFER_STORE_DWORD_OFFEN %84, %stack.0.tmp5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr101, 108, 0, 0, 0, 0, 0, implicit $exec
-    BUFFER_STORE_DWORD_OFFEN %81, %stack.0.tmp5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr101, 104, 0, 0, 0, 0, 0, implicit $exec
-    BUFFER_STORE_DWORD_OFFEN %80, %stack.0.tmp5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr101, 100, 0, 0, 0, 0, 0, implicit $exec
-    BUFFER_STORE_DWORD_OFFEN %78, %stack.0.tmp5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr101, 96, 0, 0, 0, 0, 0, implicit $exec
+    BUFFER_STORE_DWORD_OFFEN %84, %stack.0.tmp5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr101, 108, 0, 0, 0, 0, 0, 0, implicit $exec
+    BUFFER_STORE_DWORD_OFFEN %81, %stack.0.tmp5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr101, 104, 0, 0, 0, 0, 0, 0, implicit $exec
+    BUFFER_STORE_DWORD_OFFEN %80, %stack.0.tmp5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr101, 100, 0, 0, 0, 0, 0, 0, implicit $exec
+    BUFFER_STORE_DWORD_OFFEN %78, %stack.0.tmp5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr101, 96, 0, 0, 0, 0, 0, 0, implicit $exec
     %85:vgpr_32 = IMPLICIT_DEF
     %86:vgpr_32 = IMPLICIT_DEF
     %87:vgpr_32 = IMPLICIT_DEF

diff  --git a/llvm/test/CodeGen/AMDGPU/sched-handleMoveUp-subreg-def-across-subreg-def.mir b/llvm/test/CodeGen/AMDGPU/sched-handleMoveUp-subreg-def-across-subreg-def.mir
index 5e24f774f700..e5b48d903aa5 100644
--- a/llvm/test/CodeGen/AMDGPU/sched-handleMoveUp-subreg-def-across-subreg-def.mir
+++ b/llvm/test/CodeGen/AMDGPU/sched-handleMoveUp-subreg-def-across-subreg-def.mir
@@ -25,7 +25,7 @@ body:             |
   ; CHECK:   successors: %bb.1(0x80000000)
   ; CHECK:   liveins: $sgpr4_sgpr5
   ; CHECK:   [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
-  ; CHECK:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN undef %2:vgpr_32, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr101, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5)
+  ; CHECK:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN undef %2:vgpr_32, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr101, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5)
   ; CHECK:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sgpr_64 = S_LOAD_DWORDX2_IMM [[COPY]](p4), 0, 0, 0 :: (dereferenceable invariant load 8, align 16, addrspace 4)
   ; CHECK:   [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 5329
   ; CHECK:   undef %5.sub0:vreg_64 = V_MOV_B32_e32 0, implicit $exec
@@ -37,7 +37,7 @@ body:             |
   ; CHECK: bb.1:
   ; CHECK:   successors: %bb.1(0x80000000)
   ; CHECK:   INLINEASM &"", 1 /* sideeffect attdialect */, 851978 /* regdef:VGPR_LO16 */, def dead %11
-  ; CHECK:   GLOBAL_STORE_DWORD undef %12:vreg_64, [[BUFFER_LOAD_DWORD_OFFEN]], 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
+  ; CHECK:   GLOBAL_STORE_DWORD undef %12:vreg_64, [[BUFFER_LOAD_DWORD_OFFEN]], 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
   ; CHECK:   [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
   ; CHECK:   [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
   ; CHECK:   [[DS_READ_B64_gfx9_:%[0-9]+]]:vreg_64 = DS_READ_B64_gfx9 undef %14:vgpr_32, 0, 0, implicit $exec :: (load 8, addrspace 3)
@@ -52,7 +52,7 @@ body:             |
   ; CHECK:   DS_WRITE_B32_gfx9 undef %28:vgpr_32, %21, 0, 0, implicit $exec :: (store 4, addrspace 3)
   ; CHECK:   DS_WRITE_B32_gfx9 undef %29:vgpr_32, %22, 0, 0, implicit $exec :: (store 4, addrspace 3)
   ; CHECK:   DS_WRITE_B64_gfx9 undef %30:vgpr_32, %5, 0, 0, implicit $exec :: (store 8, addrspace 3)
-  ; CHECK:   undef %31.sub1:vreg_64 = FLAT_LOAD_DWORD undef %32:vreg_64, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4)
+  ; CHECK:   undef %31.sub1:vreg_64 = FLAT_LOAD_DWORD undef %32:vreg_64, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4)
   ; CHECK:   [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 1, [[DEF2]], implicit $exec
   ; CHECK:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub1
   ; CHECK:   [[DEF]].sub1:vreg_64 = COPY [[V_MOV_B32_e32_]]
@@ -71,7 +71,7 @@ body:             |
   ; CHECK:   [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %42, 0, 0, 0 :: (load 4, addrspace 1)
   ; CHECK:   INLINEASM &"", 1 /* sideeffect attdialect */
   ; CHECK:   [[DS_READ_B32_gfx9_4:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 undef %45:vgpr_32, 0, 0, implicit $exec :: (load 4, addrspace 3)
-  ; CHECK:   GLOBAL_STORE_DWORD undef %46:vreg_64, [[DS_READ_B32_gfx9_4]], 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
+  ; CHECK:   GLOBAL_STORE_DWORD undef %46:vreg_64, [[DS_READ_B32_gfx9_4]], 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
   ; CHECK:   %31.sub0:vreg_64 = COPY [[S_LOAD_DWORD_IMM]], implicit $exec
   ; CHECK:   DS_WRITE_B64_gfx9 undef %47:vgpr_32, %31, 0, 0, implicit $exec :: (store 8, addrspace 3)
   ; CHECK:   S_BRANCH %bb.1
@@ -79,7 +79,7 @@ body:             |
     liveins: $sgpr4_sgpr5
 
     %0:sgpr_64(p4) = COPY $sgpr4_sgpr5
-    %1:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN undef %2:vgpr_32, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr101, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5)
+    %1:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN undef %2:vgpr_32, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr101, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5)
     %3:sgpr_64 = S_LOAD_DWORDX2_IMM %0(p4), 0, 0, 0 :: (dereferenceable invariant load 8, align 16, addrspace 4)
     %4:sreg_32_xm0 = S_MOV_B32 5329
     undef %5.sub0:vreg_64 = V_MOV_B32_e32 0, implicit $exec
@@ -91,7 +91,7 @@ body:             |
 
   bb.1:
     INLINEASM &"", 1, 851978, def %11:vgpr_32
-    GLOBAL_STORE_DWORD undef %12:vreg_64, %1, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
+    GLOBAL_STORE_DWORD undef %12:vreg_64, %1, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
     %13:vreg_64 = DS_READ_B64_gfx9 undef %14:vgpr_32, 0, 0, implicit $exec :: (load 8, addrspace 3)
     INLINEASM &"def $0 $1", 1, 851978, def %15:vgpr_32, 851978, def %16:vgpr_32
     %17:vgpr_32 = DS_READ_B32_gfx9 %6, 0, 0, implicit $exec
@@ -108,7 +108,7 @@ body:             |
     DS_WRITE_B32_gfx9 undef %28:vgpr_32, %21, 0, 0, implicit $exec :: (store 4, addrspace 3)
     DS_WRITE_B32_gfx9 undef %29:vgpr_32, %22, 0, 0, implicit $exec :: (store 4, addrspace 3)
     DS_WRITE_B64_gfx9 undef %30:vgpr_32, %5, 0, 0, implicit $exec :: (store 8, addrspace 3)
-    undef %31.sub1:vreg_64 = FLAT_LOAD_DWORD undef %32:vreg_64, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4)
+    undef %31.sub1:vreg_64 = FLAT_LOAD_DWORD undef %32:vreg_64, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4)
     %33:vgpr_32 = V_MUL_LO_U32_e64 %25, %4, implicit $exec
     %10:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, %25, %26, implicit $exec
     %34:vgpr_32 = V_SUB_U32_e32 %33, %9, implicit $exec
@@ -125,7 +125,7 @@ body:             |
     %43:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %42, 0, 0, 0 :: (load 4, addrspace 1)
     INLINEASM &"", 1
     %44:vgpr_32 = DS_READ_B32_gfx9 undef %45:vgpr_32, 0, 0, implicit $exec :: (load 4, addrspace 3)
-    GLOBAL_STORE_DWORD undef %46:vreg_64, %44, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
+    GLOBAL_STORE_DWORD undef %46:vreg_64, %44, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
     %31.sub0:vreg_64 = COPY %43, implicit $exec
     DS_WRITE_B64_gfx9 undef %47:vgpr_32, %31, 0, 0, implicit $exec :: (store 8, addrspace 3)
     S_BRANCH %bb.1

diff  --git a/llvm/test/CodeGen/AMDGPU/schedule-barrier-fpmode.mir b/llvm/test/CodeGen/AMDGPU/schedule-barrier-fpmode.mir
index 117e6f588162..e404ce3ceaad 100644
--- a/llvm/test/CodeGen/AMDGPU/schedule-barrier-fpmode.mir
+++ b/llvm/test/CodeGen/AMDGPU/schedule-barrier-fpmode.mir
@@ -12,15 +12,15 @@ body: |
     ; CHECK-LABEL: name: denorm_mode_not_barrier
     ; CHECK: liveins: $vgpr0_vgpr1
     ; CHECK: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; CHECK: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 0, 0, 0, 0, implicit $exec :: (load 4)
-    ; CHECK: [[GLOBAL_LOAD_DWORD1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 4, 0, 0, 0, implicit $exec :: (load 4)
+    ; CHECK: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 0, 0, 0, 0, 0, implicit $exec :: (load 4)
+    ; CHECK: [[GLOBAL_LOAD_DWORD1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 4, 0, 0, 0, 0, implicit $exec :: (load 4)
     ; CHECK: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[GLOBAL_LOAD_DWORD]], [[GLOBAL_LOAD_DWORD1]], implicit $exec
     ; CHECK: S_DENORM_MODE 0, implicit-def $mode, implicit $mode
     ; CHECK: S_ENDPGM 0, implicit [[V_ADD_U32_e32_]]
     %0:vreg_64 = COPY $vgpr0_vgpr1
-    %1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 0, 0, 0, implicit $exec :: (load 4)
+    %1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 0, 0, 0, 0, implicit $exec :: (load 4)
     S_DENORM_MODE 0, implicit-def $mode, implicit $mode
-    %2:vgpr_32 = GLOBAL_LOAD_DWORD %0, 4, 0, 0, 0, implicit $exec :: (load 4)
+    %2:vgpr_32 = GLOBAL_LOAD_DWORD %0, 4, 0, 0, 0, 0, implicit $exec :: (load 4)
     %3:vgpr_32 = V_ADD_U32_e32 %1, %2, implicit $exec
     S_ENDPGM 0, implicit %3
 ...
@@ -35,15 +35,15 @@ body: |
     ; CHECK-LABEL: name: round_mode_not_barrier
     ; CHECK: liveins: $vgpr0_vgpr1
     ; CHECK: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; CHECK: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 0, 0, 0, 0, implicit $exec :: (load 4)
-    ; CHECK: [[GLOBAL_LOAD_DWORD1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 4, 0, 0, 0, implicit $exec :: (load 4)
+    ; CHECK: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 0, 0, 0, 0, 0, implicit $exec :: (load 4)
+    ; CHECK: [[GLOBAL_LOAD_DWORD1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 4, 0, 0, 0, 0, implicit $exec :: (load 4)
     ; CHECK: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[GLOBAL_LOAD_DWORD]], [[GLOBAL_LOAD_DWORD1]], implicit $exec
     ; CHECK: S_ROUND_MODE 0, implicit-def $mode, implicit $mode
     ; CHECK: S_ENDPGM 0, implicit [[V_ADD_U32_e32_]]
     %0:vreg_64 = COPY $vgpr0_vgpr1
-    %1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 0, 0, 0, implicit $exec :: (load 4)
+    %1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 0, 0, 0, 0, implicit $exec :: (load 4)
     S_ROUND_MODE 0, implicit-def $mode, implicit $mode
-    %2:vgpr_32 = GLOBAL_LOAD_DWORD %0, 4, 0, 0, 0, implicit $exec :: (load 4)
+    %2:vgpr_32 = GLOBAL_LOAD_DWORD %0, 4, 0, 0, 0, 0, implicit $exec :: (load 4)
     %3:vgpr_32 = V_ADD_U32_e32 %1, %2, implicit $exec
     S_ENDPGM 0, implicit %3
 ...
@@ -58,17 +58,17 @@ body: |
     ; CHECK-LABEL: name: denorm_mode_mode_def_use
     ; CHECK: liveins: $vgpr0_vgpr1
     ; CHECK: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; CHECK: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 0, 0, 0, 0, implicit $exec :: (load 4)
-    ; CHECK: dead %3:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 4, 0, 0, 0, implicit $exec :: (load 4)
+    ; CHECK: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 0, 0, 0, 0, 0, implicit $exec :: (load 4)
+    ; CHECK: dead %3:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 4, 0, 0, 0, 0, implicit $exec :: (load 4)
     ; CHECK: S_DENORM_MODE 0, implicit-def $mode, implicit $mode
     ; CHECK: [[V_ADD_F32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_F32_e32 0, [[GLOBAL_LOAD_DWORD]], implicit $mode, implicit $exec
     ; CHECK: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[GLOBAL_LOAD_DWORD]], [[V_ADD_F32_e32_]], implicit $exec
     ; CHECK: S_ENDPGM 0, implicit [[V_ADD_F32_e32_]], implicit [[V_ADD_U32_e32_]]
     %0:vreg_64 = COPY $vgpr0_vgpr1
-    %1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 0, 0, 0, implicit $exec :: (load 4)
+    %1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 0, 0, 0, 0, implicit $exec :: (load 4)
     S_DENORM_MODE 0, implicit-def $mode, implicit $mode
     %2:vgpr_32 = V_ADD_F32_e32 0, %1, implicit $mode, implicit $exec
-    %3:vgpr_32 = GLOBAL_LOAD_DWORD %0, 4, 0, 0, 0, implicit $exec :: (load 4)
+    %3:vgpr_32 = GLOBAL_LOAD_DWORD %0, 4, 0, 0, 0, 0, implicit $exec :: (load 4)
     %4:vgpr_32 = V_ADD_U32_e32 %1, %2, implicit $exec
     S_ENDPGM 0, implicit %2, implicit %4
 ...
@@ -83,17 +83,17 @@ body: |
     ; CHECK-LABEL: name: round_mode_mode_def_use
     ; CHECK: liveins: $vgpr0_vgpr1
     ; CHECK: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; CHECK: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 0, 0, 0, 0, implicit $exec :: (load 4)
-    ; CHECK: dead %3:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 4, 0, 0, 0, implicit $exec :: (load 4)
+    ; CHECK: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 0, 0, 0, 0, 0, implicit $exec :: (load 4)
+    ; CHECK: dead %3:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 4, 0, 0, 0, 0, implicit $exec :: (load 4)
     ; CHECK: S_ROUND_MODE 0, implicit-def $mode, implicit $mode
     ; CHECK: [[V_ADD_F32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_F32_e32 0, [[GLOBAL_LOAD_DWORD]], implicit $mode, implicit $exec
     ; CHECK: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[GLOBAL_LOAD_DWORD]], [[V_ADD_F32_e32_]], implicit $exec
     ; CHECK: S_ENDPGM 0, implicit [[V_ADD_F32_e32_]], implicit [[V_ADD_U32_e32_]]
     %0:vreg_64 = COPY $vgpr0_vgpr1
-    %1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 0, 0, 0, implicit $exec :: (load 4)
+    %1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 0, 0, 0, 0, implicit $exec :: (load 4)
     S_ROUND_MODE 0, implicit-def $mode, implicit $mode
     %2:vgpr_32 = V_ADD_F32_e32 0, %1, implicit $mode, implicit $exec
-    %3:vgpr_32 = GLOBAL_LOAD_DWORD %0, 4, 0, 0, 0, implicit $exec :: (load 4)
+    %3:vgpr_32 = GLOBAL_LOAD_DWORD %0, 4, 0, 0, 0, 0, implicit $exec :: (load 4)
     %4:vgpr_32 = V_ADD_U32_e32 %1, %2, implicit $exec
     S_ENDPGM 0, implicit %2, implicit %4
 ...

diff  --git a/llvm/test/CodeGen/AMDGPU/schedule-barrier.mir b/llvm/test/CodeGen/AMDGPU/schedule-barrier.mir
index 11a4b7a5dbdd..0211f4294f75 100644
--- a/llvm/test/CodeGen/AMDGPU/schedule-barrier.mir
+++ b/llvm/test/CodeGen/AMDGPU/schedule-barrier.mir
@@ -1,18 +1,43 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
 # RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=machine-scheduler -o - %s | FileCheck %s
 
 ---
 # Check that the high latency loads are both scheduled first, before the
 # multiplies, despite the presence of a barrier in the function.
-# CHECK: BUFFER_LOAD_DWORD_OFFSET
-# CHECK: BUFFER_LOAD_DWORD_OFFSET
-# CHECK: V_MUL_LO_U32_e64
-# CHECK: V_MUL_LO_U32_e64
 name: test
 tracksRegLiveness: true
 body: |
   bb.0:
     liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9
 
+    ; CHECK-LABEL: name: test
+    ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9
+    ; CHECK: undef %0.sub3:vreg_128 = COPY $vgpr9
+    ; CHECK: undef %1.sub2:vreg_128 = COPY $vgpr8
+    ; CHECK: undef %2.sub1:vreg_128 = COPY $vgpr7
+    ; CHECK: undef %8.sub1:vreg_64 = COPY $vgpr1
+    ; CHECK: %8.sub0:vreg_64 = COPY $vgpr0
+    ; CHECK: undef %3.sub0:vreg_128 = COPY $vgpr6
+    ; CHECK: undef %4.sub3:vreg_128 = COPY $vgpr5
+    ; CHECK: undef %5.sub2:vreg_128 = COPY $vgpr4
+    ; CHECK: undef %6.sub1:vreg_128 = COPY $vgpr3
+    ; CHECK: undef %7.sub0:vreg_128 = COPY $vgpr2
+    ; CHECK: undef %9.sub0:sgpr_128 = V_READFIRSTLANE_B32 %7.sub0, implicit $exec
+    ; CHECK: %9.sub1:sgpr_128 = V_READFIRSTLANE_B32 %6.sub1, implicit $exec
+    ; CHECK: %9.sub2:sgpr_128 = V_READFIRSTLANE_B32 %5.sub2, implicit $exec
+    ; CHECK: %9.sub3:sgpr_128 = V_READFIRSTLANE_B32 %4.sub3, implicit $exec
+    ; CHECK: S_BARRIER
+    ; CHECK: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %9, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; CHECK: undef %12.sub0:sgpr_128 = V_READFIRSTLANE_B32 %3.sub0, implicit $exec
+    ; CHECK: %12.sub1:sgpr_128 = V_READFIRSTLANE_B32 %2.sub1, implicit $exec
+    ; CHECK: %12.sub2:sgpr_128 = V_READFIRSTLANE_B32 %1.sub2, implicit $exec
+    ; CHECK: %12.sub3:sgpr_128 = V_READFIRSTLANE_B32 %0.sub3, implicit $exec
+    ; CHECK: [[BUFFER_LOAD_DWORD_OFFSET1:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %12, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; CHECK: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 [[BUFFER_LOAD_DWORD_OFFSET]], [[BUFFER_LOAD_DWORD_OFFSET]], implicit $exec
+    ; CHECK: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 [[BUFFER_LOAD_DWORD_OFFSET1]], [[BUFFER_LOAD_DWORD_OFFSET1]], implicit $exec
+    ; CHECK: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[V_MUL_LO_U32_e64_]], [[V_MUL_LO_U32_e64_1]], implicit $exec
+    ; CHECK: GLOBAL_STORE_DWORD %8, [[V_ADD_U32_e32_]], 0, 0, 0, 0, 0, implicit $exec
+    ; CHECK: S_ENDPGM 0
     undef %43.sub3:vreg_128 = COPY $vgpr9
     undef %42.sub2:vreg_128 = COPY $vgpr8
     undef %41.sub1:vreg_128 = COPY $vgpr7
@@ -30,17 +55,17 @@ body: |
     %33.sub1:sgpr_128 = V_READFIRSTLANE_B32 %44.sub1, implicit $exec
     %33.sub2:sgpr_128 = V_READFIRSTLANE_B32 %45.sub2, implicit $exec
     %33.sub3:sgpr_128 = V_READFIRSTLANE_B32 %46.sub3, implicit $exec
-    %15:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %33, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    %15:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %33, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     %39:vgpr_32 = V_MUL_LO_U32_e64 %15, %15, implicit $exec
 
     undef %27.sub0:sgpr_128 = V_READFIRSTLANE_B32 %26.sub0, implicit $exec
     %27.sub1:sgpr_128 = V_READFIRSTLANE_B32 %41.sub1, implicit $exec
     %27.sub2:sgpr_128 = V_READFIRSTLANE_B32 %42.sub2, implicit $exec
     %27.sub3:sgpr_128 = V_READFIRSTLANE_B32 %43.sub3, implicit $exec
-    %19:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %27, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    %19:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %27, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     %40:vgpr_32 = V_MUL_LO_U32_e64 %19, %19, implicit $exec
 
     %23:vgpr_32 = V_ADD_U32_e32 %39, %40, implicit $exec
-    GLOBAL_STORE_DWORD %38, %23, 0, 0, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD %38, %23, 0, 0, 0, 0, 0, implicit $exec
     S_ENDPGM 0
 ...

diff  --git a/llvm/test/CodeGen/AMDGPU/sdwa-gfx9.mir b/llvm/test/CodeGen/AMDGPU/sdwa-gfx9.mir
index 2a202294e199..e2f3677e51a8 100644
--- a/llvm/test/CodeGen/AMDGPU/sdwa-gfx9.mir
+++ b/llvm/test/CodeGen/AMDGPU/sdwa-gfx9.mir
@@ -37,11 +37,11 @@ body:             |
     %2 = COPY $sgpr30_sgpr31
     %1 = COPY $vgpr2_vgpr3
     %0 = COPY $vgpr0_vgpr1
-    %3 = FLAT_LOAD_DWORD %1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4)
+    %3 = FLAT_LOAD_DWORD %1, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4)
     %12 = S_MOV_B32 123
     %10 = V_LSHRREV_B32_e64 16, %3, implicit $exec
     %11 = V_ADD_CO_U32_e32 %12, killed %10, implicit-def $vcc, implicit $exec
-    FLAT_STORE_DWORD %0, %11, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4)
+    FLAT_STORE_DWORD %0, %11, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4)
     $sgpr30_sgpr31 = COPY %2
     S_SETPC_B64_return $sgpr30_sgpr31
 
@@ -80,9 +80,9 @@ body:             |
     %2 = COPY $sgpr30_sgpr31
     %1 = COPY $vgpr2_vgpr3
     %0 = COPY $vgpr0_vgpr1
-    %3 = FLAT_LOAD_DWORD %1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4)
+    %3 = FLAT_LOAD_DWORD %1, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4)
     %10 = V_LSHRREV_B32_e64 16, %3, implicit $exec
     %11 = V_TRUNC_F32_e64 0, killed %10, 1, 2, implicit $mode, implicit $exec, implicit-def $vcc
-    FLAT_STORE_DWORD %0, %11, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4)
+    FLAT_STORE_DWORD %0, %11, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4)
     $sgpr30_sgpr31 = COPY %2
     S_SETPC_B64_return $sgpr30_sgpr31

diff  --git a/llvm/test/CodeGen/AMDGPU/sdwa-ops.mir b/llvm/test/CodeGen/AMDGPU/sdwa-ops.mir
index 17d430cb3525..da9486a2ae04 100644
--- a/llvm/test/CodeGen/AMDGPU/sdwa-ops.mir
+++ b/llvm/test/CodeGen/AMDGPU/sdwa-ops.mir
@@ -29,19 +29,19 @@ body:             |
     %63:vgpr_32, %65:sreg_64_xexec = nsw V_ADD_CO_U32_e64 %30.sub0, %23, 0, implicit $exec
     %64:vgpr_32, dead %66:sreg_64_xexec = nuw V_ADDC_U32_e64 %30.sub1, %0, killed %65, 0, implicit $exec
     %62:vreg_64 = REG_SEQUENCE %63, %subreg.sub0, %64, %subreg.sub1
-    GLOBAL_STORE_DWORDX2_SADDR %30.sub0, %62, %1, 0, 0, 0, 0, implicit $exec, implicit $exec :: (store 8)
+    GLOBAL_STORE_DWORDX2_SADDR %30.sub0, %62, %1, 0, 0, 0, 0, 0, implicit $exec, implicit $exec :: (store 8)
 
     %161:vgpr_32 = V_AND_B32_e32 %22, %0, implicit $exec
     %163:vgpr_32, %165:sreg_64_xexec = V_ADD_CO_U32_e64 %30.sub0, %161, 0, implicit $exec
     %164:vgpr_32, dead %166:sreg_64_xexec = V_ADDC_U32_e64 %30.sub1, %0, killed %165, 0, implicit $exec
     %162:vreg_64 = REG_SEQUENCE %163, %subreg.sub0, %164, %subreg.sub1
-    GLOBAL_STORE_DWORDX2_SADDR %30.sub0, %162, %1, 0, 0, 0, 0, implicit $exec, implicit $exec :: (store 8)
+    GLOBAL_STORE_DWORDX2_SADDR %30.sub0, %162, %1, 0, 0, 0, 0, 0, implicit $exec, implicit $exec :: (store 8)
 
     %171:vgpr_32 = V_AND_B32_e32 %22, %0, implicit $exec
     %173:vgpr_32, %175:sreg_64_xexec = V_ADD_CO_U32_e64 %30.sub0, %171, 0, implicit $exec
     %174:vgpr_32, dead %176:sreg_64_xexec = V_ADDC_U32_e64 %30.sub1, %0, killed %175, 0, implicit $exec
     %172:vreg_64 = REG_SEQUENCE %173, %subreg.sub0, %174, %subreg.sub1
-    GLOBAL_STORE_DWORDX2_SADDR %30.sub0, %172, %1, 0, 0, 0, 0, implicit $exec, implicit $exec :: (store 8)
+    GLOBAL_STORE_DWORDX2_SADDR %30.sub0, %172, %1, 0, 0, 0, 0, 0, implicit $exec, implicit $exec :: (store 8)
 
 ...
 
@@ -77,13 +77,13 @@ body:             |
 
     %64:vgpr_32, dead %66:sreg_64_xexec = V_ADDC_U32_e64 %30.sub1, %0, killed %65, 0, implicit $exec
     %62:vreg_64 = REG_SEQUENCE %63, %subreg.sub0, %64, %subreg.sub1
-    GLOBAL_STORE_DWORDX2_SADDR %30.sub0, %62, %1, 0, 0, 0, 0, implicit $exec, implicit $exec :: (store 8)
+    GLOBAL_STORE_DWORDX2_SADDR %30.sub0, %62, %1, 0, 0, 0, 0, 0, implicit $exec, implicit $exec :: (store 8)
 
     %161:vgpr_32 = V_AND_B32_e32 %22, %0, implicit $exec
     %163:vgpr_32, %165:sreg_64_xexec = V_ADD_CO_U32_e64 %30.sub0, %161, 0, implicit $exec
     %164:vgpr_32, dead %166:sreg_64_xexec = V_ADDC_U32_e64 %30.sub1, %0, killed %165, 0, implicit $exec
     %162:vreg_64 = REG_SEQUENCE %163, %subreg.sub0, %164, %subreg.sub1
-    GLOBAL_STORE_DWORDX2_SADDR %30.sub0, %162, %1, 0, 0, 0, 0, implicit $exec, implicit $exec :: (store 8)
+    GLOBAL_STORE_DWORDX2_SADDR %30.sub0, %162, %1, 0, 0, 0, 0, 0, implicit $exec, implicit $exec :: (store 8)
 
 ...
 
@@ -113,7 +113,7 @@ body:             |
     %63:vgpr_32, %65:sreg_64_xexec = V_ADD_CO_U32_e64 %30.sub0, %23, 0, implicit $exec
     %64:vgpr_32, %66:sreg_64_xexec = V_ADDC_U32_e64 %30.sub1, %0, killed %65, 0, implicit $exec
     %62:vreg_64 = REG_SEQUENCE %63, %subreg.sub0, %66, %subreg.sub1
-    GLOBAL_STORE_DWORDX2_SADDR %30.sub0, %62, %1, 0, 0, 0, 0, implicit $exec, implicit $exec :: (store 8)
+    GLOBAL_STORE_DWORDX2_SADDR %30.sub0, %62, %1, 0, 0, 0, 0, 0, implicit $exec, implicit $exec :: (store 8)
 
 ...
 
@@ -143,7 +143,7 @@ body:             |
     %63:vgpr_32, %65:sreg_64_xexec = V_ADD_CO_U32_e64 %30.sub0, %23, 0, implicit $exec
     %64:vgpr_32, %66:sreg_64_xexec = V_ADDC_U32_e64 %30.sub1, %0, %65, 0, implicit $exec
     %62:vreg_64 = REG_SEQUENCE %63, %subreg.sub0, %65, %subreg.sub1
-    GLOBAL_STORE_DWORDX2_SADDR %30.sub0, %62, %1, 0, 0, 0, 0, implicit $exec, implicit $exec :: (store 8)
+    GLOBAL_STORE_DWORDX2_SADDR %30.sub0, %62, %1, 0, 0, 0, 0, 0, implicit $exec, implicit $exec :: (store 8)
 
 
 ...
@@ -172,7 +172,7 @@ body:             |
     %63:vgpr_32, %65:sreg_64_xexec = V_ADD_CO_U32_e64 %30.sub0, %23, 0, implicit $exec
     %64:vgpr_32, %66:sreg_64_xexec = V_ADDC_U32_e64 %30.sub1, %0, %65, 0, implicit $exec
     %62:vreg_64 = REG_SEQUENCE %63, %subreg.sub0, %64, %subreg.sub1
-    GLOBAL_STORE_DWORDX2_SADDR %30.sub0, %62, %1, 0, 0, 0, 0, implicit $exec, implicit $exec :: (store 8)
+    GLOBAL_STORE_DWORDX2_SADDR %30.sub0, %62, %1, 0, 0, 0, 0, 0, implicit $exec, implicit $exec :: (store 8)
 
 
 ...
@@ -201,7 +201,7 @@ body:             |
     %30:vreg_64 = COPY $sgpr0_sgpr1
     %63:vgpr_32, %65:sreg_64_xexec = V_ADD_CO_U32_e64 %30.sub0, %23, 0, implicit $exec
     %62:vreg_64 = REG_SEQUENCE %63, %subreg.sub0, %23, %subreg.sub1
-    GLOBAL_STORE_DWORDX2_SADDR %30.sub0, %62, %1, 0, 0, 0, 0, implicit $exec, implicit $exec :: (store 8)
+    GLOBAL_STORE_DWORDX2_SADDR %30.sub0, %62, %1, 0, 0, 0, 0, 0, implicit $exec, implicit $exec :: (store 8)
 
 
 ...
@@ -232,7 +232,7 @@ body:             |
     %30:vreg_64 = COPY $sgpr0_sgpr1
     %64:vgpr_32, %66:sreg_64_xexec = V_ADDC_U32_e64 %30.sub1, %0, %24, 0, implicit $exec
     %62:vreg_64 = REG_SEQUENCE %23, %subreg.sub0, %23, %subreg.sub1
-    GLOBAL_STORE_DWORDX2_SADDR %30.sub0, %62, %1, 0, 0, 0, 0, implicit $exec, implicit $exec :: (store 8)
+    GLOBAL_STORE_DWORDX2_SADDR %30.sub0, %62, %1, 0, 0, 0, 0, 0, implicit $exec, implicit $exec :: (store 8)
 
 
 ...
@@ -263,7 +263,7 @@ body:             |
     %64:vgpr_32, %66:sreg_64_xexec = V_ADDC_U32_e64 %30.sub1, %0, %65, 0, implicit $exec
     %31:vreg_64 = COPY $vcc
     %62:vreg_64 = REG_SEQUENCE %63, %subreg.sub0, %64, %subreg.sub1
-    GLOBAL_STORE_DWORDX2_SADDR %31.sub0, %62, %1, 0, 0, 0, 0, implicit $exec, implicit $exec :: (store 8)
+    GLOBAL_STORE_DWORDX2_SADDR %31.sub0, %62, %1, 0, 0, 0, 0, 0, implicit $exec, implicit $exec :: (store 8)
 
 
 ...
@@ -294,7 +294,8 @@ body:             |
     %64:vgpr_32, %66:sreg_64_xexec = V_ADDC_U32_e64 %30.sub1, %0, %65, 0, implicit $exec
     %31:vreg_64 = COPY $vcc
     %62:vreg_64 = REG_SEQUENCE %63, %subreg.sub0, %64, %subreg.sub1
-    GLOBAL_STORE_DWORDX2_SADDR %31.sub0, %62, %1, 0, 0, 0, 0, implicit $exec, implicit $exec :: (store 8)
+    GLOBAL_STORE_DWORDX2_SADDR %31.sub0, %62, %1, 0, 0, 0, 0, 0, implicit $exec, implicit $exec :: (store 8)
+
 
 ...
 
@@ -325,7 +326,7 @@ body:             |
     %32:vreg_64 = REG_SEQUENCE %31, %subreg.sub0, %23, %subreg.sub1
     %64:vgpr_32, %66:sreg_64_xexec = V_ADDC_U32_e64 %30.sub1, %0, %65, 0, implicit $exec
     %62:vreg_64 = REG_SEQUENCE %63, %subreg.sub0, %64, %subreg.sub1
-    GLOBAL_STORE_DWORDX2_SADDR %32.sub0, %62, %1, 0, 0, 0, 0, implicit $exec, implicit $exec :: (store 8)
+    GLOBAL_STORE_DWORDX2_SADDR %32.sub0, %62, %1, 0, 0, 0, 0, 0, implicit $exec, implicit $exec :: (store 8)
 
 ...
 
@@ -356,7 +357,7 @@ body:             |
     %32:vreg_64 = REG_SEQUENCE %31, %subreg.sub0, %23, %subreg.sub1
     %64:vgpr_32, %66:sreg_64_xexec = V_ADDC_U32_e64 %30.sub1, %0, %65, 0, implicit $exec
     %62:vreg_64 = REG_SEQUENCE %63, %subreg.sub0, %64, %subreg.sub1
-    GLOBAL_STORE_DWORDX2_SADDR %32.sub0, %62, %1, 0, 0, 0, 0, implicit $exec, implicit $exec :: (store 8)
+    GLOBAL_STORE_DWORDX2_SADDR %32.sub0, %62, %1, 0, 0, 0, 0, 0, implicit $exec, implicit $exec :: (store 8)
 
 ...
 
@@ -386,5 +387,5 @@ body:             |
     %31:vreg_64 = COPY killed $vcc
     %64:vgpr_32, %66:sreg_64_xexec = V_ADDC_U32_e64 %30.sub1, %0, %65, 0, implicit $exec
     %62:vreg_64 = REG_SEQUENCE %63, %subreg.sub0, %64, %subreg.sub1
-    GLOBAL_STORE_DWORDX2_SADDR %31.sub0, %62, %1, 0, 0, 0, 0, implicit $exec, implicit $exec :: (store 8)
+    GLOBAL_STORE_DWORDX2_SADDR %31.sub0, %62, %1, 0, 0, 0, 0, 0, implicit $exec, implicit $exec :: (store 8)
 

diff  --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr-gfx10.mir b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr-gfx10.mir
index 55921cfed6bb..e6181a9cd49a 100644
--- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr-gfx10.mir
+++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr-gfx10.mir
@@ -80,7 +80,7 @@ body:             |
     %2 = COPY $sgpr30_sgpr31
     %1 = COPY $vgpr2_vgpr3
     %0 = COPY $vgpr0_vgpr1
-    %3 = FLAT_LOAD_DWORD %1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4)
+    %3 = FLAT_LOAD_DWORD %1, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4)
 
     %5 = S_MOV_B32 65535
     %6 = S_MOV_B32 65535
@@ -130,7 +130,7 @@ body:             |
 
     %100 = V_MOV_B32_e32 %48, implicit $exec
 
-    FLAT_STORE_DWORD %0, %100, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4)
+    FLAT_STORE_DWORD %0, %100, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4)
     $sgpr30_sgpr31 = COPY %2
     S_SETPC_B64_return $sgpr30_sgpr31
 
@@ -227,7 +227,7 @@ body:             |
     %2 = COPY $sgpr30_sgpr31
     %1 = COPY $vgpr2_vgpr3
     %0 = COPY $vgpr0_vgpr1
-    %3 = FLAT_LOAD_DWORD %1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4)
+    %3 = FLAT_LOAD_DWORD %1, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4)
 
     %5 = S_MOV_B32 65535
     %6 = S_MOV_B32 65535
@@ -286,7 +286,7 @@ body:             |
 
     %100 = V_MOV_B32_e32 %60, implicit $exec
 
-    FLAT_STORE_DWORD %0, %100, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4)
+    FLAT_STORE_DWORD %0, %100, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4)
     $sgpr30_sgpr31 = COPY %2
     S_SETPC_B64_return $sgpr30_sgpr31
 

diff  --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr.mir b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr.mir
index 322e640644e4..3b4688010aea 100644
--- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr.mir
+++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr.mir
@@ -89,7 +89,7 @@ body:             |
     %2 = COPY $sgpr30_sgpr31
     %1 = COPY $vgpr2_vgpr3
     %0 = COPY $vgpr0_vgpr1
-    %3 = FLAT_LOAD_DWORD %1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4)
+    %3 = FLAT_LOAD_DWORD %1, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4)
 
     %5 = S_MOV_B32 65535
     %6 = S_MOV_B32 65535
@@ -139,7 +139,7 @@ body:             |
 
     %100 = V_MOV_B32_e32 %48, implicit $exec
 
-    FLAT_STORE_DWORD %0, %100, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4)
+    FLAT_STORE_DWORD %0, %100, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4)
     $sgpr30_sgpr31 = COPY %2
     S_SETPC_B64_return $sgpr30_sgpr31
 
@@ -256,7 +256,7 @@ body:             |
     %2 = COPY $sgpr30_sgpr31
     %1 = COPY $vgpr2_vgpr3
     %0 = COPY $vgpr0_vgpr1
-    %3 = FLAT_LOAD_DWORD %1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4)
+    %3 = FLAT_LOAD_DWORD %1, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4)
 
     %5 = S_MOV_B32 65535
     %6 = S_MOV_B32 65535
@@ -315,7 +315,7 @@ body:             |
 
     %100 = V_MOV_B32_e32 %60, implicit $exec
 
-    FLAT_STORE_DWORD %0, %100, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4)
+    FLAT_STORE_DWORD %0, %100, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4)
     $sgpr30_sgpr31 = COPY %2
     S_SETPC_B64_return $sgpr30_sgpr31
 
@@ -400,7 +400,7 @@ body:             |
     %2 = COPY $sgpr30_sgpr31
     %1 = COPY $vgpr2_vgpr3
     %0 = COPY $vgpr0_vgpr1
-    %3 = FLAT_LOAD_DWORD %1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4)
+    %3 = FLAT_LOAD_DWORD %1, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4)
 
     %5 = S_MOV_B32 65535
     %6 = S_MOV_B32 65535
@@ -441,7 +441,7 @@ body:             |
 
     %100 = V_MOV_B32_e32 $vcc_lo, implicit $exec
 
-    FLAT_STORE_DWORD %0, %100, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4)
+    FLAT_STORE_DWORD %0, %100, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4)
     $sgpr30_sgpr31 = COPY %2
     S_SETPC_B64_return $sgpr30_sgpr31
 ...

diff  --git a/llvm/test/CodeGen/AMDGPU/sdwa-preserve.mir b/llvm/test/CodeGen/AMDGPU/sdwa-preserve.mir
index 905e3ec260b3..7229b8a118b6 100644
--- a/llvm/test/CodeGen/AMDGPU/sdwa-preserve.mir
+++ b/llvm/test/CodeGen/AMDGPU/sdwa-preserve.mir
@@ -36,8 +36,8 @@ body:             |
     %2 = COPY $sgpr30_sgpr31
     %1 = COPY $vgpr2_vgpr3
     %0 = COPY $vgpr0_vgpr1
-    %3 = FLAT_LOAD_DWORD %0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4)
-    %4 = FLAT_LOAD_DWORD %1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4)
+    %3 = FLAT_LOAD_DWORD %0, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4)
+    %4 = FLAT_LOAD_DWORD %1, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4)
 
     %5 = V_AND_B32_e32 65535, %3, implicit $exec
     %6 = V_LSHRREV_B32_e64 16, %4, implicit $exec
@@ -51,7 +51,7 @@ body:             |
 
     %13 = V_OR_B32_e64 %10, %12, implicit $exec
 
-    FLAT_STORE_DWORD %0, %13, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4)
+    FLAT_STORE_DWORD %0, %13, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4)
     $sgpr30_sgpr31 = COPY %2
     S_SETPC_B64_return $sgpr30_sgpr31
 
@@ -88,14 +88,14 @@ body:             |
     %2 = COPY $sgpr30_sgpr31
     %1 = COPY $vgpr2_vgpr3
     %0 = COPY $vgpr0_vgpr1
-    %3 = FLAT_LOAD_DWORD %0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4)
-    %4 = FLAT_LOAD_DWORD %1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4)
+    %3 = FLAT_LOAD_DWORD %0, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4)
+    %4 = FLAT_LOAD_DWORD %1, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4)
 
     %9:vgpr_32 = V_LSHRREV_B16_e64 8, %3, implicit $exec
     %10:sreg_32_xm0 = S_MOV_B32 255
     %11:vgpr_32 = V_AND_B32_e64 %3, killed %10, implicit $exec
     %17:vgpr_32 = V_MOV_B32_sdwa 0, %4, 0, 5, 2, 4, implicit $exec, implicit %11(tied-def 0)
-    FLAT_STORE_DWORD %0, %17, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4)
+    FLAT_STORE_DWORD %0, %17, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4)
     S_ENDPGM 0
 
 ...
@@ -131,14 +131,14 @@ body:             |
     %2 = COPY $sgpr30_sgpr31
     %1 = COPY $vgpr2_vgpr3
     %0 = COPY $vgpr0_vgpr1
-    %3 = FLAT_LOAD_DWORD %0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4)
-    %4 = FLAT_LOAD_DWORD %1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4)
+    %3 = FLAT_LOAD_DWORD %0, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4)
+    %4 = FLAT_LOAD_DWORD %1, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4)
 
     %9:vgpr_32 = V_LSHRREV_B16_e64 8, %3, implicit $exec
     %10:sreg_32_xm0 = S_MOV_B32 65535
     %11:vgpr_32 = V_AND_B32_e64 %3, killed %10, implicit $exec
     %17:vgpr_32 = V_MOV_B32_sdwa 0, %4, 0, 5, 2, 4, implicit $exec, implicit %11(tied-def 0)
-    FLAT_STORE_DWORD %0, %17, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4)
+    FLAT_STORE_DWORD %0, %17, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4)
     S_ENDPGM 0
 
 ...

diff  --git a/llvm/test/CodeGen/AMDGPU/sdwa-scalar-ops.mir b/llvm/test/CodeGen/AMDGPU/sdwa-scalar-ops.mir
index 4150339a9418..a1d8717e24ae 100644
--- a/llvm/test/CodeGen/AMDGPU/sdwa-scalar-ops.mir
+++ b/llvm/test/CodeGen/AMDGPU/sdwa-scalar-ops.mir
@@ -221,26 +221,26 @@ body:             |
     %15 = S_ADDC_U32 %7.sub1, %0.sub1, implicit-def dead $scc, implicit $scc
     %16 = REG_SEQUENCE %14, %subreg.sub0, %15, %subreg.sub1
     %18 = COPY %16
-    %17 = FLAT_LOAD_DWORD %18, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %ir.uglygep45)
+    %17 = FLAT_LOAD_DWORD %18, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %ir.uglygep45)
     %60 = V_BFE_U32_e64 %17, 8, 8, implicit $exec
     %61 = V_LSHLREV_B32_e32 2, killed %60, implicit $exec
     %70 = V_ADD_CO_U32_e32 %7.sub0, %61, implicit-def $vcc, implicit $exec
     %66 = COPY %13
     %65 = V_ADDC_U32_e32 0, %66, implicit-def $vcc, implicit $vcc, implicit $exec
     %67 = REG_SEQUENCE %70, %subreg.sub0, killed %65, %subreg.sub1
-    FLAT_STORE_DWORD %67, %30, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into %ir.tmp9)
+    FLAT_STORE_DWORD %67, %30, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into %ir.tmp9)
     %37 = S_ADD_U32 %14, 4, implicit-def $scc
     %38 = S_ADDC_U32 %15, 0, implicit-def dead $scc, implicit $scc
     %71 = COPY killed %37
     %72 = COPY killed %38
     %41 = REG_SEQUENCE killed %71, %subreg.sub0, killed %72, %subreg.sub1
-    %40 = FLAT_LOAD_DWORD killed %41, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %ir.scevgep)
+    %40 = FLAT_LOAD_DWORD killed %41, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %ir.scevgep)
     %73 = V_BFE_U32_e64 %40, 8, 8, implicit $exec
     %74 = V_LSHLREV_B32_e32 2, killed %73, implicit $exec
     %83 = V_ADD_CO_U32_e32 %7.sub0, %74, implicit-def $vcc, implicit $exec
     %78 = V_ADDC_U32_e32 0, %66, implicit-def $vcc, implicit $vcc, implicit $exec
     %80 = REG_SEQUENCE %83, %subreg.sub0, killed %78, %subreg.sub1
-    FLAT_STORE_DWORD %80, %30, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into %ir.tmp17)
+    FLAT_STORE_DWORD %80, %30, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into %ir.tmp17)
     %55 = S_ADD_U32 %0.sub0, 8, implicit-def $scc
     %56 = S_ADDC_U32 %0.sub1, 0, implicit-def dead $scc, implicit $scc
     %57 = REG_SEQUENCE %55, %subreg.sub0, killed %56, %subreg.sub1
@@ -384,26 +384,26 @@ body:             |
     %15 = S_ADDC_U32 %7.sub1, %0.sub1, implicit-def dead $scc, implicit $scc
     %16 = REG_SEQUENCE %14, %subreg.sub0, %15, %subreg.sub1
     %18 = COPY %16
-    %17 = FLAT_LOAD_DWORD %18, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %ir.uglygep45)
+    %17 = FLAT_LOAD_DWORD %18, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %ir.uglygep45)
     %60 = V_BFE_U32_e64 %17, 8, 8, implicit $exec
     %61 = V_LSHLREV_B32_e32 %84, killed %60, implicit $exec
     %70 = V_ADD_CO_U32_e32 %7.sub0, %61, implicit-def $vcc, implicit $exec
     %66 = COPY %13
     %65 = V_ADDC_U32_e32 0, %66, implicit-def $vcc, implicit $vcc, implicit $exec
     %67 = REG_SEQUENCE %70, %subreg.sub0, killed %65, %subreg.sub1
-    FLAT_STORE_DWORD %67, %30, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into %ir.tmp9)
+    FLAT_STORE_DWORD %67, %30, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into %ir.tmp9)
     %37 = S_ADD_U32 %14, 4, implicit-def $scc
     %38 = S_ADDC_U32 %15, 0, implicit-def dead $scc, implicit $scc
     %71 = COPY killed %37
     %72 = COPY killed %38
     %41 = REG_SEQUENCE killed %71, %subreg.sub0, killed %72, %subreg.sub1
-    %40 = FLAT_LOAD_DWORD killed %41, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %ir.scevgep)
+    %40 = FLAT_LOAD_DWORD killed %41, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %ir.scevgep)
     %73 = V_BFE_U32_e64 %40, 8, 8, implicit $exec
     %74 = V_LSHLREV_B32_e32 %84, killed %73, implicit $exec
     %83 = V_ADD_CO_U32_e32 %7.sub0, %74, implicit-def $vcc, implicit $exec
     %78 = V_ADDC_U32_e32 0, %66, implicit-def $vcc, implicit $vcc, implicit $exec
     %80 = REG_SEQUENCE %83, %subreg.sub0, killed %78, %subreg.sub1
-    FLAT_STORE_DWORD %80, %30, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into %ir.tmp17)
+    FLAT_STORE_DWORD %80, %30, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into %ir.tmp17)
     %55 = S_ADD_U32 %0.sub0, 8, implicit-def $scc
     %56 = S_ADDC_U32 %0.sub1, 0, implicit-def dead $scc, implicit $scc
     %57 = REG_SEQUENCE %55, %subreg.sub0, killed %56, %subreg.sub1

diff  --git a/llvm/test/CodeGen/AMDGPU/sdwa-vop2-64bit.mir b/llvm/test/CodeGen/AMDGPU/sdwa-vop2-64bit.mir
index 973d0b088b44..8e46fb262ac1 100644
--- a/llvm/test/CodeGen/AMDGPU/sdwa-vop2-64bit.mir
+++ b/llvm/test/CodeGen/AMDGPU/sdwa-vop2-64bit.mir
@@ -41,7 +41,7 @@ body:             |
     %2 = COPY $sgpr30_sgpr31
     %1 = COPY $vgpr2_vgpr3
     %0 = COPY $vgpr0_vgpr1
-    %3 = FLAT_LOAD_DWORD %1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4)
+    %3 = FLAT_LOAD_DWORD %1, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4)
 
     %12 = V_LSHRREV_B32_e64 16, %3, implicit $exec
     %13 = V_BCNT_U32_B32_e64 %3, killed %12, implicit-def $vcc, implicit $exec
@@ -56,6 +56,6 @@ body:             |
     %19 = V_READLANE_B32 killed %18, 0, implicit-def $vcc, implicit $exec
     %20 = V_MOV_B32_e64 %19, implicit $exec
 
-    FLAT_STORE_DWORD %0, %20, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4)
+    FLAT_STORE_DWORD %0, %20, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4)
     $sgpr30_sgpr31 = COPY %2
     S_SETPC_B64_return $sgpr30_sgpr31

diff  --git a/llvm/test/CodeGen/AMDGPU/sgpr-spill-wrong-stack-id.mir b/llvm/test/CodeGen/AMDGPU/sgpr-spill-wrong-stack-id.mir
index 782227035759..2554e06569c3 100644
--- a/llvm/test/CodeGen/AMDGPU/sgpr-spill-wrong-stack-id.mir
+++ b/llvm/test/CodeGen/AMDGPU/sgpr-spill-wrong-stack-id.mir
@@ -84,7 +84,7 @@ body:             |
   bb.0:
     %0:sreg_32_xm0 = COPY $sgpr32
     %1:vreg_64 = IMPLICIT_DEF
-    %2:vgpr_32 = FLAT_LOAD_DWORD %1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    %2:vgpr_32 = FLAT_LOAD_DWORD %1, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
     %3:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @func + 4, target-flags(amdgpu-rel32-hi) @func + 4, implicit-def dead $scc
     ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
     dead $sgpr30_sgpr31 = SI_CALL %3, @func, csr_amdgpu_highregs, implicit undef $vgpr0

diff  --git a/llvm/test/CodeGen/AMDGPU/shrink-carry.mir b/llvm/test/CodeGen/AMDGPU/shrink-carry.mir
index b6542165a394..27c348ac581b 100644
--- a/llvm/test/CodeGen/AMDGPU/shrink-carry.mir
+++ b/llvm/test/CodeGen/AMDGPU/shrink-carry.mir
@@ -21,7 +21,7 @@ body:             |
     %2 = IMPLICIT_DEF
     %3 = V_CMP_GT_U32_e64 %0, %1, implicit $exec
     %4, %5 = V_SUBBREV_U32_e64 0, %0, %3, 0, implicit $exec
-    GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, %4, 0, 0, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, %4, 0, 0, 0, 0, 0, implicit $exec
 
 ...
 
@@ -46,7 +46,7 @@ body:             |
     %2 = IMPLICIT_DEF
     %3 = V_CMP_GT_U32_e64 %0, %1, implicit $exec
     %4, %5 = V_SUBB_U32_e64 %0, 0, %3, 0, implicit $exec
-    GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, %4, 0, 0, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, %4, 0, 0, 0, 0, 0, implicit $exec
 
 ...
 
@@ -71,7 +71,7 @@ body:             |
     %2 = IMPLICIT_DEF
     %3 = V_CMP_GT_U32_e64 %0, %1, implicit $exec
     %4, %5 = V_ADDC_U32_e64 0, %0, %3, 0, implicit $exec
-    GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, %4, 0, 0, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, %4, 0, 0, 0, 0, 0, implicit $exec
 
 ...
 
@@ -96,6 +96,6 @@ body:             |
     %2 = IMPLICIT_DEF
     %3 = V_CMP_GT_U32_e64 %0, %1, implicit $exec
     %4, %5 = V_ADDC_U32_e64 %0, 0, %3, 0, implicit $exec
-    GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, %4, 0, 0, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, %4, 0, 0, 0, 0, 0, implicit $exec
 
 ...

diff  --git a/llvm/test/CodeGen/AMDGPU/shrink-vop3-carry-out.mir b/llvm/test/CodeGen/AMDGPU/shrink-vop3-carry-out.mir
index dbcacb5c9a33..168392bb1750 100644
--- a/llvm/test/CodeGen/AMDGPU/shrink-vop3-carry-out.mir
+++ b/llvm/test/CodeGen/AMDGPU/shrink-vop3-carry-out.mir
@@ -81,11 +81,11 @@ body:             |
     %13 = REG_SEQUENCE killed %5, 17, %12, 18
     %28 = V_LSHL_B64_e64 killed %27, 2, implicit $exec
     %16 = REG_SEQUENCE killed %4, 17, %12, 18
-    %17 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 0, 0, 0, 0, 0, 0, implicit $exec
-    %19 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 4, 0, 0, 0, 0, 0, implicit $exec
+    %17 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    %19 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 4, 0, 0, 0, 0, 0, 0, implicit $exec
     %29, %9 = V_ADD_CO_U32_e64 %19, %17, 0, implicit $exec
     %24 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed %9, implicit $exec
-    BUFFER_STORE_DWORD_ADDR64 %24, %28, killed %16, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    BUFFER_STORE_DWORD_ADDR64 %24, %28, killed %16, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     S_ENDPGM 0
 
 ...
@@ -165,11 +165,11 @@ body:             |
     %13 = REG_SEQUENCE killed %5, 17, %12, 18
     %28 = V_LSHL_B64_e64 killed %27, 2, implicit $exec
     %16 = REG_SEQUENCE killed %4, 17, %12, 18
-    %17 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 0, 0, 0, 0, 0, 0, implicit $exec
-    %19 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 4, 0, 0, 0, 0, 0, implicit $exec
+    %17 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    %19 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 4, 0, 0, 0, 0, 0, 0, implicit $exec
     %29, %9 = V_SUB_CO_U32_e64 %19, %17, 0, implicit $exec
     %24 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed %9, implicit $exec
-    BUFFER_STORE_DWORD_ADDR64 %24, %28, killed %16, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    BUFFER_STORE_DWORD_ADDR64 %24, %28, killed %16, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     S_ENDPGM 0
 
 ...
@@ -249,11 +249,11 @@ body:             |
     %13 = REG_SEQUENCE killed %5, 17, %12, 18
     %28 = V_LSHL_B64_e64 killed %27, 2, implicit $exec
     %16 = REG_SEQUENCE killed %4, 17, %12, 18
-    %17 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 0, 0, 0, 0, 0, 0, implicit $exec
-    %19 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 4, 0, 0, 0, 0, 0, implicit $exec
+    %17 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    %19 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 4, 0, 0, 0, 0, 0, 0, implicit $exec
     %29, %9 = V_SUBREV_CO_U32_e64 %19, %17, 0, implicit $exec
     %24 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed %9, implicit $exec
-    BUFFER_STORE_DWORD_ADDR64 %29, %28, killed %16, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    BUFFER_STORE_DWORD_ADDR64 %29, %28, killed %16, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     S_ENDPGM 0
 
 ...
@@ -332,12 +332,12 @@ body:             |
     %13 = REG_SEQUENCE killed %5, 17, %12, 18
     %28 = V_LSHL_B64_e64 killed %27, 2, implicit $exec
     %16 = REG_SEQUENCE killed %4, 17, %12, 18
-    %17 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 0, 0, 0, 0, 0, 0, implicit $exec
-    %19 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 4, 0, 0, 0, 0, 0, implicit $exec
+    %17 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    %19 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 4, 0, 0, 0, 0, 0, 0, implicit $exec
     %9 = S_MOV_B64 0
     %29, $vcc = V_ADDC_U32_e64 %19, %17, %9, 0, implicit $exec
     %24 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed $vcc, implicit $exec
-    BUFFER_STORE_DWORD_ADDR64 %24, %28, killed %16, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    BUFFER_STORE_DWORD_ADDR64 %24, %28, killed %16, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     S_ENDPGM 0
 
 ...
@@ -417,12 +417,12 @@ body:             |
     %13 = REG_SEQUENCE killed %5, 17, %12, 18
     %28 = V_LSHL_B64_e64 killed %27, 2, implicit $exec
     %16 = REG_SEQUENCE killed %4, 17, %12, 18
-    %17 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 0, 0, 0, 0, 0, 0, implicit $exec
-    %19 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 4, 0, 0, 0, 0, 0, implicit $exec
+    %17 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    %19 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 4, 0, 0, 0, 0, 0, 0, implicit $exec
     $vcc = S_MOV_B64 0
     %29, $vcc = V_ADDC_U32_e64 %19, %17, $vcc, 0, implicit $exec
     %24 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed $vcc, implicit $exec
-    BUFFER_STORE_DWORD_ADDR64 %24, %28, killed %16, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    BUFFER_STORE_DWORD_ADDR64 %24, %28, killed %16, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     S_ENDPGM 0
 
 ...
@@ -502,11 +502,11 @@ body:             |
     %13 = REG_SEQUENCE killed %5, 17, %12, 18
     %28 = V_LSHL_B64_e64 killed %27, 2, implicit $exec
     %16 = REG_SEQUENCE killed %4, 17, %12, 18
-    %17 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 0, 0, 0, 0, 0, 0, implicit $exec
-    %19 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 4, 0, 0, 0, 0, 0, implicit $exec
+    %17 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    %19 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 4, 0, 0, 0, 0, 0, 0, implicit $exec
     %29, $vcc = V_ADDC_U32_e64 %19, %17, undef $vcc, 0, implicit $exec
     %24 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed $vcc, implicit $exec
-    BUFFER_STORE_DWORD_ADDR64 %24, %28, killed %16, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    BUFFER_STORE_DWORD_ADDR64 %24, %28, killed %16, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     S_ENDPGM 0
 
 ...

diff  --git a/llvm/test/CodeGen/AMDGPU/skip-branch-taildup-ret.mir b/llvm/test/CodeGen/AMDGPU/skip-branch-taildup-ret.mir
index 3daf2b88943f..ffcad230d0d0 100644
--- a/llvm/test/CodeGen/AMDGPU/skip-branch-taildup-ret.mir
+++ b/llvm/test/CodeGen/AMDGPU/skip-branch-taildup-ret.mir
@@ -15,7 +15,7 @@ body:             |
   ; CHECK:   $vgpr1 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $exec
   ; CHECK:   renamable $vgpr0 = V_ADD_CO_U32_e32 $sgpr0, killed $vgpr0, implicit-def $vcc, implicit $exec
   ; CHECK:   renamable $vgpr1 = V_ADDC_U32_e32 0, killed $vgpr1, implicit-def $vcc, implicit killed $vcc, implicit $exec
-  ; CHECK:   renamable $vgpr0 = FLAT_LOAD_DWORD renamable $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4, addrspace 1)
+  ; CHECK:   renamable $vgpr0 = FLAT_LOAD_DWORD renamable $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4, addrspace 1)
   ; CHECK:   renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 0, 0, 0 :: (dereferenceable invariant load 8, align 16, addrspace 4)
   ; CHECK:   S_WAITCNT 112
   ; CHECK:   V_CMP_NE_U32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $exec
@@ -61,7 +61,7 @@ body:             |
     $vgpr1 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $exec
     renamable $vgpr0 = V_ADD_CO_U32_e32 $sgpr0, killed $vgpr0, implicit-def $vcc, implicit $exec
     renamable $vgpr1 = V_ADDC_U32_e32 0, killed $vgpr1, implicit-def $vcc, implicit killed $vcc, implicit $exec
-    renamable $vgpr0 = FLAT_LOAD_DWORD renamable $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4, addrspace 1)
+    renamable $vgpr0 = FLAT_LOAD_DWORD renamable $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4, addrspace 1)
     renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 0, 0, 0 :: (dereferenceable invariant load 8, align 16, addrspace 4)
     S_WAITCNT 112
     V_CMP_NE_U32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $exec

diff  --git a/llvm/test/CodeGen/AMDGPU/spill-agpr-partially-undef.mir b/llvm/test/CodeGen/AMDGPU/spill-agpr-partially-undef.mir
index f41cb8b50be9..454d653dcb2e 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-agpr-partially-undef.mir
+++ b/llvm/test/CodeGen/AMDGPU/spill-agpr-partially-undef.mir
@@ -17,9 +17,9 @@ body:             |
     ; CHECK-LABEL: name: spill_a64_kill
     ; CHECK: liveins: $agpr0_agpr1
     ; CHECK: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit-def $agpr0_agpr1
-    ; CHECK: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1 :: (store 4 into %stack.0, addrspace 5)
+    ; CHECK: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1 :: (store 4 into %stack.0, addrspace 5)
     ; CHECK: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec
-    ; CHECK: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, implicit $exec, implicit killed $agpr0_agpr1 :: (store 4 into %stack.0 + 4, addrspace 5)
+    ; CHECK: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec, implicit killed $agpr0_agpr1 :: (store 4 into %stack.0 + 4, addrspace 5)
     SI_SPILL_A64_SAVE killed $agpr0_agpr1, %stack.0, $sgpr32, 0, implicit $exec :: (store 8 into %stack.0, addrspace 5)
 ...
 
@@ -41,9 +41,9 @@ body:             |
     ; CHECK-LABEL: name: spill_a64_undef_sub1_killed
     ; CHECK: liveins: $agpr0
     ; CHECK: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit-def $agpr0_agpr1
-    ; CHECK: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1 :: (store 4 into %stack.0, addrspace 5)
+    ; CHECK: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1 :: (store 4 into %stack.0, addrspace 5)
     ; CHECK: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec
-    ; CHECK: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, implicit $exec, implicit killed $agpr0_agpr1 :: (store 4 into %stack.0 + 4, addrspace 5)
+    ; CHECK: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec, implicit killed $agpr0_agpr1 :: (store 4 into %stack.0 + 4, addrspace 5)
     SI_SPILL_A64_SAVE killed $agpr0_agpr1, %stack.0, $sgpr32, 0, implicit $exec :: (store 8 into %stack.0, addrspace 5)
 ...
 
@@ -63,8 +63,8 @@ body:             |
     ; CHECK-LABEL: name: spill_a64_undef_sub0_killed
     ; CHECK: liveins: $agpr1
     ; CHECK: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit-def $agpr0_agpr1
-    ; CHECK: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1 :: (store 4 into %stack.0, addrspace 5)
+    ; CHECK: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1 :: (store 4 into %stack.0, addrspace 5)
     ; CHECK: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec
-    ; CHECK: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, implicit $exec, implicit killed $agpr0_agpr1 :: (store 4 into %stack.0 + 4, addrspace 5)
+    ; CHECK: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec, implicit killed $agpr0_agpr1 :: (store 4 into %stack.0 + 4, addrspace 5)
     SI_SPILL_A64_SAVE killed $agpr0_agpr1, %stack.0, $sgpr32, 0, implicit $exec :: (store 8 into %stack.0, addrspace 5)
 ...

diff  --git a/llvm/test/CodeGen/AMDGPU/spill-agpr.ll b/llvm/test/CodeGen/AMDGPU/spill-agpr.ll
index 511d02a104b3..91965f621efb 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-agpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill-agpr.ll
@@ -1,15 +1,23 @@
 ; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX908,A2V %s
-; RUN: llc -march=amdgcn -mcpu=gfx908 -amdgpu-spill-vgpr-to-agpr=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX908,A2M %s
+; RUN: llc -march=amdgcn -mcpu=gfx908 -amdgpu-spill-vgpr-to-agpr=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX908,GFX908-A2M,A2M %s
+; RUN: llc -march=amdgcn -mcpu=gfx90a -amdgpu-spill-vgpr-to-agpr=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX90A,GFX90A-A2M,A2M %s
 
 ; GCN-LABEL: {{^}}max_24regs_32a_used:
-; A2M-DAG:    s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0
-; A2M-DAG:    s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
-; A2V-NOT:    SCRATCH_RSRC
-; GFX908-DAG: v_accvgpr_read_b32 v[[VSPILL:[0-9]+]], a0 ; Reload Reuse
-; A2M:        buffer_store_dword v[[VSPILL]], off, s[{{[0-9:]+}}], 0 offset:[[FI:[0-9]+]] ; 4-byte Folded Spill
-; A2M:        buffer_load_dword v[[VSPILL:[0-9]+]], off, s[{{[0-9:]+}}], 0 offset:[[FI]] ; 4-byte Folded Reload
-; GFX908:     v_accvgpr_write_b32 a{{[0-9]+}}, v[[VSPILL]] ; Reload Reuse
-; A2V:        ScratchSize: 0
+; A2M-DAG:        s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0
+; A2M-DAG:        s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
+; GCN-DAG:        v_mfma_f32_16x16x1f32
+; GCN-DAG:        v_mfma_f32_16x16x1f32
+; A2V-NOT:        SCRATCH_RSRC
+; GFX908-A2M-DAG: v_accvgpr_read_b32 v[[VSPILL:[0-9]+]], a0 ; Reload Reuse
+; A2V:            v_accvgpr_read_b32 v[[VSPILL:[0-9]+]], a0 ; Reload Reuse
+; GFX908-A2M:     buffer_store_dword v[[VSPILL]], off, s[{{[0-9:]+}}], 0 offset:[[FI:[0-9]+]] ; 4-byte Folded Spill
+; GFX908-A2M:     buffer_load_dword v[[VSPILL:[0-9]+]], off, s[{{[0-9:]+}}], 0 offset:[[FI]] ; 4-byte Folded Reload
+; GFX90A-NOT:     v_accvgpr_read_b32
+; GFX90A-A2M:     buffer_store_dword a{{[0-9]+}}, off, s[{{[0-9:]+}}], 0 offset:[[FI:[0-9]+]] ; 4-byte Folded Spill
+; GFX90A-A2M:     buffer_load_dword a{{[0-9]+}}, off, s[{{[0-9:]+}}], 0 offset:[[FI]] ; 4-byte Folded Reload
+; GFX908:         v_accvgpr_write_b32 a{{[0-9]+}}, v[[VSPILL]] ; Reload Reuse
+; GFX90A-NOT:     v_accvgpr_write_b32
+; A2V:            ScratchSize: 0
 define amdgpu_kernel void @max_24regs_32a_used(<16 x float> addrspace(1)* %arg, float addrspace(1)* %out) #0 {
 bb:
   %in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg
@@ -35,8 +43,10 @@ bb:
 ; A2M-DAG:    s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
 ; A2V-NOT:    SCRATCH_RSRC
 ; GFX908-DAG: v_accvgpr_read_b32 v[[VSPILL:[0-9]+]], a{{[0-9]+}} ; Reload Reuse
-; A2M:        buffer_store_dword v[[VSPILL]], off, s[{{[0-9:]+}}], 0 offset:[[FI:[0-9]+]] ; 4-byte Folded Spill
-; A2M:        buffer_load_dword v[[VSPILL:[0-9]+]], off, s[{{[0-9:]+}}], 0 offset:[[FI]] ; 4-byte Folded Reload
+; GFX908-A2M: buffer_store_dword v[[VSPILL]], off, s[{{[0-9:]+}}], 0 offset:[[FI:[0-9]+]] ; 4-byte Folded Spill
+; GFX908-A2M: buffer_load_dword v[[VSPILL:[0-9]+]], off, s[{{[0-9:]+}}], 0 offset:[[FI]] ; 4-byte Folded Reload
+; GFX90A-A2M: buffer_store_dword a{{[0-9]+}}, off, s[{{[0-9:]+}}], 0 offset:[[FI:[0-9]+]] ; 4-byte Folded Spill
+; GFX90A-A2M: buffer_load_dword a{{[0-9]+}}, off, s[{{[0-9:]+}}], 0 offset:[[FI]] ; 4-byte Folded Reload
 ; A2V:        v_accvgpr_write_b32 a{{[0-9]+}}, v[[VSPILL]] ; Reload Reuse
 ; A2V:        ScratchSize: 0
 define amdgpu_kernel void @max_12regs_13a_used(i32 %cond, <4 x float> addrspace(1)* %arg, <4 x float> addrspace(1)* %out) #2 {
@@ -60,17 +70,17 @@ st:
 }
 
 ; GCN-LABEL: {{^}}max_10_vgprs_used_9a:
-; A2M-DAG:    s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0
-; A2M-DAG:    s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
+; GFX908-A2M-DAG:    s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0
+; GFX908-A2M-DAG:    s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
 ; A2V-NOT:    SCRATCH_RSRC
 
 ; A2V: v_accvgpr_read_b32 v[[VSPILL:[0-9]+]], a{{[0-9]+}} ; Reload Reuse
 ; A2V: v_accvgpr_write_b32 a{{[0-9]+}}, v[[VSPILL]] ; Reload Reuse
 ; A2V: ScratchSize: 0
 
-; A2M: buffer_store_dword v[[VSPILLSTORE:[0-9]+]], off, s[{{[0-9:]+}}], 0 offset:[[FI:[0-9]+]] ; 4-byte Folded Spill
-; A2M: buffer_load_dword v[[VSPILL_RELOAD:[0-9]+]], off, s[{{[0-9:]+}}], 0 offset:[[FI]] ; 4-byte Folded Reload
-; A2M: v_accvgpr_write_b32 a{{[0-9]+}}, v[[VSPILL_RELOAD]] ; Reload Reuse
+; GFX908-A2M: buffer_store_dword v[[VSPILLSTORE:[0-9]+]], off, s[{{[0-9:]+}}], 0 offset:[[FI:[0-9]+]] ; 4-byte Folded Spill
+; GFX908-A2M: buffer_load_dword v[[VSPILL_RELOAD:[0-9]+]], off, s[{{[0-9:]+}}], 0 offset:[[FI]] ; 4-byte Folded Reload
+; GFX908-A2M: v_accvgpr_write_b32 a{{[0-9]+}}, v[[VSPILL_RELOAD]] ; Reload Reuse
 define amdgpu_kernel void @max_10_vgprs_used_9a() #1 {
   %a1 = call <4 x i32> asm sideeffect "", "=a"()
   %a2 = call <4 x i32> asm sideeffect "", "=a"()
@@ -86,9 +96,12 @@ define amdgpu_kernel void @max_10_vgprs_used_9a() #1 {
 ; A2M-DAG:    s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
 ; A2V-NOT:    SCRATCH_RSRC
 ; GFX908-DAG: v_accvgpr_read_b32 v[[VSPILL:[0-9]+]], a0 ; Reload Reuse
-; A2M:        buffer_store_dword v[[VSPILL]], off, s[{{[0-9:]+}}], 0 offset:[[FI:[0-9]+]] ; 4-byte Folded Spill
-; A2M:        buffer_load_dword v[[VSPILL:[0-9]+]], off, s[{{[0-9:]+}}], 0 offset:[[FI]] ; 4-byte Folded Reload
+; GFX908-A2M: buffer_store_dword v[[VSPILL]], off, s[{{[0-9:]+}}], 0 offset:[[FI:[0-9]+]] ; 4-byte Folded Spill
+; GFX90A-NOT: v_accvgpr_read_b32
+; GFX90A:     v_mfma_f32_32x32x1f32
+; GFX908-A2M: buffer_load_dword v[[VSPILL:[0-9]+]], off, s[{{[0-9:]+}}], 0 offset:[[FI]] ; 4-byte Folded Reload
 ; GFX908:     v_accvgpr_write_b32 a{{[0-9]+}}, v[[VSPILL]] ; Reload Reuse
+; GFX90A-NOT: v_accvgpr_write_b32
 ; A2V:        ScratchSize: 0
 define amdgpu_kernel void @max_32regs_mfma32(float addrspace(1)* %arg) #3 {
 bb:

diff  --git a/llvm/test/CodeGen/AMDGPU/spill-agpr.mir b/llvm/test/CodeGen/AMDGPU/spill-agpr.mir
index 10c960eee2f8..6c2cc15c6dfb 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-agpr.mir
+++ b/llvm/test/CodeGen/AMDGPU/spill-agpr.mir
@@ -1,6 +1,8 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -march=amdgcn -mcpu=gfx908 -run-pass=regallocfast -o - %s | FileCheck -check-prefix=SPILLED %s
-# RUN: llc -march=amdgcn -mcpu=gfx908 -run-pass=regallocfast,si-lower-sgpr-spills,prologepilog -o - %s | FileCheck -check-prefix=EXPANDED %s
+# RUN: llc -march=amdgcn -mcpu=gfx908 -run-pass=regallocfast -o - %s | FileCheck -check-prefix=GFX908-SPILLED %s
+# RUN: llc -march=amdgcn -mcpu=gfx908 -run-pass=regallocfast,si-lower-sgpr-spills,prologepilog -o - %s | FileCheck -check-prefix=GFX908-EXPANDED %s
+# RUN: llc -march=amdgcn -mcpu=gfx90a -run-pass=regallocfast -o - %s | FileCheck -check-prefix=GFX90A-SPILLED %s
+# RUN: llc -march=amdgcn -mcpu=gfx90a -run-pass=regallocfast,si-lower-sgpr-spills,prologepilog -o - %s | FileCheck -check-prefix=GFX90A-EXPANDED %s
 
 ---
 name: spill_restore_agpr32
@@ -9,6 +11,72 @@ machineFunctionInfo:
   scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
   stackPtrOffsetReg: $sgpr32
 body: |
+  ; GFX908-SPILLED-LABEL: name: spill_restore_agpr32
+  ; GFX908-SPILLED: bb.0:
+  ; GFX908-SPILLED:   successors: %bb.1(0x80000000)
+  ; GFX908-SPILLED:   S_NOP 0, implicit-def renamable $agpr0
+  ; GFX908-SPILLED:   SI_SPILL_A32_SAVE killed $agpr0, %stack.1, $sgpr32, 0, implicit $exec :: (store 4 into %stack.1, addrspace 5)
+  ; GFX908-SPILLED:   S_NOP 0, implicit-def renamable $agpr0
+  ; GFX908-SPILLED:   SI_SPILL_A32_SAVE killed $agpr0, %stack.0, $sgpr32, 0, implicit $exec :: (store 4 into %stack.0, addrspace 5)
+  ; GFX908-SPILLED:   S_CBRANCH_SCC1 %bb.1, implicit undef $scc
+  ; GFX908-SPILLED: bb.1:
+  ; GFX908-SPILLED:   successors: %bb.2(0x80000000)
+  ; GFX908-SPILLED:   S_NOP 1
+  ; GFX908-SPILLED: bb.2:
+  ; GFX908-SPILLED:   $agpr0 = SI_SPILL_A32_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load 4 from %stack.1, addrspace 5)
+  ; GFX908-SPILLED:   $agpr1 = SI_SPILL_A32_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load 4 from %stack.0, addrspace 5)
+  ; GFX908-SPILLED:   S_NOP 0, implicit killed renamable $agpr0, implicit killed renamable $agpr1
+  ; GFX908-EXPANDED-LABEL: name: spill_restore_agpr32
+  ; GFX908-EXPANDED: bb.0:
+  ; GFX908-EXPANDED:   successors: %bb.1(0x80000000)
+  ; GFX908-EXPANDED:   liveins: $vgpr0, $vgpr1
+  ; GFX908-EXPANDED:   S_NOP 0, implicit-def renamable $agpr0
+  ; GFX908-EXPANDED:   $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec
+  ; GFX908-EXPANDED:   S_NOP 0, implicit-def renamable $agpr0
+  ; GFX908-EXPANDED:   $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec
+  ; GFX908-EXPANDED:   S_CBRANCH_SCC1 %bb.1, implicit undef $scc
+  ; GFX908-EXPANDED: bb.1:
+  ; GFX908-EXPANDED:   successors: %bb.2(0x80000000)
+  ; GFX908-EXPANDED:   liveins: $vgpr0, $vgpr1
+  ; GFX908-EXPANDED:   S_NOP 1
+  ; GFX908-EXPANDED: bb.2:
+  ; GFX908-EXPANDED:   liveins: $vgpr0, $vgpr1
+  ; GFX908-EXPANDED:   $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec
+  ; GFX908-EXPANDED:   $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec
+  ; GFX908-EXPANDED:   S_NOP 0, implicit killed renamable $agpr0, implicit killed renamable $agpr1
+  ; GFX90A-SPILLED-LABEL: name: spill_restore_agpr32
+  ; GFX90A-SPILLED: bb.0:
+  ; GFX90A-SPILLED:   successors: %bb.1(0x80000000)
+  ; GFX90A-SPILLED:   S_NOP 0, implicit-def renamable $agpr0
+  ; GFX90A-SPILLED:   SI_SPILL_A32_SAVE killed $agpr0, %stack.1, $sgpr32, 0, implicit $exec :: (store 4 into %stack.1, addrspace 5)
+  ; GFX90A-SPILLED:   S_NOP 0, implicit-def renamable $agpr0
+  ; GFX90A-SPILLED:   SI_SPILL_A32_SAVE killed $agpr0, %stack.0, $sgpr32, 0, implicit $exec :: (store 4 into %stack.0, addrspace 5)
+  ; GFX90A-SPILLED:   S_CBRANCH_SCC1 %bb.1, implicit undef $scc
+  ; GFX90A-SPILLED: bb.1:
+  ; GFX90A-SPILLED:   successors: %bb.2(0x80000000)
+  ; GFX90A-SPILLED:   S_NOP 1
+  ; GFX90A-SPILLED: bb.2:
+  ; GFX90A-SPILLED:   $agpr0 = SI_SPILL_A32_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load 4 from %stack.1, addrspace 5)
+  ; GFX90A-SPILLED:   $agpr1 = SI_SPILL_A32_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load 4 from %stack.0, addrspace 5)
+  ; GFX90A-SPILLED:   S_NOP 0, implicit killed renamable $agpr0, implicit killed renamable $agpr1
+  ; GFX90A-EXPANDED-LABEL: name: spill_restore_agpr32
+  ; GFX90A-EXPANDED: bb.0:
+  ; GFX90A-EXPANDED:   successors: %bb.1(0x80000000)
+  ; GFX90A-EXPANDED:   liveins: $vgpr0, $vgpr1
+  ; GFX90A-EXPANDED:   S_NOP 0, implicit-def renamable $agpr0
+  ; GFX90A-EXPANDED:   $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec
+  ; GFX90A-EXPANDED:   S_NOP 0, implicit-def renamable $agpr0
+  ; GFX90A-EXPANDED:   $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec
+  ; GFX90A-EXPANDED:   S_CBRANCH_SCC1 %bb.1, implicit undef $scc
+  ; GFX90A-EXPANDED: bb.1:
+  ; GFX90A-EXPANDED:   successors: %bb.2(0x80000000)
+  ; GFX90A-EXPANDED:   liveins: $vgpr0, $vgpr1
+  ; GFX90A-EXPANDED:   S_NOP 1
+  ; GFX90A-EXPANDED: bb.2:
+  ; GFX90A-EXPANDED:   liveins: $vgpr0, $vgpr1
+  ; GFX90A-EXPANDED:   $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec
+  ; GFX90A-EXPANDED:   $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec
+  ; GFX90A-EXPANDED:   S_NOP 0, implicit killed renamable $agpr0, implicit killed renamable $agpr1
   ; SPILLED-LABEL: name: spill_restore_agpr32
   ; SPILLED: bb.0:
   ; SPILLED:   successors: %bb.1(0x80000000)
@@ -61,6 +129,64 @@ machineFunctionInfo:
   scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
   stackPtrOffsetReg: $sgpr32
 body: |
+  ; GFX908-SPILLED-LABEL: name: spill_restore_agpr64
+  ; GFX908-SPILLED: bb.0:
+  ; GFX908-SPILLED:   successors: %bb.1(0x80000000)
+  ; GFX908-SPILLED:   S_NOP 0, implicit-def renamable $agpr0_agpr1
+  ; GFX908-SPILLED:   SI_SPILL_A64_SAVE killed $agpr0_agpr1, %stack.0, $sgpr32, 0, implicit $exec :: (store 8 into %stack.0, align 4, addrspace 5)
+  ; GFX908-SPILLED:   S_CBRANCH_SCC1 %bb.1, implicit undef $scc
+  ; GFX908-SPILLED: bb.1:
+  ; GFX908-SPILLED:   successors: %bb.2(0x80000000)
+  ; GFX908-SPILLED:   S_NOP 1
+  ; GFX908-SPILLED: bb.2:
+  ; GFX908-SPILLED:   $agpr0_agpr1 = SI_SPILL_A64_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load 8 from %stack.0, align 4, addrspace 5)
+  ; GFX908-SPILLED:   S_NOP 0, implicit killed renamable $agpr0_agpr1
+  ; GFX908-EXPANDED-LABEL: name: spill_restore_agpr64
+  ; GFX908-EXPANDED: bb.0:
+  ; GFX908-EXPANDED:   successors: %bb.1(0x80000000)
+  ; GFX908-EXPANDED:   liveins: $vgpr0, $vgpr1
+  ; GFX908-EXPANDED:   S_NOP 0, implicit-def renamable $agpr0_agpr1
+  ; GFX908-EXPANDED:   $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit-def $agpr0_agpr1, implicit $agpr0_agpr1
+  ; GFX908-EXPANDED:   $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec, implicit killed $agpr0_agpr1
+  ; GFX908-EXPANDED:   S_CBRANCH_SCC1 %bb.1, implicit undef $scc
+  ; GFX908-EXPANDED: bb.1:
+  ; GFX908-EXPANDED:   successors: %bb.2(0x80000000)
+  ; GFX908-EXPANDED:   liveins: $vgpr0, $vgpr1
+  ; GFX908-EXPANDED:   S_NOP 1
+  ; GFX908-EXPANDED: bb.2:
+  ; GFX908-EXPANDED:   liveins: $vgpr0, $vgpr1
+  ; GFX908-EXPANDED:   $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1
+  ; GFX908-EXPANDED:   $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit-def $agpr0_agpr1
+  ; GFX908-EXPANDED:   S_NOP 0, implicit killed renamable $agpr0_agpr1
+  ; GFX90A-SPILLED-LABEL: name: spill_restore_agpr64
+  ; GFX90A-SPILLED: bb.0:
+  ; GFX90A-SPILLED:   successors: %bb.1(0x80000000)
+  ; GFX90A-SPILLED:   S_NOP 0, implicit-def renamable $agpr0_agpr1
+  ; GFX90A-SPILLED:   SI_SPILL_A64_SAVE killed $agpr0_agpr1, %stack.0, $sgpr32, 0, implicit $exec :: (store 8 into %stack.0, align 4, addrspace 5)
+  ; GFX90A-SPILLED:   S_CBRANCH_SCC1 %bb.1, implicit undef $scc
+  ; GFX90A-SPILLED: bb.1:
+  ; GFX90A-SPILLED:   successors: %bb.2(0x80000000)
+  ; GFX90A-SPILLED:   S_NOP 1
+  ; GFX90A-SPILLED: bb.2:
+  ; GFX90A-SPILLED:   $agpr0_agpr1 = SI_SPILL_A64_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load 8 from %stack.0, align 4, addrspace 5)
+  ; GFX90A-SPILLED:   S_NOP 0, implicit killed renamable $agpr0_agpr1
+  ; GFX90A-EXPANDED-LABEL: name: spill_restore_agpr64
+  ; GFX90A-EXPANDED: bb.0:
+  ; GFX90A-EXPANDED:   successors: %bb.1(0x80000000)
+  ; GFX90A-EXPANDED:   liveins: $vgpr0, $vgpr1
+  ; GFX90A-EXPANDED:   S_NOP 0, implicit-def renamable $agpr0_agpr1
+  ; GFX90A-EXPANDED:   $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit-def $agpr0_agpr1, implicit $agpr0_agpr1
+  ; GFX90A-EXPANDED:   $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec, implicit killed $agpr0_agpr1
+  ; GFX90A-EXPANDED:   S_CBRANCH_SCC1 %bb.1, implicit undef $scc
+  ; GFX90A-EXPANDED: bb.1:
+  ; GFX90A-EXPANDED:   successors: %bb.2(0x80000000)
+  ; GFX90A-EXPANDED:   liveins: $vgpr0, $vgpr1
+  ; GFX90A-EXPANDED:   S_NOP 1
+  ; GFX90A-EXPANDED: bb.2:
+  ; GFX90A-EXPANDED:   liveins: $vgpr0, $vgpr1
+  ; GFX90A-EXPANDED:   $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1
+  ; GFX90A-EXPANDED:   $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit-def $agpr0_agpr1
+  ; GFX90A-EXPANDED:   S_NOP 0, implicit killed renamable $agpr0_agpr1
   ; SPILLED-LABEL: name: spill_restore_agpr64
   ; SPILLED: bb.0:
   ; SPILLED:   successors: %bb.1(0x80000000)
@@ -108,6 +234,122 @@ machineFunctionInfo:
   scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
   stackPtrOffsetReg: $sgpr32
 body: |
+  ; GFX908-SPILLED-LABEL: name: spill_restore_agpr32_used_all_vgprs
+  ; GFX908-SPILLED: bb.0:
+  ; GFX908-SPILLED:   successors: %bb.1(0x80000000)
+  ; GFX908-SPILLED:   liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251_vgpr252_vgpr253_vgpr254_vgpr255
+  ; GFX908-SPILLED:   S_NOP 0, implicit-def renamable $agpr0
+  ; GFX908-SPILLED:   SI_SPILL_A32_SAVE killed $agpr0, %stack.0, $sgpr32, 0, implicit $exec :: (store 4 into %stack.0, addrspace 5)
+  ; GFX908-SPILLED:   S_CBRANCH_SCC1 %bb.1, implicit undef $scc
+  ; GFX908-SPILLED: bb.1:
+  ; GFX908-SPILLED:   successors: %bb.2(0x80000000)
+  ; GFX908-SPILLED: bb.2:
+  ; GFX908-SPILLED:   $agpr0 = SI_SPILL_A32_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load 4 from %stack.0, addrspace 5)
+  ; GFX908-SPILLED:   S_NOP 0, implicit undef $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+  ; GFX908-SPILLED:   S_NOP 0, implicit undef $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+  ; GFX908-SPILLED:   S_NOP 0, implicit undef $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
+  ; GFX908-SPILLED:   S_NOP 0, implicit undef $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79
+  ; GFX908-SPILLED:   S_NOP 0, implicit undef $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95
+  ; GFX908-SPILLED:   S_NOP 0, implicit undef $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111
+  ; GFX908-SPILLED:   S_NOP 0, implicit undef $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127
+  ; GFX908-SPILLED:   S_NOP 0, implicit undef $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143
+  ; GFX908-SPILLED:   S_NOP 0, implicit undef $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159
+  ; GFX908-SPILLED:   S_NOP 0, implicit undef $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175
+  ; GFX908-SPILLED:   S_NOP 0, implicit undef $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191
+  ; GFX908-SPILLED:   S_NOP 0, implicit undef $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207
+  ; GFX908-SPILLED:   S_NOP 0, implicit undef $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223
+  ; GFX908-SPILLED:   S_NOP 0, implicit undef $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239
+  ; GFX908-SPILLED:   S_NOP 0, implicit undef $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247
+  ; GFX908-SPILLED:   S_NOP 0, implicit undef $vgpr248_vgpr249_vgpr250_vgpr251_vgpr252_vgpr253_vgpr254_vgpr255
+  ; GFX908-SPILLED:   S_NOP 0, implicit killed renamable $agpr0
+  ; GFX908-EXPANDED-LABEL: name: spill_restore_agpr32_used_all_vgprs
+  ; GFX908-EXPANDED: bb.0:
+  ; GFX908-EXPANDED:   successors: %bb.1(0x80000000)
+  ; GFX908-EXPANDED:   liveins: $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251_vgpr252_vgpr253_vgpr254_vgpr255, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239
+  ; GFX908-EXPANDED:   S_NOP 0, implicit-def renamable $agpr0
+  ; GFX908-EXPANDED:   BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %stack.1, addrspace 5)
+  ; GFX908-EXPANDED:   $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec
+  ; GFX908-EXPANDED:   BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %stack.0, addrspace 5)
+  ; GFX908-EXPANDED:   $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.1, addrspace 5)
+  ; GFX908-EXPANDED:   S_CBRANCH_SCC1 %bb.1, implicit undef $scc
+  ; GFX908-EXPANDED: bb.1:
+  ; GFX908-EXPANDED:   successors: %bb.2(0x80000000)
+  ; GFX908-EXPANDED: bb.2:
+  ; GFX908-EXPANDED:   $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0, addrspace 5)
+  ; GFX908-EXPANDED:   $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec
+  ; GFX908-EXPANDED:   S_NOP 0, implicit undef $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+  ; GFX908-EXPANDED:   S_NOP 0, implicit undef $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+  ; GFX908-EXPANDED:   S_NOP 0, implicit undef $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
+  ; GFX908-EXPANDED:   S_NOP 0, implicit undef $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79
+  ; GFX908-EXPANDED:   S_NOP 0, implicit undef $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95
+  ; GFX908-EXPANDED:   S_NOP 0, implicit undef $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111
+  ; GFX908-EXPANDED:   S_NOP 0, implicit undef $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127
+  ; GFX908-EXPANDED:   S_NOP 0, implicit undef $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143
+  ; GFX908-EXPANDED:   S_NOP 0, implicit undef $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159
+  ; GFX908-EXPANDED:   S_NOP 0, implicit undef $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175
+  ; GFX908-EXPANDED:   S_NOP 0, implicit undef $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191
+  ; GFX908-EXPANDED:   S_NOP 0, implicit undef $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207
+  ; GFX908-EXPANDED:   S_NOP 0, implicit undef $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223
+  ; GFX908-EXPANDED:   S_NOP 0, implicit undef $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239
+  ; GFX908-EXPANDED:   S_NOP 0, implicit undef $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247
+  ; GFX908-EXPANDED:   S_NOP 0, implicit undef $vgpr248_vgpr249_vgpr250_vgpr251_vgpr252_vgpr253_vgpr254_vgpr255
+  ; GFX908-EXPANDED:   S_NOP 0, implicit killed renamable $agpr0
+  ; GFX90A-SPILLED-LABEL: name: spill_restore_agpr32_used_all_vgprs
+  ; GFX90A-SPILLED: bb.0:
+  ; GFX90A-SPILLED:   successors: %bb.1(0x80000000)
+  ; GFX90A-SPILLED:   liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251_vgpr252_vgpr253_vgpr254_vgpr255
+  ; GFX90A-SPILLED:   S_NOP 0, implicit-def renamable $agpr0
+  ; GFX90A-SPILLED:   SI_SPILL_A32_SAVE killed $agpr0, %stack.0, $sgpr32, 0, implicit $exec :: (store 4 into %stack.0, addrspace 5)
+  ; GFX90A-SPILLED:   S_CBRANCH_SCC1 %bb.1, implicit undef $scc
+  ; GFX90A-SPILLED: bb.1:
+  ; GFX90A-SPILLED:   successors: %bb.2(0x80000000)
+  ; GFX90A-SPILLED: bb.2:
+  ; GFX90A-SPILLED:   $agpr0 = SI_SPILL_A32_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load 4 from %stack.0, addrspace 5)
+  ; GFX90A-SPILLED:   S_NOP 0, implicit undef $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+  ; GFX90A-SPILLED:   S_NOP 0, implicit undef $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+  ; GFX90A-SPILLED:   S_NOP 0, implicit undef $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
+  ; GFX90A-SPILLED:   S_NOP 0, implicit undef $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79
+  ; GFX90A-SPILLED:   S_NOP 0, implicit undef $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95
+  ; GFX90A-SPILLED:   S_NOP 0, implicit undef $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111
+  ; GFX90A-SPILLED:   S_NOP 0, implicit undef $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127
+  ; GFX90A-SPILLED:   S_NOP 0, implicit undef $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143
+  ; GFX90A-SPILLED:   S_NOP 0, implicit undef $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159
+  ; GFX90A-SPILLED:   S_NOP 0, implicit undef $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175
+  ; GFX90A-SPILLED:   S_NOP 0, implicit undef $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191
+  ; GFX90A-SPILLED:   S_NOP 0, implicit undef $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207
+  ; GFX90A-SPILLED:   S_NOP 0, implicit undef $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223
+  ; GFX90A-SPILLED:   S_NOP 0, implicit undef $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239
+  ; GFX90A-SPILLED:   S_NOP 0, implicit undef $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247
+  ; GFX90A-SPILLED:   S_NOP 0, implicit undef $vgpr248_vgpr249_vgpr250_vgpr251_vgpr252_vgpr253_vgpr254_vgpr255
+  ; GFX90A-SPILLED:   S_NOP 0, implicit killed renamable $agpr0
+  ; GFX90A-EXPANDED-LABEL: name: spill_restore_agpr32_used_all_vgprs
+  ; GFX90A-EXPANDED: bb.0:
+  ; GFX90A-EXPANDED:   successors: %bb.1(0x80000000)
+  ; GFX90A-EXPANDED:   liveins: $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251_vgpr252_vgpr253_vgpr254_vgpr255, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239
+  ; GFX90A-EXPANDED:   S_NOP 0, implicit-def renamable $agpr0
+  ; GFX90A-EXPANDED:   BUFFER_STORE_DWORD_OFFSET killed $agpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %stack.0, addrspace 5)
+  ; GFX90A-EXPANDED:   S_CBRANCH_SCC1 %bb.1, implicit undef $scc
+  ; GFX90A-EXPANDED: bb.1:
+  ; GFX90A-EXPANDED:   successors: %bb.2(0x80000000)
+  ; GFX90A-EXPANDED: bb.2:
+  ; GFX90A-EXPANDED:   $agpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0, addrspace 5)
+  ; GFX90A-EXPANDED:   S_NOP 0, implicit undef $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+  ; GFX90A-EXPANDED:   S_NOP 0, implicit undef $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+  ; GFX90A-EXPANDED:   S_NOP 0, implicit undef $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
+  ; GFX90A-EXPANDED:   S_NOP 0, implicit undef $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79
+  ; GFX90A-EXPANDED:   S_NOP 0, implicit undef $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95
+  ; GFX90A-EXPANDED:   S_NOP 0, implicit undef $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111
+  ; GFX90A-EXPANDED:   S_NOP 0, implicit undef $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127
+  ; GFX90A-EXPANDED:   S_NOP 0, implicit undef $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143
+  ; GFX90A-EXPANDED:   S_NOP 0, implicit undef $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159
+  ; GFX90A-EXPANDED:   S_NOP 0, implicit undef $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175
+  ; GFX90A-EXPANDED:   S_NOP 0, implicit undef $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191
+  ; GFX90A-EXPANDED:   S_NOP 0, implicit undef $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207
+  ; GFX90A-EXPANDED:   S_NOP 0, implicit undef $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223
+  ; GFX90A-EXPANDED:   S_NOP 0, implicit undef $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239
+  ; GFX90A-EXPANDED:   S_NOP 0, implicit undef $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247
+  ; GFX90A-EXPANDED:   S_NOP 0, implicit undef $vgpr248_vgpr249_vgpr250_vgpr251_vgpr252_vgpr253_vgpr254_vgpr255
+  ; GFX90A-EXPANDED:   S_NOP 0, implicit killed renamable $agpr0
   ; SPILLED-LABEL: name: spill_restore_agpr32_used_all_vgprs
   ; SPILLED: bb.0:
   ; SPILLED:   successors: %bb.1(0x80000000)
@@ -203,6 +445,68 @@ machineFunctionInfo:
   scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
   stackPtrOffsetReg: $sgpr32
 body: |
+  ; GFX908-SPILLED-LABEL: name: spill_restore_agpr96
+  ; GFX908-SPILLED: bb.0:
+  ; GFX908-SPILLED:   successors: %bb.1(0x80000000)
+  ; GFX908-SPILLED:   S_NOP 0, implicit-def renamable $agpr0_agpr1_agpr2
+  ; GFX908-SPILLED:   SI_SPILL_A96_SAVE killed $agpr0_agpr1_agpr2, %stack.0, $sgpr32, 0, implicit $exec :: (store 12 into %stack.0, align 4, addrspace 5)
+  ; GFX908-SPILLED:   S_CBRANCH_SCC1 %bb.1, implicit undef $scc
+  ; GFX908-SPILLED: bb.1:
+  ; GFX908-SPILLED:   successors: %bb.2(0x80000000)
+  ; GFX908-SPILLED:   S_NOP 1
+  ; GFX908-SPILLED: bb.2:
+  ; GFX908-SPILLED:   $agpr0_agpr1_agpr2 = SI_SPILL_A96_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load 12 from %stack.0, align 4, addrspace 5)
+  ; GFX908-SPILLED:   S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2
+  ; GFX908-EXPANDED-LABEL: name: spill_restore_agpr96
+  ; GFX908-EXPANDED: bb.0:
+  ; GFX908-EXPANDED:   successors: %bb.1(0x80000000)
+  ; GFX908-EXPANDED:   liveins: $vgpr0, $vgpr1, $vgpr2
+  ; GFX908-EXPANDED:   S_NOP 0, implicit-def renamable $agpr0_agpr1_agpr2
+  ; GFX908-EXPANDED:   $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2, implicit $agpr0_agpr1_agpr2
+  ; GFX908-EXPANDED:   $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2
+  ; GFX908-EXPANDED:   $vgpr2 = V_ACCVGPR_READ_B32_e64 killed $agpr2, implicit $exec, implicit killed $agpr0_agpr1_agpr2
+  ; GFX908-EXPANDED:   S_CBRANCH_SCC1 %bb.1, implicit undef $scc
+  ; GFX908-EXPANDED: bb.1:
+  ; GFX908-EXPANDED:   successors: %bb.2(0x80000000)
+  ; GFX908-EXPANDED:   liveins: $vgpr0, $vgpr1, $vgpr2
+  ; GFX908-EXPANDED:   S_NOP 1
+  ; GFX908-EXPANDED: bb.2:
+  ; GFX908-EXPANDED:   liveins: $vgpr0, $vgpr1, $vgpr2
+  ; GFX908-EXPANDED:   $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2
+  ; GFX908-EXPANDED:   $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit-def $agpr0_agpr1_agpr2
+  ; GFX908-EXPANDED:   $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit-def $agpr0_agpr1_agpr2
+  ; GFX908-EXPANDED:   S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2
+  ; GFX90A-SPILLED-LABEL: name: spill_restore_agpr96
+  ; GFX90A-SPILLED: bb.0:
+  ; GFX90A-SPILLED:   successors: %bb.1(0x80000000)
+  ; GFX90A-SPILLED:   S_NOP 0, implicit-def renamable $agpr0_agpr1_agpr2
+  ; GFX90A-SPILLED:   SI_SPILL_A96_SAVE killed $agpr0_agpr1_agpr2, %stack.0, $sgpr32, 0, implicit $exec :: (store 12 into %stack.0, align 4, addrspace 5)
+  ; GFX90A-SPILLED:   S_CBRANCH_SCC1 %bb.1, implicit undef $scc
+  ; GFX90A-SPILLED: bb.1:
+  ; GFX90A-SPILLED:   successors: %bb.2(0x80000000)
+  ; GFX90A-SPILLED:   S_NOP 1
+  ; GFX90A-SPILLED: bb.2:
+  ; GFX90A-SPILLED:   $agpr0_agpr1_agpr2 = SI_SPILL_A96_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load 12 from %stack.0, align 4, addrspace 5)
+  ; GFX90A-SPILLED:   S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2
+  ; GFX90A-EXPANDED-LABEL: name: spill_restore_agpr96
+  ; GFX90A-EXPANDED: bb.0:
+  ; GFX90A-EXPANDED:   successors: %bb.1(0x80000000)
+  ; GFX90A-EXPANDED:   liveins: $vgpr0, $vgpr1, $vgpr2
+  ; GFX90A-EXPANDED:   S_NOP 0, implicit-def renamable $agpr0_agpr1_agpr2
+  ; GFX90A-EXPANDED:   $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2, implicit $agpr0_agpr1_agpr2
+  ; GFX90A-EXPANDED:   $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2
+  ; GFX90A-EXPANDED:   $vgpr2 = V_ACCVGPR_READ_B32_e64 killed $agpr2, implicit $exec, implicit killed $agpr0_agpr1_agpr2
+  ; GFX90A-EXPANDED:   S_CBRANCH_SCC1 %bb.1, implicit undef $scc
+  ; GFX90A-EXPANDED: bb.1:
+  ; GFX90A-EXPANDED:   successors: %bb.2(0x80000000)
+  ; GFX90A-EXPANDED:   liveins: $vgpr0, $vgpr1, $vgpr2
+  ; GFX90A-EXPANDED:   S_NOP 1
+  ; GFX90A-EXPANDED: bb.2:
+  ; GFX90A-EXPANDED:   liveins: $vgpr0, $vgpr1, $vgpr2
+  ; GFX90A-EXPANDED:   $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2
+  ; GFX90A-EXPANDED:   $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit-def $agpr0_agpr1_agpr2
+  ; GFX90A-EXPANDED:   $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit-def $agpr0_agpr1_agpr2
+  ; GFX90A-EXPANDED:   S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2
   ; SPILLED-LABEL: name: spill_restore_agpr96
   ; SPILLED: bb.0:
   ; SPILLED:   successors: %bb.1(0x80000000)
@@ -252,6 +556,72 @@ machineFunctionInfo:
   scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
   stackPtrOffsetReg: $sgpr32
 body: |
+  ; GFX908-SPILLED-LABEL: name: spill_restore_agpr128
+  ; GFX908-SPILLED: bb.0:
+  ; GFX908-SPILLED:   successors: %bb.1(0x80000000)
+  ; GFX908-SPILLED:   S_NOP 0, implicit-def renamable $agpr0_agpr1_agpr2_agpr3
+  ; GFX908-SPILLED:   SI_SPILL_A128_SAVE killed $agpr0_agpr1_agpr2_agpr3, %stack.0, $sgpr32, 0, implicit $exec :: (store 16 into %stack.0, align 4, addrspace 5)
+  ; GFX908-SPILLED:   S_CBRANCH_SCC1 %bb.1, implicit undef $scc
+  ; GFX908-SPILLED: bb.1:
+  ; GFX908-SPILLED:   successors: %bb.2(0x80000000)
+  ; GFX908-SPILLED:   S_NOP 1
+  ; GFX908-SPILLED: bb.2:
+  ; GFX908-SPILLED:   $agpr0_agpr1_agpr2_agpr3 = SI_SPILL_A128_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load 16 from %stack.0, align 4, addrspace 5)
+  ; GFX908-SPILLED:   S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3
+  ; GFX908-EXPANDED-LABEL: name: spill_restore_agpr128
+  ; GFX908-EXPANDED: bb.0:
+  ; GFX908-EXPANDED:   successors: %bb.1(0x80000000)
+  ; GFX908-EXPANDED:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+  ; GFX908-EXPANDED:   S_NOP 0, implicit-def renamable $agpr0_agpr1_agpr2_agpr3
+  ; GFX908-EXPANDED:   $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3, implicit $agpr0_agpr1_agpr2_agpr3
+  ; GFX908-EXPANDED:   $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3
+  ; GFX908-EXPANDED:   $vgpr2 = V_ACCVGPR_READ_B32_e64 killed $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3
+  ; GFX908-EXPANDED:   $vgpr3 = V_ACCVGPR_READ_B32_e64 killed $agpr3, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3
+  ; GFX908-EXPANDED:   S_CBRANCH_SCC1 %bb.1, implicit undef $scc
+  ; GFX908-EXPANDED: bb.1:
+  ; GFX908-EXPANDED:   successors: %bb.2(0x80000000)
+  ; GFX908-EXPANDED:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+  ; GFX908-EXPANDED:   S_NOP 1
+  ; GFX908-EXPANDED: bb.2:
+  ; GFX908-EXPANDED:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+  ; GFX908-EXPANDED:   $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3
+  ; GFX908-EXPANDED:   $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3
+  ; GFX908-EXPANDED:   $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3
+  ; GFX908-EXPANDED:   $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr3, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3
+  ; GFX908-EXPANDED:   S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3
+  ; GFX90A-SPILLED-LABEL: name: spill_restore_agpr128
+  ; GFX90A-SPILLED: bb.0:
+  ; GFX90A-SPILLED:   successors: %bb.1(0x80000000)
+  ; GFX90A-SPILLED:   S_NOP 0, implicit-def renamable $agpr0_agpr1_agpr2_agpr3
+  ; GFX90A-SPILLED:   SI_SPILL_A128_SAVE killed $agpr0_agpr1_agpr2_agpr3, %stack.0, $sgpr32, 0, implicit $exec :: (store 16 into %stack.0, align 4, addrspace 5)
+  ; GFX90A-SPILLED:   S_CBRANCH_SCC1 %bb.1, implicit undef $scc
+  ; GFX90A-SPILLED: bb.1:
+  ; GFX90A-SPILLED:   successors: %bb.2(0x80000000)
+  ; GFX90A-SPILLED:   S_NOP 1
+  ; GFX90A-SPILLED: bb.2:
+  ; GFX90A-SPILLED:   $agpr0_agpr1_agpr2_agpr3 = SI_SPILL_A128_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load 16 from %stack.0, align 4, addrspace 5)
+  ; GFX90A-SPILLED:   S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3
+  ; GFX90A-EXPANDED-LABEL: name: spill_restore_agpr128
+  ; GFX90A-EXPANDED: bb.0:
+  ; GFX90A-EXPANDED:   successors: %bb.1(0x80000000)
+  ; GFX90A-EXPANDED:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+  ; GFX90A-EXPANDED:   S_NOP 0, implicit-def renamable $agpr0_agpr1_agpr2_agpr3
+  ; GFX90A-EXPANDED:   $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3, implicit $agpr0_agpr1_agpr2_agpr3
+  ; GFX90A-EXPANDED:   $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3
+  ; GFX90A-EXPANDED:   $vgpr2 = V_ACCVGPR_READ_B32_e64 killed $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3
+  ; GFX90A-EXPANDED:   $vgpr3 = V_ACCVGPR_READ_B32_e64 killed $agpr3, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3
+  ; GFX90A-EXPANDED:   S_CBRANCH_SCC1 %bb.1, implicit undef $scc
+  ; GFX90A-EXPANDED: bb.1:
+  ; GFX90A-EXPANDED:   successors: %bb.2(0x80000000)
+  ; GFX90A-EXPANDED:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+  ; GFX90A-EXPANDED:   S_NOP 1
+  ; GFX90A-EXPANDED: bb.2:
+  ; GFX90A-EXPANDED:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+  ; GFX90A-EXPANDED:   $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3
+  ; GFX90A-EXPANDED:   $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3
+  ; GFX90A-EXPANDED:   $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3
+  ; GFX90A-EXPANDED:   $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr3, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3
+  ; GFX90A-EXPANDED:   S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3
   ; SPILLED-LABEL: name: spill_restore_agpr128
   ; SPILLED: bb.0:
   ; SPILLED:   successors: %bb.1(0x80000000)
@@ -303,6 +673,76 @@ machineFunctionInfo:
   scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
   stackPtrOffsetReg: $sgpr32
 body: |
+  ; GFX908-SPILLED-LABEL: name: spill_restore_agpr160
+  ; GFX908-SPILLED: bb.0:
+  ; GFX908-SPILLED:   successors: %bb.1(0x80000000)
+  ; GFX908-SPILLED:   S_NOP 0, implicit-def renamable $agpr0_agpr1_agpr2_agpr3_agpr4
+  ; GFX908-SPILLED:   SI_SPILL_A160_SAVE killed $agpr0_agpr1_agpr2_agpr3_agpr4, %stack.0, $sgpr32, 0, implicit $exec :: (store 20 into %stack.0, align 4, addrspace 5)
+  ; GFX908-SPILLED:   S_CBRANCH_SCC1 %bb.1, implicit undef $scc
+  ; GFX908-SPILLED: bb.1:
+  ; GFX908-SPILLED:   successors: %bb.2(0x80000000)
+  ; GFX908-SPILLED:   S_NOP 1
+  ; GFX908-SPILLED: bb.2:
+  ; GFX908-SPILLED:   $agpr0_agpr1_agpr2_agpr3_agpr4 = SI_SPILL_A160_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load 20 from %stack.0, align 4, addrspace 5)
+  ; GFX908-SPILLED:   S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4
+  ; GFX908-EXPANDED-LABEL: name: spill_restore_agpr160
+  ; GFX908-EXPANDED: bb.0:
+  ; GFX908-EXPANDED:   successors: %bb.1(0x80000000)
+  ; GFX908-EXPANDED:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+  ; GFX908-EXPANDED:   S_NOP 0, implicit-def renamable $agpr0_agpr1_agpr2_agpr3_agpr4
+  ; GFX908-EXPANDED:   $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4, implicit $agpr0_agpr1_agpr2_agpr3_agpr4
+  ; GFX908-EXPANDED:   $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4
+  ; GFX908-EXPANDED:   $vgpr2 = V_ACCVGPR_READ_B32_e64 killed $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4
+  ; GFX908-EXPANDED:   $vgpr3 = V_ACCVGPR_READ_B32_e64 killed $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4
+  ; GFX908-EXPANDED:   $vgpr4 = V_ACCVGPR_READ_B32_e64 killed $agpr4, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4
+  ; GFX908-EXPANDED:   S_CBRANCH_SCC1 %bb.1, implicit undef $scc
+  ; GFX908-EXPANDED: bb.1:
+  ; GFX908-EXPANDED:   successors: %bb.2(0x80000000)
+  ; GFX908-EXPANDED:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+  ; GFX908-EXPANDED:   S_NOP 1
+  ; GFX908-EXPANDED: bb.2:
+  ; GFX908-EXPANDED:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+  ; GFX908-EXPANDED:   $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4
+  ; GFX908-EXPANDED:   $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4
+  ; GFX908-EXPANDED:   $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4
+  ; GFX908-EXPANDED:   $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr3, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4
+  ; GFX908-EXPANDED:   $agpr4 = V_ACCVGPR_WRITE_B32_e64 $vgpr4, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4
+  ; GFX908-EXPANDED:   S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4
+  ; GFX90A-SPILLED-LABEL: name: spill_restore_agpr160
+  ; GFX90A-SPILLED: bb.0:
+  ; GFX90A-SPILLED:   successors: %bb.1(0x80000000)
+  ; GFX90A-SPILLED:   S_NOP 0, implicit-def renamable $agpr0_agpr1_agpr2_agpr3_agpr4
+  ; GFX90A-SPILLED:   SI_SPILL_A160_SAVE killed $agpr0_agpr1_agpr2_agpr3_agpr4, %stack.0, $sgpr32, 0, implicit $exec :: (store 20 into %stack.0, align 4, addrspace 5)
+  ; GFX90A-SPILLED:   S_CBRANCH_SCC1 %bb.1, implicit undef $scc
+  ; GFX90A-SPILLED: bb.1:
+  ; GFX90A-SPILLED:   successors: %bb.2(0x80000000)
+  ; GFX90A-SPILLED:   S_NOP 1
+  ; GFX90A-SPILLED: bb.2:
+  ; GFX90A-SPILLED:   $agpr0_agpr1_agpr2_agpr3_agpr4 = SI_SPILL_A160_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load 20 from %stack.0, align 4, addrspace 5)
+  ; GFX90A-SPILLED:   S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4
+  ; GFX90A-EXPANDED-LABEL: name: spill_restore_agpr160
+  ; GFX90A-EXPANDED: bb.0:
+  ; GFX90A-EXPANDED:   successors: %bb.1(0x80000000)
+  ; GFX90A-EXPANDED:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+  ; GFX90A-EXPANDED:   S_NOP 0, implicit-def renamable $agpr0_agpr1_agpr2_agpr3_agpr4
+  ; GFX90A-EXPANDED:   $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4, implicit $agpr0_agpr1_agpr2_agpr3_agpr4
+  ; GFX90A-EXPANDED:   $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4
+  ; GFX90A-EXPANDED:   $vgpr2 = V_ACCVGPR_READ_B32_e64 killed $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4
+  ; GFX90A-EXPANDED:   $vgpr3 = V_ACCVGPR_READ_B32_e64 killed $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4
+  ; GFX90A-EXPANDED:   $vgpr4 = V_ACCVGPR_READ_B32_e64 killed $agpr4, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4
+  ; GFX90A-EXPANDED:   S_CBRANCH_SCC1 %bb.1, implicit undef $scc
+  ; GFX90A-EXPANDED: bb.1:
+  ; GFX90A-EXPANDED:   successors: %bb.2(0x80000000)
+  ; GFX90A-EXPANDED:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+  ; GFX90A-EXPANDED:   S_NOP 1
+  ; GFX90A-EXPANDED: bb.2:
+  ; GFX90A-EXPANDED:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+  ; GFX90A-EXPANDED:   $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4
+  ; GFX90A-EXPANDED:   $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4
+  ; GFX90A-EXPANDED:   $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4
+  ; GFX90A-EXPANDED:   $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr3, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4
+  ; GFX90A-EXPANDED:   $agpr4 = V_ACCVGPR_WRITE_B32_e64 $vgpr4, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4
+  ; GFX90A-EXPANDED:   S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4
   ; SPILLED-LABEL: name: spill_restore_agpr160
   ; SPILLED: bb.0:
   ; SPILLED:   successors: %bb.1(0x80000000)
@@ -356,6 +796,80 @@ machineFunctionInfo:
   scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
   stackPtrOffsetReg: $sgpr32
 body: |
+  ; GFX908-SPILLED-LABEL: name: spill_restore_agpr192
+  ; GFX908-SPILLED: bb.0:
+  ; GFX908-SPILLED:   successors: %bb.1(0x80000000)
+  ; GFX908-SPILLED:   S_NOP 0, implicit-def renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5
+  ; GFX908-SPILLED:   SI_SPILL_A192_SAVE killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5, %stack.0, $sgpr32, 0, implicit $exec :: (store 24 into %stack.0, align 4, addrspace 5)
+  ; GFX908-SPILLED:   S_CBRANCH_SCC1 %bb.1, implicit undef $scc
+  ; GFX908-SPILLED: bb.1:
+  ; GFX908-SPILLED:   successors: %bb.2(0x80000000)
+  ; GFX908-SPILLED:   S_NOP 1
+  ; GFX908-SPILLED: bb.2:
+  ; GFX908-SPILLED:   $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 = SI_SPILL_A192_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load 24 from %stack.0, align 4, addrspace 5)
+  ; GFX908-SPILLED:   S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5
+  ; GFX908-EXPANDED-LABEL: name: spill_restore_agpr192
+  ; GFX908-EXPANDED: bb.0:
+  ; GFX908-EXPANDED:   successors: %bb.1(0x80000000)
+  ; GFX908-EXPANDED:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+  ; GFX908-EXPANDED:   S_NOP 0, implicit-def renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5
+  ; GFX908-EXPANDED:   $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5
+  ; GFX908-EXPANDED:   $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5
+  ; GFX908-EXPANDED:   $vgpr2 = V_ACCVGPR_READ_B32_e64 killed $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5
+  ; GFX908-EXPANDED:   $vgpr3 = V_ACCVGPR_READ_B32_e64 killed $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5
+  ; GFX908-EXPANDED:   $vgpr4 = V_ACCVGPR_READ_B32_e64 killed $agpr4, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5
+  ; GFX908-EXPANDED:   $vgpr5 = V_ACCVGPR_READ_B32_e64 killed $agpr5, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5
+  ; GFX908-EXPANDED:   S_CBRANCH_SCC1 %bb.1, implicit undef $scc
+  ; GFX908-EXPANDED: bb.1:
+  ; GFX908-EXPANDED:   successors: %bb.2(0x80000000)
+  ; GFX908-EXPANDED:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+  ; GFX908-EXPANDED:   S_NOP 1
+  ; GFX908-EXPANDED: bb.2:
+  ; GFX908-EXPANDED:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+  ; GFX908-EXPANDED:   $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5
+  ; GFX908-EXPANDED:   $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5
+  ; GFX908-EXPANDED:   $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5
+  ; GFX908-EXPANDED:   $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr3, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5
+  ; GFX908-EXPANDED:   $agpr4 = V_ACCVGPR_WRITE_B32_e64 $vgpr4, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5
+  ; GFX908-EXPANDED:   $agpr5 = V_ACCVGPR_WRITE_B32_e64 $vgpr5, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5
+  ; GFX908-EXPANDED:   S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5
+  ; GFX90A-SPILLED-LABEL: name: spill_restore_agpr192
+  ; GFX90A-SPILLED: bb.0:
+  ; GFX90A-SPILLED:   successors: %bb.1(0x80000000)
+  ; GFX90A-SPILLED:   S_NOP 0, implicit-def renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5
+  ; GFX90A-SPILLED:   SI_SPILL_A192_SAVE killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5, %stack.0, $sgpr32, 0, implicit $exec :: (store 24 into %stack.0, align 4, addrspace 5)
+  ; GFX90A-SPILLED:   S_CBRANCH_SCC1 %bb.1, implicit undef $scc
+  ; GFX90A-SPILLED: bb.1:
+  ; GFX90A-SPILLED:   successors: %bb.2(0x80000000)
+  ; GFX90A-SPILLED:   S_NOP 1
+  ; GFX90A-SPILLED: bb.2:
+  ; GFX90A-SPILLED:   $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 = SI_SPILL_A192_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load 24 from %stack.0, align 4, addrspace 5)
+  ; GFX90A-SPILLED:   S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5
+  ; GFX90A-EXPANDED-LABEL: name: spill_restore_agpr192
+  ; GFX90A-EXPANDED: bb.0:
+  ; GFX90A-EXPANDED:   successors: %bb.1(0x80000000)
+  ; GFX90A-EXPANDED:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+  ; GFX90A-EXPANDED:   S_NOP 0, implicit-def renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5
+  ; GFX90A-EXPANDED:   $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5
+  ; GFX90A-EXPANDED:   $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5
+  ; GFX90A-EXPANDED:   $vgpr2 = V_ACCVGPR_READ_B32_e64 killed $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5
+  ; GFX90A-EXPANDED:   $vgpr3 = V_ACCVGPR_READ_B32_e64 killed $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5
+  ; GFX90A-EXPANDED:   $vgpr4 = V_ACCVGPR_READ_B32_e64 killed $agpr4, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5
+  ; GFX90A-EXPANDED:   $vgpr5 = V_ACCVGPR_READ_B32_e64 killed $agpr5, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5
+  ; GFX90A-EXPANDED:   S_CBRANCH_SCC1 %bb.1, implicit undef $scc
+  ; GFX90A-EXPANDED: bb.1:
+  ; GFX90A-EXPANDED:   successors: %bb.2(0x80000000)
+  ; GFX90A-EXPANDED:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+  ; GFX90A-EXPANDED:   S_NOP 1
+  ; GFX90A-EXPANDED: bb.2:
+  ; GFX90A-EXPANDED:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+  ; GFX90A-EXPANDED:   $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5
+  ; GFX90A-EXPANDED:   $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5
+  ; GFX90A-EXPANDED:   $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5
+  ; GFX90A-EXPANDED:   $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr3, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5
+  ; GFX90A-EXPANDED:   $agpr4 = V_ACCVGPR_WRITE_B32_e64 $vgpr4, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5
+  ; GFX90A-EXPANDED:   $agpr5 = V_ACCVGPR_WRITE_B32_e64 $vgpr5, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5
+  ; GFX90A-EXPANDED:   S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5
   ; SPILLED-LABEL: name: spill_restore_agpr192
   ; SPILLED: bb.0:
   ; SPILLED:   successors: %bb.1(0x80000000)
@@ -411,6 +925,88 @@ machineFunctionInfo:
   scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
   stackPtrOffsetReg: $sgpr32
 body: |
+  ; GFX908-SPILLED-LABEL: name: spill_restore_agpr256
+  ; GFX908-SPILLED: bb.0:
+  ; GFX908-SPILLED:   successors: %bb.1(0x80000000)
+  ; GFX908-SPILLED:   S_NOP 0, implicit-def renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
+  ; GFX908-SPILLED:   SI_SPILL_A256_SAVE killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7, %stack.0, $sgpr32, 0, implicit $exec :: (store 32 into %stack.0, align 4, addrspace 5)
+  ; GFX908-SPILLED:   S_CBRANCH_SCC1 %bb.1, implicit undef $scc
+  ; GFX908-SPILLED: bb.1:
+  ; GFX908-SPILLED:   successors: %bb.2(0x80000000)
+  ; GFX908-SPILLED:   S_NOP 1
+  ; GFX908-SPILLED: bb.2:
+  ; GFX908-SPILLED:   $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 = SI_SPILL_A256_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load 32 from %stack.0, align 4, addrspace 5)
+  ; GFX908-SPILLED:   S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
+  ; GFX908-EXPANDED-LABEL: name: spill_restore_agpr256
+  ; GFX908-EXPANDED: bb.0:
+  ; GFX908-EXPANDED:   successors: %bb.1(0x80000000)
+  ; GFX908-EXPANDED:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7
+  ; GFX908-EXPANDED:   S_NOP 0, implicit-def renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
+  ; GFX908-EXPANDED:   $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
+  ; GFX908-EXPANDED:   $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
+  ; GFX908-EXPANDED:   $vgpr2 = V_ACCVGPR_READ_B32_e64 killed $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
+  ; GFX908-EXPANDED:   $vgpr3 = V_ACCVGPR_READ_B32_e64 killed $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
+  ; GFX908-EXPANDED:   $vgpr4 = V_ACCVGPR_READ_B32_e64 killed $agpr4, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
+  ; GFX908-EXPANDED:   $vgpr5 = V_ACCVGPR_READ_B32_e64 killed $agpr5, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
+  ; GFX908-EXPANDED:   $vgpr6 = V_ACCVGPR_READ_B32_e64 killed $agpr6, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
+  ; GFX908-EXPANDED:   $vgpr7 = V_ACCVGPR_READ_B32_e64 killed $agpr7, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
+  ; GFX908-EXPANDED:   S_CBRANCH_SCC1 %bb.1, implicit undef $scc
+  ; GFX908-EXPANDED: bb.1:
+  ; GFX908-EXPANDED:   successors: %bb.2(0x80000000)
+  ; GFX908-EXPANDED:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7
+  ; GFX908-EXPANDED:   S_NOP 1
+  ; GFX908-EXPANDED: bb.2:
+  ; GFX908-EXPANDED:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7
+  ; GFX908-EXPANDED:   $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
+  ; GFX908-EXPANDED:   $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
+  ; GFX908-EXPANDED:   $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
+  ; GFX908-EXPANDED:   $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr3, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
+  ; GFX908-EXPANDED:   $agpr4 = V_ACCVGPR_WRITE_B32_e64 $vgpr4, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
+  ; GFX908-EXPANDED:   $agpr5 = V_ACCVGPR_WRITE_B32_e64 $vgpr5, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
+  ; GFX908-EXPANDED:   $agpr6 = V_ACCVGPR_WRITE_B32_e64 $vgpr6, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
+  ; GFX908-EXPANDED:   $agpr7 = V_ACCVGPR_WRITE_B32_e64 $vgpr7, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
+  ; GFX908-EXPANDED:   S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
+  ; GFX90A-SPILLED-LABEL: name: spill_restore_agpr256
+  ; GFX90A-SPILLED: bb.0:
+  ; GFX90A-SPILLED:   successors: %bb.1(0x80000000)
+  ; GFX90A-SPILLED:   S_NOP 0, implicit-def renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
+  ; GFX90A-SPILLED:   SI_SPILL_A256_SAVE killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7, %stack.0, $sgpr32, 0, implicit $exec :: (store 32 into %stack.0, align 4, addrspace 5)
+  ; GFX90A-SPILLED:   S_CBRANCH_SCC1 %bb.1, implicit undef $scc
+  ; GFX90A-SPILLED: bb.1:
+  ; GFX90A-SPILLED:   successors: %bb.2(0x80000000)
+  ; GFX90A-SPILLED:   S_NOP 1
+  ; GFX90A-SPILLED: bb.2:
+  ; GFX90A-SPILLED:   $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 = SI_SPILL_A256_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load 32 from %stack.0, align 4, addrspace 5)
+  ; GFX90A-SPILLED:   S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
+  ; GFX90A-EXPANDED-LABEL: name: spill_restore_agpr256
+  ; GFX90A-EXPANDED: bb.0:
+  ; GFX90A-EXPANDED:   successors: %bb.1(0x80000000)
+  ; GFX90A-EXPANDED:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7
+  ; GFX90A-EXPANDED:   S_NOP 0, implicit-def renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
+  ; GFX90A-EXPANDED:   $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
+  ; GFX90A-EXPANDED:   $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
+  ; GFX90A-EXPANDED:   $vgpr2 = V_ACCVGPR_READ_B32_e64 killed $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
+  ; GFX90A-EXPANDED:   $vgpr3 = V_ACCVGPR_READ_B32_e64 killed $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
+  ; GFX90A-EXPANDED:   $vgpr4 = V_ACCVGPR_READ_B32_e64 killed $agpr4, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
+  ; GFX90A-EXPANDED:   $vgpr5 = V_ACCVGPR_READ_B32_e64 killed $agpr5, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
+  ; GFX90A-EXPANDED:   $vgpr6 = V_ACCVGPR_READ_B32_e64 killed $agpr6, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
+  ; GFX90A-EXPANDED:   $vgpr7 = V_ACCVGPR_READ_B32_e64 killed $agpr7, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
+  ; GFX90A-EXPANDED:   S_CBRANCH_SCC1 %bb.1, implicit undef $scc
+  ; GFX90A-EXPANDED: bb.1:
+  ; GFX90A-EXPANDED:   successors: %bb.2(0x80000000)
+  ; GFX90A-EXPANDED:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7
+  ; GFX90A-EXPANDED:   S_NOP 1
+  ; GFX90A-EXPANDED: bb.2:
+  ; GFX90A-EXPANDED:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7
+  ; GFX90A-EXPANDED:   $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
+  ; GFX90A-EXPANDED:   $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
+  ; GFX90A-EXPANDED:   $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
+  ; GFX90A-EXPANDED:   $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr3, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
+  ; GFX90A-EXPANDED:   $agpr4 = V_ACCVGPR_WRITE_B32_e64 $vgpr4, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
+  ; GFX90A-EXPANDED:   $agpr5 = V_ACCVGPR_WRITE_B32_e64 $vgpr5, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
+  ; GFX90A-EXPANDED:   $agpr6 = V_ACCVGPR_WRITE_B32_e64 $vgpr6, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
+  ; GFX90A-EXPANDED:   $agpr7 = V_ACCVGPR_WRITE_B32_e64 $vgpr7, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
+  ; GFX90A-EXPANDED:   S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
   ; SPILLED-LABEL: name: spill_restore_agpr256
   ; SPILLED: bb.0:
   ; SPILLED:   successors: %bb.1(0x80000000)
@@ -470,6 +1066,120 @@ machineFunctionInfo:
   scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
   stackPtrOffsetReg: $sgpr32
 body: |
+  ; GFX908-SPILLED-LABEL: name: spill_restore_agpr512
+  ; GFX908-SPILLED: bb.0:
+  ; GFX908-SPILLED:   successors: %bb.1(0x80000000)
+  ; GFX908-SPILLED:   S_NOP 0, implicit-def renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+  ; GFX908-SPILLED:   SI_SPILL_A512_SAVE killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, %stack.0, $sgpr32, 0, implicit $exec :: (store 64 into %stack.0, align 4, addrspace 5)
+  ; GFX908-SPILLED:   S_CBRANCH_SCC1 %bb.1, implicit undef $scc
+  ; GFX908-SPILLED: bb.1:
+  ; GFX908-SPILLED:   successors: %bb.2(0x80000000)
+  ; GFX908-SPILLED:   S_NOP 1
+  ; GFX908-SPILLED: bb.2:
+  ; GFX908-SPILLED:   $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = SI_SPILL_A512_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load 64 from %stack.0, align 4, addrspace 5)
+  ; GFX908-SPILLED:   S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+  ; GFX908-EXPANDED-LABEL: name: spill_restore_agpr512
+  ; GFX908-EXPANDED: bb.0:
+  ; GFX908-EXPANDED:   successors: %bb.1(0x80000000)
+  ; GFX908-EXPANDED:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15
+  ; GFX908-EXPANDED:   S_NOP 0, implicit-def renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+  ; GFX908-EXPANDED:   $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+  ; GFX908-EXPANDED:   $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+  ; GFX908-EXPANDED:   $vgpr2 = V_ACCVGPR_READ_B32_e64 killed $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+  ; GFX908-EXPANDED:   $vgpr3 = V_ACCVGPR_READ_B32_e64 killed $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+  ; GFX908-EXPANDED:   $vgpr4 = V_ACCVGPR_READ_B32_e64 killed $agpr4, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+  ; GFX908-EXPANDED:   $vgpr5 = V_ACCVGPR_READ_B32_e64 killed $agpr5, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+  ; GFX908-EXPANDED:   $vgpr6 = V_ACCVGPR_READ_B32_e64 killed $agpr6, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+  ; GFX908-EXPANDED:   $vgpr7 = V_ACCVGPR_READ_B32_e64 killed $agpr7, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+  ; GFX908-EXPANDED:   $vgpr8 = V_ACCVGPR_READ_B32_e64 killed $agpr8, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+  ; GFX908-EXPANDED:   $vgpr9 = V_ACCVGPR_READ_B32_e64 killed $agpr9, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+  ; GFX908-EXPANDED:   $vgpr10 = V_ACCVGPR_READ_B32_e64 killed $agpr10, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+  ; GFX908-EXPANDED:   $vgpr11 = V_ACCVGPR_READ_B32_e64 killed $agpr11, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+  ; GFX908-EXPANDED:   $vgpr12 = V_ACCVGPR_READ_B32_e64 killed $agpr12, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+  ; GFX908-EXPANDED:   $vgpr13 = V_ACCVGPR_READ_B32_e64 killed $agpr13, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+  ; GFX908-EXPANDED:   $vgpr14 = V_ACCVGPR_READ_B32_e64 killed $agpr14, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+  ; GFX908-EXPANDED:   $vgpr15 = V_ACCVGPR_READ_B32_e64 killed $agpr15, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+  ; GFX908-EXPANDED:   S_CBRANCH_SCC1 %bb.1, implicit undef $scc
+  ; GFX908-EXPANDED: bb.1:
+  ; GFX908-EXPANDED:   successors: %bb.2(0x80000000)
+  ; GFX908-EXPANDED:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15
+  ; GFX908-EXPANDED:   S_NOP 1
+  ; GFX908-EXPANDED: bb.2:
+  ; GFX908-EXPANDED:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15
+  ; GFX908-EXPANDED:   $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+  ; GFX908-EXPANDED:   $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+  ; GFX908-EXPANDED:   $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+  ; GFX908-EXPANDED:   $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr3, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+  ; GFX908-EXPANDED:   $agpr4 = V_ACCVGPR_WRITE_B32_e64 $vgpr4, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+  ; GFX908-EXPANDED:   $agpr5 = V_ACCVGPR_WRITE_B32_e64 $vgpr5, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+  ; GFX908-EXPANDED:   $agpr6 = V_ACCVGPR_WRITE_B32_e64 $vgpr6, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+  ; GFX908-EXPANDED:   $agpr7 = V_ACCVGPR_WRITE_B32_e64 $vgpr7, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+  ; GFX908-EXPANDED:   $agpr8 = V_ACCVGPR_WRITE_B32_e64 $vgpr8, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+  ; GFX908-EXPANDED:   $agpr9 = V_ACCVGPR_WRITE_B32_e64 $vgpr9, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+  ; GFX908-EXPANDED:   $agpr10 = V_ACCVGPR_WRITE_B32_e64 $vgpr10, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+  ; GFX908-EXPANDED:   $agpr11 = V_ACCVGPR_WRITE_B32_e64 $vgpr11, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+  ; GFX908-EXPANDED:   $agpr12 = V_ACCVGPR_WRITE_B32_e64 $vgpr12, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+  ; GFX908-EXPANDED:   $agpr13 = V_ACCVGPR_WRITE_B32_e64 $vgpr13, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+  ; GFX908-EXPANDED:   $agpr14 = V_ACCVGPR_WRITE_B32_e64 $vgpr14, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+  ; GFX908-EXPANDED:   $agpr15 = V_ACCVGPR_WRITE_B32_e64 $vgpr15, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+  ; GFX908-EXPANDED:   S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+  ; GFX90A-SPILLED-LABEL: name: spill_restore_agpr512
+  ; GFX90A-SPILLED: bb.0:
+  ; GFX90A-SPILLED:   successors: %bb.1(0x80000000)
+  ; GFX90A-SPILLED:   S_NOP 0, implicit-def renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+  ; GFX90A-SPILLED:   SI_SPILL_A512_SAVE killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, %stack.0, $sgpr32, 0, implicit $exec :: (store 64 into %stack.0, align 4, addrspace 5)
+  ; GFX90A-SPILLED:   S_CBRANCH_SCC1 %bb.1, implicit undef $scc
+  ; GFX90A-SPILLED: bb.1:
+  ; GFX90A-SPILLED:   successors: %bb.2(0x80000000)
+  ; GFX90A-SPILLED:   S_NOP 1
+  ; GFX90A-SPILLED: bb.2:
+  ; GFX90A-SPILLED:   $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = SI_SPILL_A512_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load 64 from %stack.0, align 4, addrspace 5)
+  ; GFX90A-SPILLED:   S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+  ; GFX90A-EXPANDED-LABEL: name: spill_restore_agpr512
+  ; GFX90A-EXPANDED: bb.0:
+  ; GFX90A-EXPANDED:   successors: %bb.1(0x80000000)
+  ; GFX90A-EXPANDED:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15
+  ; GFX90A-EXPANDED:   S_NOP 0, implicit-def renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+  ; GFX90A-EXPANDED:   $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+  ; GFX90A-EXPANDED:   $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+  ; GFX90A-EXPANDED:   $vgpr2 = V_ACCVGPR_READ_B32_e64 killed $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+  ; GFX90A-EXPANDED:   $vgpr3 = V_ACCVGPR_READ_B32_e64 killed $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+  ; GFX90A-EXPANDED:   $vgpr4 = V_ACCVGPR_READ_B32_e64 killed $agpr4, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+  ; GFX90A-EXPANDED:   $vgpr5 = V_ACCVGPR_READ_B32_e64 killed $agpr5, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+  ; GFX90A-EXPANDED:   $vgpr6 = V_ACCVGPR_READ_B32_e64 killed $agpr6, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+  ; GFX90A-EXPANDED:   $vgpr7 = V_ACCVGPR_READ_B32_e64 killed $agpr7, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+  ; GFX90A-EXPANDED:   $vgpr8 = V_ACCVGPR_READ_B32_e64 killed $agpr8, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+  ; GFX90A-EXPANDED:   $vgpr9 = V_ACCVGPR_READ_B32_e64 killed $agpr9, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+  ; GFX90A-EXPANDED:   $vgpr10 = V_ACCVGPR_READ_B32_e64 killed $agpr10, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+  ; GFX90A-EXPANDED:   $vgpr11 = V_ACCVGPR_READ_B32_e64 killed $agpr11, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+  ; GFX90A-EXPANDED:   $vgpr12 = V_ACCVGPR_READ_B32_e64 killed $agpr12, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+  ; GFX90A-EXPANDED:   $vgpr13 = V_ACCVGPR_READ_B32_e64 killed $agpr13, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+  ; GFX90A-EXPANDED:   $vgpr14 = V_ACCVGPR_READ_B32_e64 killed $agpr14, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+  ; GFX90A-EXPANDED:   $vgpr15 = V_ACCVGPR_READ_B32_e64 killed $agpr15, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+  ; GFX90A-EXPANDED:   S_CBRANCH_SCC1 %bb.1, implicit undef $scc
+  ; GFX90A-EXPANDED: bb.1:
+  ; GFX90A-EXPANDED:   successors: %bb.2(0x80000000)
+  ; GFX90A-EXPANDED:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15
+  ; GFX90A-EXPANDED:   S_NOP 1
+  ; GFX90A-EXPANDED: bb.2:
+  ; GFX90A-EXPANDED:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15
+  ; GFX90A-EXPANDED:   $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+  ; GFX90A-EXPANDED:   $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+  ; GFX90A-EXPANDED:   $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+  ; GFX90A-EXPANDED:   $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr3, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+  ; GFX90A-EXPANDED:   $agpr4 = V_ACCVGPR_WRITE_B32_e64 $vgpr4, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+  ; GFX90A-EXPANDED:   $agpr5 = V_ACCVGPR_WRITE_B32_e64 $vgpr5, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+  ; GFX90A-EXPANDED:   $agpr6 = V_ACCVGPR_WRITE_B32_e64 $vgpr6, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+  ; GFX90A-EXPANDED:   $agpr7 = V_ACCVGPR_WRITE_B32_e64 $vgpr7, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+  ; GFX90A-EXPANDED:   $agpr8 = V_ACCVGPR_WRITE_B32_e64 $vgpr8, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+  ; GFX90A-EXPANDED:   $agpr9 = V_ACCVGPR_WRITE_B32_e64 $vgpr9, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+  ; GFX90A-EXPANDED:   $agpr10 = V_ACCVGPR_WRITE_B32_e64 $vgpr10, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+  ; GFX90A-EXPANDED:   $agpr11 = V_ACCVGPR_WRITE_B32_e64 $vgpr11, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+  ; GFX90A-EXPANDED:   $agpr12 = V_ACCVGPR_WRITE_B32_e64 $vgpr12, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+  ; GFX90A-EXPANDED:   $agpr13 = V_ACCVGPR_WRITE_B32_e64 $vgpr13, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+  ; GFX90A-EXPANDED:   $agpr14 = V_ACCVGPR_WRITE_B32_e64 $vgpr14, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+  ; GFX90A-EXPANDED:   $agpr15 = V_ACCVGPR_WRITE_B32_e64 $vgpr15, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+  ; GFX90A-EXPANDED:   S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
   ; SPILLED-LABEL: name: spill_restore_agpr512
   ; SPILLED: bb.0:
   ; SPILLED:   successors: %bb.1(0x80000000)
@@ -545,6 +1255,184 @@ machineFunctionInfo:
   scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
   stackPtrOffsetReg: $sgpr32
 body: |
+  ; GFX908-SPILLED-LABEL: name: spill_restore_agpr1024
+  ; GFX908-SPILLED: bb.0:
+  ; GFX908-SPILLED:   successors: %bb.1(0x80000000)
+  ; GFX908-SPILLED:   S_NOP 0, implicit-def renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+  ; GFX908-SPILLED:   SI_SPILL_A1024_SAVE killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, %stack.0, $sgpr32, 0, implicit $exec :: (store 128 into %stack.0, align 4, addrspace 5)
+  ; GFX908-SPILLED:   S_CBRANCH_SCC1 %bb.1, implicit undef $scc
+  ; GFX908-SPILLED: bb.1:
+  ; GFX908-SPILLED:   successors: %bb.2(0x80000000)
+  ; GFX908-SPILLED:   S_NOP 1
+  ; GFX908-SPILLED: bb.2:
+  ; GFX908-SPILLED:   $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 = SI_SPILL_A1024_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load 128 from %stack.0, align 4, addrspace 5)
+  ; GFX908-SPILLED:   S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+  ; GFX908-EXPANDED-LABEL: name: spill_restore_agpr1024
+  ; GFX908-EXPANDED: bb.0:
+  ; GFX908-EXPANDED:   successors: %bb.1(0x80000000)
+  ; GFX908-EXPANDED:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31
+  ; GFX908-EXPANDED:   S_NOP 0, implicit-def renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+  ; GFX908-EXPANDED:   $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+  ; GFX908-EXPANDED:   $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+  ; GFX908-EXPANDED:   $vgpr2 = V_ACCVGPR_READ_B32_e64 killed $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+  ; GFX908-EXPANDED:   $vgpr3 = V_ACCVGPR_READ_B32_e64 killed $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+  ; GFX908-EXPANDED:   $vgpr4 = V_ACCVGPR_READ_B32_e64 killed $agpr4, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+  ; GFX908-EXPANDED:   $vgpr5 = V_ACCVGPR_READ_B32_e64 killed $agpr5, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+  ; GFX908-EXPANDED:   $vgpr6 = V_ACCVGPR_READ_B32_e64 killed $agpr6, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+  ; GFX908-EXPANDED:   $vgpr7 = V_ACCVGPR_READ_B32_e64 killed $agpr7, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+  ; GFX908-EXPANDED:   $vgpr8 = V_ACCVGPR_READ_B32_e64 killed $agpr8, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+  ; GFX908-EXPANDED:   $vgpr9 = V_ACCVGPR_READ_B32_e64 killed $agpr9, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+  ; GFX908-EXPANDED:   $vgpr10 = V_ACCVGPR_READ_B32_e64 killed $agpr10, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+  ; GFX908-EXPANDED:   $vgpr11 = V_ACCVGPR_READ_B32_e64 killed $agpr11, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+  ; GFX908-EXPANDED:   $vgpr12 = V_ACCVGPR_READ_B32_e64 killed $agpr12, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+  ; GFX908-EXPANDED:   $vgpr13 = V_ACCVGPR_READ_B32_e64 killed $agpr13, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+  ; GFX908-EXPANDED:   $vgpr14 = V_ACCVGPR_READ_B32_e64 killed $agpr14, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+  ; GFX908-EXPANDED:   $vgpr15 = V_ACCVGPR_READ_B32_e64 killed $agpr15, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+  ; GFX908-EXPANDED:   $vgpr16 = V_ACCVGPR_READ_B32_e64 killed $agpr16, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+  ; GFX908-EXPANDED:   $vgpr17 = V_ACCVGPR_READ_B32_e64 killed $agpr17, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+  ; GFX908-EXPANDED:   $vgpr18 = V_ACCVGPR_READ_B32_e64 killed $agpr18, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+  ; GFX908-EXPANDED:   $vgpr19 = V_ACCVGPR_READ_B32_e64 killed $agpr19, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+  ; GFX908-EXPANDED:   $vgpr20 = V_ACCVGPR_READ_B32_e64 killed $agpr20, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+  ; GFX908-EXPANDED:   $vgpr21 = V_ACCVGPR_READ_B32_e64 killed $agpr21, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+  ; GFX908-EXPANDED:   $vgpr22 = V_ACCVGPR_READ_B32_e64 killed $agpr22, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+  ; GFX908-EXPANDED:   $vgpr23 = V_ACCVGPR_READ_B32_e64 killed $agpr23, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+  ; GFX908-EXPANDED:   $vgpr24 = V_ACCVGPR_READ_B32_e64 killed $agpr24, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+  ; GFX908-EXPANDED:   $vgpr25 = V_ACCVGPR_READ_B32_e64 killed $agpr25, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+  ; GFX908-EXPANDED:   $vgpr26 = V_ACCVGPR_READ_B32_e64 killed $agpr26, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+  ; GFX908-EXPANDED:   $vgpr27 = V_ACCVGPR_READ_B32_e64 killed $agpr27, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+  ; GFX908-EXPANDED:   $vgpr28 = V_ACCVGPR_READ_B32_e64 killed $agpr28, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+  ; GFX908-EXPANDED:   $vgpr29 = V_ACCVGPR_READ_B32_e64 killed $agpr29, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+  ; GFX908-EXPANDED:   $vgpr30 = V_ACCVGPR_READ_B32_e64 killed $agpr30, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+  ; GFX908-EXPANDED:   $vgpr31 = V_ACCVGPR_READ_B32_e64 killed $agpr31, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+  ; GFX908-EXPANDED:   S_CBRANCH_SCC1 %bb.1, implicit undef $scc
+  ; GFX908-EXPANDED: bb.1:
+  ; GFX908-EXPANDED:   successors: %bb.2(0x80000000)
+  ; GFX908-EXPANDED:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31
+  ; GFX908-EXPANDED:   S_NOP 1
+  ; GFX908-EXPANDED: bb.2:
+  ; GFX908-EXPANDED:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31
+  ; GFX908-EXPANDED:   $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+  ; GFX908-EXPANDED:   $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+  ; GFX908-EXPANDED:   $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+  ; GFX908-EXPANDED:   $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr3, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+  ; GFX908-EXPANDED:   $agpr4 = V_ACCVGPR_WRITE_B32_e64 $vgpr4, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+  ; GFX908-EXPANDED:   $agpr5 = V_ACCVGPR_WRITE_B32_e64 $vgpr5, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+  ; GFX908-EXPANDED:   $agpr6 = V_ACCVGPR_WRITE_B32_e64 $vgpr6, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+  ; GFX908-EXPANDED:   $agpr7 = V_ACCVGPR_WRITE_B32_e64 $vgpr7, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+  ; GFX908-EXPANDED:   $agpr8 = V_ACCVGPR_WRITE_B32_e64 $vgpr8, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+  ; GFX908-EXPANDED:   $agpr9 = V_ACCVGPR_WRITE_B32_e64 $vgpr9, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+  ; GFX908-EXPANDED:   $agpr10 = V_ACCVGPR_WRITE_B32_e64 $vgpr10, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+  ; GFX908-EXPANDED:   $agpr11 = V_ACCVGPR_WRITE_B32_e64 $vgpr11, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+  ; GFX908-EXPANDED:   $agpr12 = V_ACCVGPR_WRITE_B32_e64 $vgpr12, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+  ; GFX908-EXPANDED:   $agpr13 = V_ACCVGPR_WRITE_B32_e64 $vgpr13, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+  ; GFX908-EXPANDED:   $agpr14 = V_ACCVGPR_WRITE_B32_e64 $vgpr14, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+  ; GFX908-EXPANDED:   $agpr15 = V_ACCVGPR_WRITE_B32_e64 $vgpr15, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+  ; GFX908-EXPANDED:   $agpr16 = V_ACCVGPR_WRITE_B32_e64 $vgpr16, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+  ; GFX908-EXPANDED:   $agpr17 = V_ACCVGPR_WRITE_B32_e64 $vgpr17, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+  ; GFX908-EXPANDED:   $agpr18 = V_ACCVGPR_WRITE_B32_e64 $vgpr18, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+  ; GFX908-EXPANDED:   $agpr19 = V_ACCVGPR_WRITE_B32_e64 $vgpr19, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+  ; GFX908-EXPANDED:   $agpr20 = V_ACCVGPR_WRITE_B32_e64 $vgpr20, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+  ; GFX908-EXPANDED:   $agpr21 = V_ACCVGPR_WRITE_B32_e64 $vgpr21, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+  ; GFX908-EXPANDED:   $agpr22 = V_ACCVGPR_WRITE_B32_e64 $vgpr22, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+  ; GFX908-EXPANDED:   $agpr23 = V_ACCVGPR_WRITE_B32_e64 $vgpr23, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+  ; GFX908-EXPANDED:   $agpr24 = V_ACCVGPR_WRITE_B32_e64 $vgpr24, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+  ; GFX908-EXPANDED:   $agpr25 = V_ACCVGPR_WRITE_B32_e64 $vgpr25, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+  ; GFX908-EXPANDED:   $agpr26 = V_ACCVGPR_WRITE_B32_e64 $vgpr26, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+  ; GFX908-EXPANDED:   $agpr27 = V_ACCVGPR_WRITE_B32_e64 $vgpr27, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+  ; GFX908-EXPANDED:   $agpr28 = V_ACCVGPR_WRITE_B32_e64 $vgpr28, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+  ; GFX908-EXPANDED:   $agpr29 = V_ACCVGPR_WRITE_B32_e64 $vgpr29, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+  ; GFX908-EXPANDED:   $agpr30 = V_ACCVGPR_WRITE_B32_e64 $vgpr30, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+  ; GFX908-EXPANDED:   $agpr31 = V_ACCVGPR_WRITE_B32_e64 $vgpr31, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+  ; GFX908-EXPANDED:   S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+  ; GFX90A-SPILLED-LABEL: name: spill_restore_agpr1024
+  ; GFX90A-SPILLED: bb.0:
+  ; GFX90A-SPILLED:   successors: %bb.1(0x80000000)
+  ; GFX90A-SPILLED:   S_NOP 0, implicit-def renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+  ; GFX90A-SPILLED:   SI_SPILL_A1024_SAVE killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, %stack.0, $sgpr32, 0, implicit $exec :: (store 128 into %stack.0, align 4, addrspace 5)
+  ; GFX90A-SPILLED:   S_CBRANCH_SCC1 %bb.1, implicit undef $scc
+  ; GFX90A-SPILLED: bb.1:
+  ; GFX90A-SPILLED:   successors: %bb.2(0x80000000)
+  ; GFX90A-SPILLED:   S_NOP 1
+  ; GFX90A-SPILLED: bb.2:
+  ; GFX90A-SPILLED:   $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 = SI_SPILL_A1024_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load 128 from %stack.0, align 4, addrspace 5)
+  ; GFX90A-SPILLED:   S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+  ; GFX90A-EXPANDED-LABEL: name: spill_restore_agpr1024
+  ; GFX90A-EXPANDED: bb.0:
+  ; GFX90A-EXPANDED:   successors: %bb.1(0x80000000)
+  ; GFX90A-EXPANDED:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31
+  ; GFX90A-EXPANDED:   S_NOP 0, implicit-def renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+  ; GFX90A-EXPANDED:   $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+  ; GFX90A-EXPANDED:   $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+  ; GFX90A-EXPANDED:   $vgpr2 = V_ACCVGPR_READ_B32_e64 killed $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+  ; GFX90A-EXPANDED:   $vgpr3 = V_ACCVGPR_READ_B32_e64 killed $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+  ; GFX90A-EXPANDED:   $vgpr4 = V_ACCVGPR_READ_B32_e64 killed $agpr4, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+  ; GFX90A-EXPANDED:   $vgpr5 = V_ACCVGPR_READ_B32_e64 killed $agpr5, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+  ; GFX90A-EXPANDED:   $vgpr6 = V_ACCVGPR_READ_B32_e64 killed $agpr6, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+  ; GFX90A-EXPANDED:   $vgpr7 = V_ACCVGPR_READ_B32_e64 killed $agpr7, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+  ; GFX90A-EXPANDED:   $vgpr8 = V_ACCVGPR_READ_B32_e64 killed $agpr8, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+  ; GFX90A-EXPANDED:   $vgpr9 = V_ACCVGPR_READ_B32_e64 killed $agpr9, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+  ; GFX90A-EXPANDED:   $vgpr10 = V_ACCVGPR_READ_B32_e64 killed $agpr10, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+  ; GFX90A-EXPANDED:   $vgpr11 = V_ACCVGPR_READ_B32_e64 killed $agpr11, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+  ; GFX90A-EXPANDED:   $vgpr12 = V_ACCVGPR_READ_B32_e64 killed $agpr12, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+  ; GFX90A-EXPANDED:   $vgpr13 = V_ACCVGPR_READ_B32_e64 killed $agpr13, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+  ; GFX90A-EXPANDED:   $vgpr14 = V_ACCVGPR_READ_B32_e64 killed $agpr14, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+  ; GFX90A-EXPANDED:   $vgpr15 = V_ACCVGPR_READ_B32_e64 killed $agpr15, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+  ; GFX90A-EXPANDED:   $vgpr16 = V_ACCVGPR_READ_B32_e64 killed $agpr16, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+  ; GFX90A-EXPANDED:   $vgpr17 = V_ACCVGPR_READ_B32_e64 killed $agpr17, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+  ; GFX90A-EXPANDED:   $vgpr18 = V_ACCVGPR_READ_B32_e64 killed $agpr18, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+  ; GFX90A-EXPANDED:   $vgpr19 = V_ACCVGPR_READ_B32_e64 killed $agpr19, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+  ; GFX90A-EXPANDED:   $vgpr20 = V_ACCVGPR_READ_B32_e64 killed $agpr20, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+  ; GFX90A-EXPANDED:   $vgpr21 = V_ACCVGPR_READ_B32_e64 killed $agpr21, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+  ; GFX90A-EXPANDED:   $vgpr22 = V_ACCVGPR_READ_B32_e64 killed $agpr22, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+  ; GFX90A-EXPANDED:   $vgpr23 = V_ACCVGPR_READ_B32_e64 killed $agpr23, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+  ; GFX90A-EXPANDED:   $vgpr24 = V_ACCVGPR_READ_B32_e64 killed $agpr24, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+  ; GFX90A-EXPANDED:   $vgpr25 = V_ACCVGPR_READ_B32_e64 killed $agpr25, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+  ; GFX90A-EXPANDED:   $vgpr26 = V_ACCVGPR_READ_B32_e64 killed $agpr26, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+  ; GFX90A-EXPANDED:   $vgpr27 = V_ACCVGPR_READ_B32_e64 killed $agpr27, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+  ; GFX90A-EXPANDED:   $vgpr28 = V_ACCVGPR_READ_B32_e64 killed $agpr28, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+  ; GFX90A-EXPANDED:   $vgpr29 = V_ACCVGPR_READ_B32_e64 killed $agpr29, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+  ; GFX90A-EXPANDED:   $vgpr30 = V_ACCVGPR_READ_B32_e64 killed $agpr30, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+  ; GFX90A-EXPANDED:   $vgpr31 = V_ACCVGPR_READ_B32_e64 killed $agpr31, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+  ; GFX90A-EXPANDED:   S_CBRANCH_SCC1 %bb.1, implicit undef $scc
+  ; GFX90A-EXPANDED: bb.1:
+  ; GFX90A-EXPANDED:   successors: %bb.2(0x80000000)
+  ; GFX90A-EXPANDED:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31
+  ; GFX90A-EXPANDED:   S_NOP 1
+  ; GFX90A-EXPANDED: bb.2:
+  ; GFX90A-EXPANDED:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31
+  ; GFX90A-EXPANDED:   $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+  ; GFX90A-EXPANDED:   $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+  ; GFX90A-EXPANDED:   $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+  ; GFX90A-EXPANDED:   $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr3, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+  ; GFX90A-EXPANDED:   $agpr4 = V_ACCVGPR_WRITE_B32_e64 $vgpr4, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+  ; GFX90A-EXPANDED:   $agpr5 = V_ACCVGPR_WRITE_B32_e64 $vgpr5, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+  ; GFX90A-EXPANDED:   $agpr6 = V_ACCVGPR_WRITE_B32_e64 $vgpr6, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+  ; GFX90A-EXPANDED:   $agpr7 = V_ACCVGPR_WRITE_B32_e64 $vgpr7, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+  ; GFX90A-EXPANDED:   $agpr8 = V_ACCVGPR_WRITE_B32_e64 $vgpr8, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+  ; GFX90A-EXPANDED:   $agpr9 = V_ACCVGPR_WRITE_B32_e64 $vgpr9, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+  ; GFX90A-EXPANDED:   $agpr10 = V_ACCVGPR_WRITE_B32_e64 $vgpr10, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+  ; GFX90A-EXPANDED:   $agpr11 = V_ACCVGPR_WRITE_B32_e64 $vgpr11, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+  ; GFX90A-EXPANDED:   $agpr12 = V_ACCVGPR_WRITE_B32_e64 $vgpr12, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+  ; GFX90A-EXPANDED:   $agpr13 = V_ACCVGPR_WRITE_B32_e64 $vgpr13, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+  ; GFX90A-EXPANDED:   $agpr14 = V_ACCVGPR_WRITE_B32_e64 $vgpr14, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+  ; GFX90A-EXPANDED:   $agpr15 = V_ACCVGPR_WRITE_B32_e64 $vgpr15, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+  ; GFX90A-EXPANDED:   $agpr16 = V_ACCVGPR_WRITE_B32_e64 $vgpr16, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+  ; GFX90A-EXPANDED:   $agpr17 = V_ACCVGPR_WRITE_B32_e64 $vgpr17, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+  ; GFX90A-EXPANDED:   $agpr18 = V_ACCVGPR_WRITE_B32_e64 $vgpr18, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+  ; GFX90A-EXPANDED:   $agpr19 = V_ACCVGPR_WRITE_B32_e64 $vgpr19, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+  ; GFX90A-EXPANDED:   $agpr20 = V_ACCVGPR_WRITE_B32_e64 $vgpr20, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+  ; GFX90A-EXPANDED:   $agpr21 = V_ACCVGPR_WRITE_B32_e64 $vgpr21, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+  ; GFX90A-EXPANDED:   $agpr22 = V_ACCVGPR_WRITE_B32_e64 $vgpr22, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+  ; GFX90A-EXPANDED:   $agpr23 = V_ACCVGPR_WRITE_B32_e64 $vgpr23, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+  ; GFX90A-EXPANDED:   $agpr24 = V_ACCVGPR_WRITE_B32_e64 $vgpr24, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+  ; GFX90A-EXPANDED:   $agpr25 = V_ACCVGPR_WRITE_B32_e64 $vgpr25, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+  ; GFX90A-EXPANDED:   $agpr26 = V_ACCVGPR_WRITE_B32_e64 $vgpr26, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+  ; GFX90A-EXPANDED:   $agpr27 = V_ACCVGPR_WRITE_B32_e64 $vgpr27, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+  ; GFX90A-EXPANDED:   $agpr28 = V_ACCVGPR_WRITE_B32_e64 $vgpr28, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+  ; GFX90A-EXPANDED:   $agpr29 = V_ACCVGPR_WRITE_B32_e64 $vgpr29, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+  ; GFX90A-EXPANDED:   $agpr30 = V_ACCVGPR_WRITE_B32_e64 $vgpr30, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+  ; GFX90A-EXPANDED:   $agpr31 = V_ACCVGPR_WRITE_B32_e64 $vgpr31, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+  ; GFX90A-EXPANDED:   S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
   ; SPILLED-LABEL: name: spill_restore_agpr1024
   ; SPILLED: bb.0:
   ; SPILLED:   successors: %bb.1(0x80000000)

diff  --git a/llvm/test/CodeGen/AMDGPU/spill-reg-tuple-super-reg-use.mir b/llvm/test/CodeGen/AMDGPU/spill-reg-tuple-super-reg-use.mir
index 13df02ec8bdb..bb0a55843399 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-reg-tuple-super-reg-use.mir
+++ b/llvm/test/CodeGen/AMDGPU/spill-reg-tuple-super-reg-use.mir
@@ -21,12 +21,18 @@ body:             |
 
     ; GCN-LABEL: name: spill_sgpr128_use_subreg
     ; GCN: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+    ; GCN: $sgpr8_sgpr9 = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def $scc, implicit $exec
+    ; GCN: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %stack.1, addrspace 5)
+    ; GCN: $exec = S_MOV_B64 killed $sgpr8_sgpr9
     ; GCN: renamable $sgpr1 = COPY $sgpr2
     ; GCN: $vgpr0 = V_WRITELANE_B32 $sgpr0, 0, $vgpr0, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr0_sgpr1_sgpr2_sgpr3
     ; GCN: $vgpr0 = V_WRITELANE_B32 $sgpr1, 1, $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3
     ; GCN: $vgpr0 = V_WRITELANE_B32 $sgpr2, 2, $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3
     ; GCN: $vgpr0 = V_WRITELANE_B32 $sgpr3, 3, $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3
     ; GCN: renamable $sgpr8 = COPY killed renamable $sgpr1
+    ; GCN: $sgpr0_sgpr1 = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def $scc, implicit $exec
+    ; GCN: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.1, addrspace 5)
+    ; GCN: $exec = S_MOV_B64 killed $sgpr0_sgpr1
     ; GCN: S_ENDPGM 0, implicit $sgpr8
     renamable $sgpr1 = COPY $sgpr2
     SI_SPILL_S128_SAVE renamable $sgpr0_sgpr1_sgpr2_sgpr3, %stack.0, implicit $exec, implicit $sgpr32 :: (store 16 into %stack.0, align 4, addrspace 5)
@@ -51,11 +57,17 @@ body:             |
 
     ; GCN-LABEL: name: spill_sgpr128_use_kill
     ; GCN: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+    ; GCN: $sgpr8_sgpr9 = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def $scc, implicit $exec
+    ; GCN: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %stack.1, addrspace 5)
+    ; GCN: $exec = S_MOV_B64 killed $sgpr8_sgpr9
     ; GCN: renamable $sgpr1 = COPY $sgpr2
     ; GCN: $vgpr0 = V_WRITELANE_B32 $sgpr0, 0, $vgpr0, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr0_sgpr1_sgpr2_sgpr3
     ; GCN: $vgpr0 = V_WRITELANE_B32 $sgpr1, 1, $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3
     ; GCN: $vgpr0 = V_WRITELANE_B32 $sgpr2, 2, $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3
     ; GCN: $vgpr0 = V_WRITELANE_B32 killed $sgpr3, 3, $vgpr0, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3
+    ; GCN: $sgpr0_sgpr1 = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def $scc, implicit $exec
+    ; GCN: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.1, addrspace 5)
+    ; GCN: $exec = S_MOV_B64 killed $sgpr0_sgpr1
     ; GCN: S_ENDPGM 0
     renamable $sgpr1 = COPY $sgpr2
     SI_SPILL_S128_SAVE renamable killed $sgpr0_sgpr1_sgpr2_sgpr3, %stack.0, implicit $exec, implicit $sgpr32 :: (store 16 into %stack.0, align 4, addrspace 5)
@@ -79,10 +91,10 @@ body:             |
     ; GCN-LABEL: name: spill_vgpr128_use_subreg
     ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7
     ; GCN: renamable $vgpr1 = COPY $vgpr2
-    ; GCN: BUFFER_STORE_DWORD_OFFSET $vgpr0, $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr0_vgpr1_vgpr2_vgpr3 :: (store 4 into %stack.0, addrspace 5)
-    ; GCN: BUFFER_STORE_DWORD_OFFSET $vgpr1, $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 4, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 :: (store 4 into %stack.0 + 4, addrspace 5)
-    ; GCN: BUFFER_STORE_DWORD_OFFSET $vgpr2, $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 8, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 :: (store 4 into %stack.0 + 8, addrspace 5)
-    ; GCN: BUFFER_STORE_DWORD_OFFSET $vgpr3, $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 12, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 :: (store 4 into %stack.0 + 12, addrspace 5)
+    ; GCN: BUFFER_STORE_DWORD_OFFSET $vgpr0, $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr0_vgpr1_vgpr2_vgpr3 :: (store 4 into %stack.0, addrspace 5)
+    ; GCN: BUFFER_STORE_DWORD_OFFSET $vgpr1, $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 :: (store 4 into %stack.0 + 4, addrspace 5)
+    ; GCN: BUFFER_STORE_DWORD_OFFSET $vgpr2, $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 8, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 :: (store 4 into %stack.0 + 8, addrspace 5)
+    ; GCN: BUFFER_STORE_DWORD_OFFSET $vgpr3, $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 12, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 :: (store 4 into %stack.0 + 12, addrspace 5)
     ; GCN: renamable $vgpr8 = COPY killed renamable $vgpr1
     ; GCN: S_ENDPGM 0, implicit $vgpr8
     renamable $vgpr1 = COPY $vgpr2
@@ -108,10 +120,10 @@ body:             |
     ; GCN-LABEL: name: spill_vgpr128_use_kill
     ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7
     ; GCN: renamable $vgpr1 = COPY $vgpr2
-    ; GCN: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr0_vgpr1_vgpr2_vgpr3 :: (store 4 into %stack.0, addrspace 5)
-    ; GCN: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 4, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 :: (store 4 into %stack.0 + 4, addrspace 5)
-    ; GCN: BUFFER_STORE_DWORD_OFFSET killed $vgpr2, $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 8, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 :: (store 4 into %stack.0 + 8, addrspace 5)
-    ; GCN: BUFFER_STORE_DWORD_OFFSET killed $vgpr3, $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 12, 0, 0, 0, 0, 0, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3 :: (store 4 into %stack.0 + 12, addrspace 5)
+    ; GCN: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr0_vgpr1_vgpr2_vgpr3 :: (store 4 into %stack.0, addrspace 5)
+    ; GCN: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 :: (store 4 into %stack.0 + 4, addrspace 5)
+    ; GCN: BUFFER_STORE_DWORD_OFFSET killed $vgpr2, $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 8, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 :: (store 4 into %stack.0 + 8, addrspace 5)
+    ; GCN: BUFFER_STORE_DWORD_OFFSET killed $vgpr3, $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 12, 0, 0, 0, 0, 0, 0, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3 :: (store 4 into %stack.0 + 12, addrspace 5)
     ; GCN: S_ENDPGM 0
     renamable $vgpr1 = COPY $vgpr2
     SI_SPILL_V128_SAVE renamable killed $vgpr0_vgpr1_vgpr2_vgpr3, %stack.0, $sgpr32, 0, implicit $exec :: (store 16 into %stack.0, align 4, addrspace 5)

diff  --git a/llvm/test/CodeGen/AMDGPU/spill-special-sgpr.mir b/llvm/test/CodeGen/AMDGPU/spill-special-sgpr.mir
index 16da59377eb7..7dcecc89048e 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-special-sgpr.mir
+++ b/llvm/test/CodeGen/AMDGPU/spill-special-sgpr.mir
@@ -50,7 +50,7 @@ body:             |
     ; GFX9: $vgpr0 = V_WRITELANE_B32 $vcc_hi, 1, $vgpr0, implicit $vcc
     ; GFX9: $vcc = S_MOV_B64 $exec
     ; GFX9: $exec = S_MOV_B64 3
-    ; GFX9: BUFFER_STORE_DWORD_OFFSET $vgpr0, $sgpr12_sgpr13_sgpr14_sgpr15, $sgpr33, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %stack.0, addrspace 5)
+    ; GFX9: BUFFER_STORE_DWORD_OFFSET $vgpr0, $sgpr12_sgpr13_sgpr14_sgpr15, $sgpr33, 4, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %stack.0, addrspace 5)
     ; GFX9: $exec = S_MOV_B64 $vcc
     ; GFX9: $vcc_hi = V_READLANE_B32 $vgpr0, 1
     ; GFX9: $vcc_lo = V_READLANE_B32 killed $vgpr0, 0
@@ -59,11 +59,11 @@ body:             |
     ; GFX9: $vgpr0 = V_WRITELANE_B32 $vcc_hi, 1, $vgpr0, implicit killed $vcc
     ; GFX9: $vcc = S_MOV_B64 $exec
     ; GFX9: $exec = S_MOV_B64 3
-    ; GFX9: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr12_sgpr13_sgpr14_sgpr15, $sgpr33, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %stack.0, addrspace 5)
+    ; GFX9: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr12_sgpr13_sgpr14_sgpr15, $sgpr33, 4, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %stack.0, addrspace 5)
     ; GFX9: $exec = S_MOV_B64 killed $vcc
     ; GFX9: $vcc = S_MOV_B64 $exec
     ; GFX9: $exec = S_MOV_B64 3
-    ; GFX9: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr12_sgpr13_sgpr14_sgpr15, $sgpr33, 4, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0, addrspace 5)
+    ; GFX9: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr12_sgpr13_sgpr14_sgpr15, $sgpr33, 4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0, addrspace 5)
     ; GFX9: $exec = S_MOV_B64 killed $vcc
     ; GFX9: $vcc_lo = V_READLANE_B32 $vgpr0, 0, implicit-def $vcc
     ; GFX9: $vcc_hi = V_READLANE_B32 killed $vgpr0, 1
@@ -81,7 +81,7 @@ body:             |
     ; GFX10: $vgpr0 = V_WRITELANE_B32 $vcc_hi, 1, $vgpr0, implicit $vcc
     ; GFX10: $vcc = S_MOV_B64 $exec
     ; GFX10: $exec = S_MOV_B64 3
-    ; GFX10: BUFFER_STORE_DWORD_OFFSET $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %stack.0, addrspace 5)
+    ; GFX10: BUFFER_STORE_DWORD_OFFSET $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 4, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %stack.0, addrspace 5)
     ; GFX10: $exec = S_MOV_B64 $vcc
     ; GFX10: $vcc_hi = V_READLANE_B32 $vgpr0, 1
     ; GFX10: $vcc_lo = V_READLANE_B32 killed $vgpr0, 0
@@ -90,11 +90,11 @@ body:             |
     ; GFX10: $vgpr0 = V_WRITELANE_B32 $vcc_hi, 1, $vgpr0, implicit killed $vcc
     ; GFX10: $vcc = S_MOV_B64 $exec
     ; GFX10: $exec = S_MOV_B64 3
-    ; GFX10: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %stack.0, addrspace 5)
+    ; GFX10: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 4, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %stack.0, addrspace 5)
     ; GFX10: $exec = S_MOV_B64 killed $vcc
     ; GFX10: $vcc = S_MOV_B64 $exec
     ; GFX10: $exec = S_MOV_B64 3
-    ; GFX10: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 4, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0, addrspace 5)
+    ; GFX10: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0, addrspace 5)
     ; GFX10: $exec = S_MOV_B64 killed $vcc
     ; GFX10: $vcc_lo = V_READLANE_B32 $vgpr0, 0, implicit-def $vcc
     ; GFX10: $vcc_hi = V_READLANE_B32 killed $vgpr0, 1

diff  --git a/llvm/test/CodeGen/AMDGPU/splitkit-copy-live-lanes.mir b/llvm/test/CodeGen/AMDGPU/splitkit-copy-live-lanes.mir
index b51104a169a0..e129067df593 100644
--- a/llvm/test/CodeGen/AMDGPU/splitkit-copy-live-lanes.mir
+++ b/llvm/test/CodeGen/AMDGPU/splitkit-copy-live-lanes.mir
@@ -24,10 +24,10 @@ body:             |
     ; CHECK: %3.sub2:sgpr_128 = COPY %2.sub2
     ; CHECK: %3.sub3:sgpr_128 = COPY %2.sub3
     ; CHECK: early-clobber %4:vreg_128, early-clobber %5:vreg_128, early-clobber %6:vreg_128, early-clobber %7:vreg_128 = BUNDLE %3, implicit $exec {
-    ; CHECK:   [[BUFFER_LOAD_DWORDX4_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 16, align 128, addrspace 1)
-    ; CHECK:   [[BUFFER_LOAD_DWORDX4_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %3, 0, 16, 0, 0, 0, 0, 0, implicit $exec :: (load 16, addrspace 1)
-    ; CHECK:   [[BUFFER_LOAD_DWORDX4_OFFSET2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %3, 0, 32, 0, 0, 0, 0, 0, implicit $exec :: (load 16, align 32, addrspace 1)
-    ; CHECK:   [[BUFFER_LOAD_DWORDX4_OFFSET3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %3, 0, 48, 0, 0, 0, 0, 0, implicit $exec :: (load 16, addrspace 1)
+    ; CHECK:   [[BUFFER_LOAD_DWORDX4_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 16, align 128, addrspace 1)
+    ; CHECK:   [[BUFFER_LOAD_DWORDX4_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %3, 0, 16, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 16, addrspace 1)
+    ; CHECK:   [[BUFFER_LOAD_DWORDX4_OFFSET2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %3, 0, 32, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 16, align 32, addrspace 1)
+    ; CHECK:   [[BUFFER_LOAD_DWORDX4_OFFSET3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %3, 0, 48, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 16, addrspace 1)
     ; CHECK: }
     ; CHECK: undef %47.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET]].sub1, implicit $exec
     ; CHECK: SI_SPILL_V128_SAVE %47, %stack.0, $sgpr32, 0, implicit $exec :: (store 16 into %stack.0, align 4, addrspace 5)
@@ -56,26 +56,26 @@ body:             |
     ; CHECK: undef %113.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET3]].sub3, implicit $exec
     ; CHECK: undef %117.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET3]].sub2, implicit $exec
     ; CHECK: SI_SPILL_V128_SAVE %117, %stack.10, $sgpr32, 0, implicit $exec :: (store 16 into %stack.10, align 4, addrspace 5)
-    ; CHECK: [[BUFFER_LOAD_DWORDX4_OFFSET4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %3, 0, 64, 0, 0, 0, 0, 0, implicit $exec :: (load 16, align 64, addrspace 1)
+    ; CHECK: [[BUFFER_LOAD_DWORDX4_OFFSET4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %3, 0, 64, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 16, align 64, addrspace 1)
     ; CHECK: undef %122.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET4]].sub1, implicit $exec
     ; CHECK: undef %126.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET4]].sub0, implicit $exec
     ; CHECK: undef %130.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET4]].sub3, implicit $exec
     ; CHECK: SI_SPILL_V128_SAVE %130, %stack.11, $sgpr32, 0, implicit $exec :: (store 16 into %stack.11, align 4, addrspace 5)
     ; CHECK: undef %135.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET4]].sub2, implicit $exec
     ; CHECK: SI_SPILL_V128_SAVE %135, %stack.12, $sgpr32, 0, implicit $exec :: (store 16 into %stack.12, align 4, addrspace 5)
-    ; CHECK: [[BUFFER_LOAD_DWORDX4_OFFSET5:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %3, 0, 80, 0, 0, 0, 0, 0, implicit $exec :: (load 16, addrspace 1)
+    ; CHECK: [[BUFFER_LOAD_DWORDX4_OFFSET5:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %3, 0, 80, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 16, addrspace 1)
     ; CHECK: undef %140.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET5]].sub1, implicit $exec
     ; CHECK: undef %144.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET5]].sub0, implicit $exec
     ; CHECK: SI_SPILL_V128_SAVE %144, %stack.13, $sgpr32, 0, implicit $exec :: (store 16 into %stack.13, align 4, addrspace 5)
     ; CHECK: undef %149.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET5]].sub3, implicit $exec
     ; CHECK: SI_SPILL_V128_SAVE %149, %stack.14, $sgpr32, 0, implicit $exec :: (store 16 into %stack.14, align 4, addrspace 5)
     ; CHECK: undef %154.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET5]].sub2, implicit $exec
-    ; CHECK: [[BUFFER_LOAD_DWORDX4_OFFSET6:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %3, 0, 96, 0, 0, 0, 0, 0, implicit $exec :: (load 16, align 32, addrspace 1)
+    ; CHECK: [[BUFFER_LOAD_DWORDX4_OFFSET6:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %3, 0, 96, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 16, align 32, addrspace 1)
     ; CHECK: undef %158.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET6]].sub1, implicit $exec
     ; CHECK: undef %36.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET6]].sub0, implicit $exec
     ; CHECK: undef %37.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET6]].sub3, implicit $exec
     ; CHECK: undef %38.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET6]].sub2, implicit $exec
-    ; CHECK: [[BUFFER_LOAD_DWORDX4_OFFSET7:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %3, 0, 112, 0, 0, 0, 0, 0, implicit $exec :: (load 16, addrspace 1)
+    ; CHECK: [[BUFFER_LOAD_DWORDX4_OFFSET7:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %3, 0, 112, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 16, addrspace 1)
     ; CHECK: undef %40.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET7]].sub1, implicit $exec
     ; CHECK: undef %41.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET7]].sub0, implicit $exec
     ; CHECK: undef %42.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET7]].sub3, implicit $exec
@@ -155,190 +155,190 @@ body:             |
     ; CHECK: %43.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET7]].sub2, implicit $exec
     ; CHECK: %43.sub1:vreg_128 = V_MOV_B32_e32 0, implicit $exec
     ; CHECK: %43.sub3:vreg_128 = COPY %43.sub1
-    ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %43, %2, 0, 480, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 32, addrspace 1)
+    ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %43, %2, 0, 480, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 32, addrspace 1)
     ; CHECK: %42.sub1:vreg_128 = COPY %43.sub1
     ; CHECK: %42.sub3:vreg_128 = COPY %43.sub1
-    ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %42, %2, 0, 496, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1)
+    ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %42, %2, 0, 496, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1)
     ; CHECK: %41.sub1:vreg_128 = COPY %43.sub1
     ; CHECK: %41.sub3:vreg_128 = COPY %43.sub1
-    ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %41, %2, 0, 448, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 64, addrspace 1)
+    ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %41, %2, 0, 448, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 64, addrspace 1)
     ; CHECK: %40.sub1:vreg_128 = COPY %43.sub1
     ; CHECK: %40.sub3:vreg_128 = COPY %43.sub1
-    ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %40, %2, 0, 464, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1)
+    ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %40, %2, 0, 464, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1)
     ; CHECK: %38.sub1:vreg_128 = COPY %43.sub1
     ; CHECK: %38.sub3:vreg_128 = COPY %43.sub1
-    ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %38, %2, 0, 416, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 32, addrspace 1)
+    ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %38, %2, 0, 416, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 32, addrspace 1)
     ; CHECK: %37.sub1:vreg_128 = COPY %43.sub1
     ; CHECK: %37.sub3:vreg_128 = COPY %43.sub1
-    ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %37, %2, 0, 432, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1)
+    ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %37, %2, 0, 432, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1)
     ; CHECK: %36.sub1:vreg_128 = COPY %43.sub1
     ; CHECK: %36.sub3:vreg_128 = COPY %43.sub1
-    ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %36, %2, 0, 384, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 128, addrspace 1)
+    ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %36, %2, 0, 384, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 128, addrspace 1)
     ; CHECK: undef %157.sub0:vreg_128 = COPY %159.sub0 {
     ; CHECK:   internal %157.sub2:vreg_128 = COPY %159.sub2
     ; CHECK: }
     ; CHECK: %157.sub1:vreg_128 = COPY %43.sub1
     ; CHECK: %157.sub3:vreg_128 = COPY %43.sub1
-    ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %157, %2, 0, 400, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1)
+    ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %157, %2, 0, 400, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1)
     ; CHECK: undef %153.sub0:vreg_128 = COPY %155.sub0 {
     ; CHECK:   internal %153.sub2:vreg_128 = COPY %155.sub2
     ; CHECK: }
     ; CHECK: %153.sub1:vreg_128 = COPY %43.sub1
     ; CHECK: %153.sub3:vreg_128 = COPY %43.sub1
-    ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %153, %2, 0, 352, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 32, addrspace 1)
+    ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %153, %2, 0, 352, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 32, addrspace 1)
     ; CHECK: [[SI_SPILL_V128_RESTORE15:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.14, $sgpr32, 0, implicit $exec :: (load 16 from %stack.14, align 4, addrspace 5)
     ; CHECK: undef %148.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE15]].sub0 {
     ; CHECK:   internal %148.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE15]].sub2
     ; CHECK: }
     ; CHECK: %148.sub1:vreg_128 = COPY %43.sub1
     ; CHECK: %148.sub3:vreg_128 = COPY %43.sub1
-    ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %148, %2, 0, 368, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1)
+    ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %148, %2, 0, 368, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1)
     ; CHECK: [[SI_SPILL_V128_RESTORE16:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.13, $sgpr32, 0, implicit $exec :: (load 16 from %stack.13, align 4, addrspace 5)
     ; CHECK: undef %143.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE16]].sub0 {
     ; CHECK:   internal %143.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE16]].sub2
     ; CHECK: }
     ; CHECK: %143.sub1:vreg_128 = COPY %43.sub1
     ; CHECK: %143.sub3:vreg_128 = COPY %43.sub1
-    ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %143, %2, 0, 320, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 64, addrspace 1)
+    ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %143, %2, 0, 320, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 64, addrspace 1)
     ; CHECK: undef %139.sub0:vreg_128 = COPY %141.sub0 {
     ; CHECK:   internal %139.sub2:vreg_128 = COPY %141.sub2
     ; CHECK: }
     ; CHECK: %139.sub1:vreg_128 = COPY %43.sub1
     ; CHECK: %139.sub3:vreg_128 = COPY %43.sub1
-    ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %139, %2, 0, 336, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1)
+    ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %139, %2, 0, 336, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1)
     ; CHECK: [[SI_SPILL_V128_RESTORE17:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.12, $sgpr32, 0, implicit $exec :: (load 16 from %stack.12, align 4, addrspace 5)
     ; CHECK: undef %134.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE17]].sub0 {
     ; CHECK:   internal %134.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE17]].sub2
     ; CHECK: }
     ; CHECK: %134.sub1:vreg_128 = COPY %43.sub1
     ; CHECK: %134.sub3:vreg_128 = COPY %43.sub1
-    ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %134, %2, 0, 288, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 32, addrspace 1)
+    ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %134, %2, 0, 288, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 32, addrspace 1)
     ; CHECK: [[SI_SPILL_V128_RESTORE18:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.11, $sgpr32, 0, implicit $exec :: (load 16 from %stack.11, align 4, addrspace 5)
     ; CHECK: undef %129.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE18]].sub0 {
     ; CHECK:   internal %129.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE18]].sub2
     ; CHECK: }
     ; CHECK: %129.sub1:vreg_128 = COPY %43.sub1
     ; CHECK: %129.sub3:vreg_128 = COPY %43.sub1
-    ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %129, %2, 0, 304, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1)
+    ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %129, %2, 0, 304, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1)
     ; CHECK: undef %125.sub0:vreg_128 = COPY %127.sub0 {
     ; CHECK:   internal %125.sub2:vreg_128 = COPY %127.sub2
     ; CHECK: }
     ; CHECK: %125.sub1:vreg_128 = COPY %43.sub1
     ; CHECK: %125.sub3:vreg_128 = COPY %43.sub1
-    ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %125, %2, 0, 256, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 256, addrspace 1)
+    ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %125, %2, 0, 256, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 256, addrspace 1)
     ; CHECK: undef %121.sub0:vreg_128 = COPY %123.sub0 {
     ; CHECK:   internal %121.sub2:vreg_128 = COPY %123.sub2
     ; CHECK: }
     ; CHECK: %121.sub1:vreg_128 = COPY %43.sub1
     ; CHECK: %121.sub3:vreg_128 = COPY %43.sub1
-    ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %121, %2, 0, 272, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1)
+    ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %121, %2, 0, 272, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1)
     ; CHECK: [[SI_SPILL_V128_RESTORE19:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.10, $sgpr32, 0, implicit $exec :: (load 16 from %stack.10, align 4, addrspace 5)
     ; CHECK: undef %116.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE19]].sub0 {
     ; CHECK:   internal %116.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE19]].sub2
     ; CHECK: }
     ; CHECK: %116.sub1:vreg_128 = COPY %43.sub1
     ; CHECK: %116.sub3:vreg_128 = COPY %43.sub1
-    ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %116, %2, 0, 224, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 32, addrspace 1)
+    ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %116, %2, 0, 224, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 32, addrspace 1)
     ; CHECK: undef %112.sub0:vreg_128 = COPY %114.sub0 {
     ; CHECK:   internal %112.sub2:vreg_128 = COPY %114.sub2
     ; CHECK: }
     ; CHECK: %112.sub1:vreg_128 = COPY %43.sub1
     ; CHECK: %112.sub3:vreg_128 = COPY %43.sub1
-    ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %112, %2, 0, 240, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1)
+    ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %112, %2, 0, 240, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1)
     ; CHECK: undef %108.sub0:vreg_128 = COPY %110.sub0 {
     ; CHECK:   internal %108.sub2:vreg_128 = COPY %110.sub2
     ; CHECK: }
     ; CHECK: %108.sub1:vreg_128 = COPY %43.sub1
     ; CHECK: %108.sub3:vreg_128 = COPY %43.sub1
-    ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %108, %2, 0, 192, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 64, addrspace 1)
+    ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %108, %2, 0, 192, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 64, addrspace 1)
     ; CHECK: undef %104.sub0:vreg_128 = COPY %106.sub0 {
     ; CHECK:   internal %104.sub2:vreg_128 = COPY %106.sub2
     ; CHECK: }
     ; CHECK: %104.sub1:vreg_128 = COPY %43.sub1
     ; CHECK: %104.sub3:vreg_128 = COPY %43.sub1
-    ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %104, %2, 0, 208, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1)
+    ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %104, %2, 0, 208, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1)
     ; CHECK: [[SI_SPILL_V128_RESTORE20:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.9, $sgpr32, 0, implicit $exec :: (load 16 from %stack.9, align 4, addrspace 5)
     ; CHECK: undef %99.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE20]].sub0 {
     ; CHECK:   internal %99.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE20]].sub2
     ; CHECK: }
     ; CHECK: %99.sub1:vreg_128 = COPY %43.sub1
     ; CHECK: %99.sub3:vreg_128 = COPY %43.sub1
-    ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %99, %2, 0, 160, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 32, addrspace 1)
+    ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %99, %2, 0, 160, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 32, addrspace 1)
     ; CHECK: [[SI_SPILL_V128_RESTORE21:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.8, $sgpr32, 0, implicit $exec :: (load 16 from %stack.8, align 4, addrspace 5)
     ; CHECK: undef %94.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE21]].sub0 {
     ; CHECK:   internal %94.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE21]].sub2
     ; CHECK: }
     ; CHECK: %94.sub1:vreg_128 = COPY %43.sub1
     ; CHECK: %94.sub3:vreg_128 = COPY %43.sub1
-    ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %94, %2, 0, 176, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1)
+    ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %94, %2, 0, 176, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1)
     ; CHECK: [[SI_SPILL_V128_RESTORE22:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.7, $sgpr32, 0, implicit $exec :: (load 16 from %stack.7, align 4, addrspace 5)
     ; CHECK: undef %89.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE22]].sub0 {
     ; CHECK:   internal %89.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE22]].sub2
     ; CHECK: }
     ; CHECK: %89.sub1:vreg_128 = COPY %43.sub1
     ; CHECK: %89.sub3:vreg_128 = COPY %43.sub1
-    ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %89, %2, 0, 128, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 128, addrspace 1)
+    ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %89, %2, 0, 128, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 128, addrspace 1)
     ; CHECK: undef %85.sub0:vreg_128 = COPY %87.sub0 {
     ; CHECK:   internal %85.sub2:vreg_128 = COPY %87.sub2
     ; CHECK: }
     ; CHECK: %85.sub1:vreg_128 = COPY %43.sub1
     ; CHECK: %85.sub3:vreg_128 = COPY %43.sub1
-    ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %85, %2, 0, 144, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1)
+    ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %85, %2, 0, 144, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1)
     ; CHECK: [[SI_SPILL_V128_RESTORE23:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.6, $sgpr32, 0, implicit $exec :: (load 16 from %stack.6, align 4, addrspace 5)
     ; CHECK: undef %80.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE23]].sub0 {
     ; CHECK:   internal %80.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE23]].sub2
     ; CHECK: }
     ; CHECK: %80.sub1:vreg_128 = COPY %43.sub1
     ; CHECK: %80.sub3:vreg_128 = COPY %43.sub1
-    ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %80, %2, 0, 96, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 32, addrspace 1)
+    ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %80, %2, 0, 96, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 32, addrspace 1)
     ; CHECK: [[SI_SPILL_V128_RESTORE24:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.5, $sgpr32, 0, implicit $exec :: (load 16 from %stack.5, align 4, addrspace 5)
     ; CHECK: undef %75.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE24]].sub0 {
     ; CHECK:   internal %75.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE24]].sub2
     ; CHECK: }
     ; CHECK: %75.sub1:vreg_128 = COPY %43.sub1
     ; CHECK: %75.sub3:vreg_128 = COPY %43.sub1
-    ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %75, %2, 0, 112, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1)
+    ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %75, %2, 0, 112, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1)
     ; CHECK: [[SI_SPILL_V128_RESTORE25:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.4, $sgpr32, 0, implicit $exec :: (load 16 from %stack.4, align 4, addrspace 5)
     ; CHECK: undef %70.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE25]].sub0 {
     ; CHECK:   internal %70.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE25]].sub2
     ; CHECK: }
     ; CHECK: %70.sub1:vreg_128 = COPY %43.sub1
     ; CHECK: %70.sub3:vreg_128 = COPY %43.sub1
-    ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %70, %2, 0, 64, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 64, addrspace 1)
+    ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %70, %2, 0, 64, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 64, addrspace 1)
     ; CHECK: undef %66.sub0:vreg_128 = COPY %68.sub0 {
     ; CHECK:   internal %66.sub2:vreg_128 = COPY %68.sub2
     ; CHECK: }
     ; CHECK: %66.sub1:vreg_128 = COPY %43.sub1
     ; CHECK: %66.sub3:vreg_128 = COPY %43.sub1
-    ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %66, %2, 0, 80, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1)
+    ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %66, %2, 0, 80, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1)
     ; CHECK: [[SI_SPILL_V128_RESTORE26:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.3, $sgpr32, 0, implicit $exec :: (load 16 from %stack.3, align 4, addrspace 5)
     ; CHECK: undef %61.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE26]].sub0 {
     ; CHECK:   internal %61.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE26]].sub2
     ; CHECK: }
     ; CHECK: %61.sub1:vreg_128 = COPY %43.sub1
     ; CHECK: %61.sub3:vreg_128 = COPY %43.sub1
-    ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %61, %2, 0, 32, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 32, addrspace 1)
+    ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %61, %2, 0, 32, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 32, addrspace 1)
     ; CHECK: [[SI_SPILL_V128_RESTORE27:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load 16 from %stack.2, align 4, addrspace 5)
     ; CHECK: undef %56.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE27]].sub0 {
     ; CHECK:   internal %56.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE27]].sub2
     ; CHECK: }
     ; CHECK: %56.sub1:vreg_128 = COPY %43.sub1
     ; CHECK: %56.sub3:vreg_128 = COPY %43.sub1
-    ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %56, %2, 0, 48, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1)
+    ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %56, %2, 0, 48, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1)
     ; CHECK: [[SI_SPILL_V128_RESTORE28:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load 16 from %stack.1, align 4, addrspace 5)
     ; CHECK: undef %51.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE28]].sub0 {
     ; CHECK:   internal %51.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE28]].sub2
     ; CHECK: }
     ; CHECK: %51.sub1:vreg_128 = COPY %43.sub1
     ; CHECK: %51.sub3:vreg_128 = COPY %43.sub1
-    ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %51, %2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 512, addrspace 1)
+    ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %51, %2, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 512, addrspace 1)
     ; CHECK: [[SI_SPILL_V128_RESTORE29:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load 16 from %stack.0, align 4, addrspace 5)
     ; CHECK: undef %46.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE29]].sub0 {
     ; CHECK:   internal %46.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE29]].sub2
     ; CHECK: }
     ; CHECK: %46.sub1:vreg_128 = COPY %43.sub1
     ; CHECK: %46.sub3:vreg_128 = COPY %43.sub1
-    ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %46, %2, 0, 16, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1)
+    ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %46, %2, 0, 16, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1)
     ; CHECK: S_ENDPGM 0
     %0:sgpr_64(p4) = COPY $sgpr0_sgpr1
     %1:sgpr_128 = S_LOAD_DWORDX4_IMM %0(p4), 9, 0, 0 :: (dereferenceable invariant load 16, align 4, addrspace 4)
@@ -351,10 +351,10 @@ body:             |
     %3.sub2:sgpr_128 = COPY %2.sub2
     %3.sub3:sgpr_128 = COPY %2.sub3
     early-clobber %4:vreg_128, early-clobber %5:vreg_128, early-clobber %6:vreg_128, early-clobber %7:vreg_128 = BUNDLE %3, implicit $exec {
-      %7:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 16, align 128, addrspace 1)
-      %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %3, 0, 16, 0, 0, 0, 0, 0, implicit $exec :: (load 16, addrspace 1)
-      %4:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %3, 0, 32, 0, 0, 0, 0, 0, implicit $exec :: (load 16, align 32, addrspace 1)
-      %6:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %3, 0, 48, 0, 0, 0, 0, 0, implicit $exec :: (load 16, addrspace 1)
+      %7:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 16, align 128, addrspace 1)
+      %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %3, 0, 16, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 16, addrspace 1)
+      %4:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %3, 0, 32, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 16, align 32, addrspace 1)
+      %6:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %3, 0, 48, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 16, addrspace 1)
     }
     undef %8.sub2:vreg_128 = V_LSHRREV_B32_e32 16, %7.sub1, implicit $exec
     undef %9.sub2:vreg_128 = V_LSHRREV_B32_e32 16, %7.sub0, implicit $exec
@@ -372,22 +372,22 @@ body:             |
     undef %21.sub2:vreg_128 = V_LSHRREV_B32_e32 16, %6.sub0, implicit $exec
     undef %22.sub2:vreg_128 = V_LSHRREV_B32_e32 16, %6.sub3, implicit $exec
     undef %23.sub2:vreg_128 = V_LSHRREV_B32_e32 16, %6.sub2, implicit $exec
-    %24:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %3, 0, 64, 0, 0, 0, 0, 0, implicit $exec :: (load 16, align 64, addrspace 1)
+    %24:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %3, 0, 64, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 16, align 64, addrspace 1)
     undef %25.sub2:vreg_128 = V_LSHRREV_B32_e32 16, %24.sub1, implicit $exec
     undef %26.sub2:vreg_128 = V_LSHRREV_B32_e32 16, %24.sub0, implicit $exec
     undef %27.sub2:vreg_128 = V_LSHRREV_B32_e32 16, %24.sub3, implicit $exec
     undef %28.sub2:vreg_128 = V_LSHRREV_B32_e32 16, %24.sub2, implicit $exec
-    %29:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %3, 0, 80, 0, 0, 0, 0, 0, implicit $exec :: (load 16, addrspace 1)
+    %29:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %3, 0, 80, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 16, addrspace 1)
     undef %30.sub2:vreg_128 = V_LSHRREV_B32_e32 16, %29.sub1, implicit $exec
     undef %31.sub2:vreg_128 = V_LSHRREV_B32_e32 16, %29.sub0, implicit $exec
     undef %32.sub2:vreg_128 = V_LSHRREV_B32_e32 16, %29.sub3, implicit $exec
     undef %33.sub2:vreg_128 = V_LSHRREV_B32_e32 16, %29.sub2, implicit $exec
-    %34:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %3, 0, 96, 0, 0, 0, 0, 0, implicit $exec :: (load 16, align 32, addrspace 1)
+    %34:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %3, 0, 96, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 16, align 32, addrspace 1)
     undef %35.sub2:vreg_128 = V_LSHRREV_B32_e32 16, %34.sub1, implicit $exec
     undef %36.sub2:vreg_128 = V_LSHRREV_B32_e32 16, %34.sub0, implicit $exec
     undef %37.sub2:vreg_128 = V_LSHRREV_B32_e32 16, %34.sub3, implicit $exec
     undef %38.sub2:vreg_128 = V_LSHRREV_B32_e32 16, %34.sub2, implicit $exec
-    %39:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %3, 0, 112, 0, 0, 0, 0, 0, implicit $exec :: (load 16, addrspace 1)
+    %39:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %3, 0, 112, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 16, addrspace 1)
     undef %40.sub2:vreg_128 = V_LSHRREV_B32_e32 16, %39.sub1, implicit $exec
     undef %41.sub2:vreg_128 = V_LSHRREV_B32_e32 16, %39.sub0, implicit $exec
     undef %42.sub2:vreg_128 = V_LSHRREV_B32_e32 16, %39.sub3, implicit $exec
@@ -427,99 +427,99 @@ body:             |
     %43.sub0:vreg_128 = V_AND_B32_e32 %44, %39.sub2, implicit $exec
     %43.sub1:vreg_128 = V_MOV_B32_e32 0, implicit $exec
     %43.sub3:vreg_128 = COPY %43.sub1
-    BUFFER_STORE_DWORDX4_OFFSET %43, %2, 0, 480, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 32, addrspace 1)
+    BUFFER_STORE_DWORDX4_OFFSET %43, %2, 0, 480, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 32, addrspace 1)
     %42.sub1:vreg_128 = COPY %43.sub1
     %42.sub3:vreg_128 = COPY %43.sub1
-    BUFFER_STORE_DWORDX4_OFFSET %42, %2, 0, 496, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1)
+    BUFFER_STORE_DWORDX4_OFFSET %42, %2, 0, 496, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1)
     %41.sub1:vreg_128 = COPY %43.sub1
     %41.sub3:vreg_128 = COPY %43.sub1
-    BUFFER_STORE_DWORDX4_OFFSET %41, %2, 0, 448, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 64, addrspace 1)
+    BUFFER_STORE_DWORDX4_OFFSET %41, %2, 0, 448, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 64, addrspace 1)
     %40.sub1:vreg_128 = COPY %43.sub1
     %40.sub3:vreg_128 = COPY %43.sub1
-    BUFFER_STORE_DWORDX4_OFFSET %40, %2, 0, 464, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1)
+    BUFFER_STORE_DWORDX4_OFFSET %40, %2, 0, 464, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1)
     %38.sub1:vreg_128 = COPY %43.sub1
     %38.sub3:vreg_128 = COPY %43.sub1
-    BUFFER_STORE_DWORDX4_OFFSET %38, %2, 0, 416, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 32, addrspace 1)
+    BUFFER_STORE_DWORDX4_OFFSET %38, %2, 0, 416, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 32, addrspace 1)
     %37.sub1:vreg_128 = COPY %43.sub1
     %37.sub3:vreg_128 = COPY %43.sub1
-    BUFFER_STORE_DWORDX4_OFFSET %37, %2, 0, 432, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1)
+    BUFFER_STORE_DWORDX4_OFFSET %37, %2, 0, 432, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1)
     %36.sub1:vreg_128 = COPY %43.sub1
     %36.sub3:vreg_128 = COPY %43.sub1
-    BUFFER_STORE_DWORDX4_OFFSET %36, %2, 0, 384, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 128, addrspace 1)
+    BUFFER_STORE_DWORDX4_OFFSET %36, %2, 0, 384, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 128, addrspace 1)
     %35.sub1:vreg_128 = COPY %43.sub1
     %35.sub3:vreg_128 = COPY %43.sub1
-    BUFFER_STORE_DWORDX4_OFFSET %35, %2, 0, 400, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1)
+    BUFFER_STORE_DWORDX4_OFFSET %35, %2, 0, 400, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1)
     %33.sub1:vreg_128 = COPY %43.sub1
     %33.sub3:vreg_128 = COPY %43.sub1
-    BUFFER_STORE_DWORDX4_OFFSET %33, %2, 0, 352, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 32, addrspace 1)
+    BUFFER_STORE_DWORDX4_OFFSET %33, %2, 0, 352, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 32, addrspace 1)
     %32.sub1:vreg_128 = COPY %43.sub1
     %32.sub3:vreg_128 = COPY %43.sub1
-    BUFFER_STORE_DWORDX4_OFFSET %32, %2, 0, 368, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1)
+    BUFFER_STORE_DWORDX4_OFFSET %32, %2, 0, 368, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1)
     %31.sub1:vreg_128 = COPY %43.sub1
     %31.sub3:vreg_128 = COPY %43.sub1
-    BUFFER_STORE_DWORDX4_OFFSET %31, %2, 0, 320, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 64, addrspace 1)
+    BUFFER_STORE_DWORDX4_OFFSET %31, %2, 0, 320, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 64, addrspace 1)
     %30.sub1:vreg_128 = COPY %43.sub1
     %30.sub3:vreg_128 = COPY %43.sub1
-    BUFFER_STORE_DWORDX4_OFFSET %30, %2, 0, 336, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1)
+    BUFFER_STORE_DWORDX4_OFFSET %30, %2, 0, 336, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1)
     %28.sub1:vreg_128 = COPY %43.sub1
     %28.sub3:vreg_128 = COPY %43.sub1
-    BUFFER_STORE_DWORDX4_OFFSET %28, %2, 0, 288, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 32, addrspace 1)
+    BUFFER_STORE_DWORDX4_OFFSET %28, %2, 0, 288, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 32, addrspace 1)
     %27.sub1:vreg_128 = COPY %43.sub1
     %27.sub3:vreg_128 = COPY %43.sub1
-    BUFFER_STORE_DWORDX4_OFFSET %27, %2, 0, 304, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1)
+    BUFFER_STORE_DWORDX4_OFFSET %27, %2, 0, 304, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1)
     %26.sub1:vreg_128 = COPY %43.sub1
     %26.sub3:vreg_128 = COPY %43.sub1
-    BUFFER_STORE_DWORDX4_OFFSET %26, %2, 0, 256, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 256, addrspace 1)
+    BUFFER_STORE_DWORDX4_OFFSET %26, %2, 0, 256, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 256, addrspace 1)
     %25.sub1:vreg_128 = COPY %43.sub1
     %25.sub3:vreg_128 = COPY %43.sub1
-    BUFFER_STORE_DWORDX4_OFFSET %25, %2, 0, 272, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1)
+    BUFFER_STORE_DWORDX4_OFFSET %25, %2, 0, 272, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1)
     %23.sub1:vreg_128 = COPY %43.sub1
     %23.sub3:vreg_128 = COPY %43.sub1
-    BUFFER_STORE_DWORDX4_OFFSET %23, %2, 0, 224, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 32, addrspace 1)
+    BUFFER_STORE_DWORDX4_OFFSET %23, %2, 0, 224, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 32, addrspace 1)
     %22.sub1:vreg_128 = COPY %43.sub1
     %22.sub3:vreg_128 = COPY %43.sub1
-    BUFFER_STORE_DWORDX4_OFFSET %22, %2, 0, 240, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1)
+    BUFFER_STORE_DWORDX4_OFFSET %22, %2, 0, 240, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1)
     %21.sub1:vreg_128 = COPY %43.sub1
     %21.sub3:vreg_128 = COPY %43.sub1
-    BUFFER_STORE_DWORDX4_OFFSET %21, %2, 0, 192, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 64, addrspace 1)
+    BUFFER_STORE_DWORDX4_OFFSET %21, %2, 0, 192, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 64, addrspace 1)
     %20.sub1:vreg_128 = COPY %43.sub1
     %20.sub3:vreg_128 = COPY %43.sub1
-    BUFFER_STORE_DWORDX4_OFFSET %20, %2, 0, 208, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1)
+    BUFFER_STORE_DWORDX4_OFFSET %20, %2, 0, 208, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1)
     %19.sub1:vreg_128 = COPY %43.sub1
     %19.sub3:vreg_128 = COPY %43.sub1
-    BUFFER_STORE_DWORDX4_OFFSET %19, %2, 0, 160, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 32, addrspace 1)
+    BUFFER_STORE_DWORDX4_OFFSET %19, %2, 0, 160, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 32, addrspace 1)
     %18.sub1:vreg_128 = COPY %43.sub1
     %18.sub3:vreg_128 = COPY %43.sub1
-    BUFFER_STORE_DWORDX4_OFFSET %18, %2, 0, 176, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1)
+    BUFFER_STORE_DWORDX4_OFFSET %18, %2, 0, 176, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1)
     %17.sub1:vreg_128 = COPY %43.sub1
     %17.sub3:vreg_128 = COPY %43.sub1
-    BUFFER_STORE_DWORDX4_OFFSET %17, %2, 0, 128, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 128, addrspace 1)
+    BUFFER_STORE_DWORDX4_OFFSET %17, %2, 0, 128, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 128, addrspace 1)
     %16.sub1:vreg_128 = COPY %43.sub1
     %16.sub3:vreg_128 = COPY %43.sub1
-    BUFFER_STORE_DWORDX4_OFFSET %16, %2, 0, 144, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1)
+    BUFFER_STORE_DWORDX4_OFFSET %16, %2, 0, 144, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1)
     %15.sub1:vreg_128 = COPY %43.sub1
     %15.sub3:vreg_128 = COPY %43.sub1
-    BUFFER_STORE_DWORDX4_OFFSET %15, %2, 0, 96, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 32, addrspace 1)
+    BUFFER_STORE_DWORDX4_OFFSET %15, %2, 0, 96, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 32, addrspace 1)
     %14.sub1:vreg_128 = COPY %43.sub1
     %14.sub3:vreg_128 = COPY %43.sub1
-    BUFFER_STORE_DWORDX4_OFFSET %14, %2, 0, 112, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1)
+    BUFFER_STORE_DWORDX4_OFFSET %14, %2, 0, 112, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1)
     %13.sub1:vreg_128 = COPY %43.sub1
     %13.sub3:vreg_128 = COPY %43.sub1
-    BUFFER_STORE_DWORDX4_OFFSET %13, %2, 0, 64, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 64, addrspace 1)
+    BUFFER_STORE_DWORDX4_OFFSET %13, %2, 0, 64, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 64, addrspace 1)
     %12.sub1:vreg_128 = COPY %43.sub1
     %12.sub3:vreg_128 = COPY %43.sub1
-    BUFFER_STORE_DWORDX4_OFFSET %12, %2, 0, 80, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1)
+    BUFFER_STORE_DWORDX4_OFFSET %12, %2, 0, 80, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1)
     %11.sub1:vreg_128 = COPY %43.sub1
     %11.sub3:vreg_128 = COPY %43.sub1
-    BUFFER_STORE_DWORDX4_OFFSET %11, %2, 0, 32, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 32, addrspace 1)
+    BUFFER_STORE_DWORDX4_OFFSET %11, %2, 0, 32, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 32, addrspace 1)
     %10.sub1:vreg_128 = COPY %43.sub1
     %10.sub3:vreg_128 = COPY %43.sub1
-    BUFFER_STORE_DWORDX4_OFFSET %10, %2, 0, 48, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1)
+    BUFFER_STORE_DWORDX4_OFFSET %10, %2, 0, 48, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1)
     %9.sub1:vreg_128 = COPY %43.sub1
     %9.sub3:vreg_128 = COPY %43.sub1
-    BUFFER_STORE_DWORDX4_OFFSET %9, %2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 512, addrspace 1)
+    BUFFER_STORE_DWORDX4_OFFSET %9, %2, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 512, addrspace 1)
     %8.sub1:vreg_128 = COPY %43.sub1
     %8.sub3:vreg_128 = COPY %43.sub1
-    BUFFER_STORE_DWORDX4_OFFSET %8, %2, 0, 16, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1)
+    BUFFER_STORE_DWORDX4_OFFSET %8, %2, 0, 16, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1)
     S_ENDPGM 0
 ...

diff  --git a/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll b/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll
index 4be88c912c3b..f477e1b12583 100644
--- a/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll
+++ b/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll
@@ -46,9 +46,9 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x
   ; CHECK:   %71.sub3:sgpr_128 = S_MOV_B32 553734060
   ; CHECK:   %71.sub2:sgpr_128 = S_MOV_B32 -1
   ; CHECK:   early-clobber %87:vgpr_32, early-clobber %117:vgpr_32, early-clobber %76:vgpr_32 = BUNDLE [[S_LOAD_DWORDX4_IMM1]], undef %118:sgpr_128, undef %89:sgpr_128, [[V_MOV_B32_e32_]], implicit $exec {
-  ; CHECK:     [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET undef %118:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
-  ; CHECK:     [[BUFFER_LOAD_FORMAT_X_IDXEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], undef %89:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
-  ; CHECK:     [[BUFFER_LOAD_FORMAT_X_IDXEN1:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM1]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:     [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET undef %118:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:     [[BUFFER_LOAD_FORMAT_X_IDXEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], undef %89:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:     [[BUFFER_LOAD_FORMAT_X_IDXEN1:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM1]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   }
   ; CHECK:   SI_SPILL_S128_SAVE %71, %stack.1, implicit $exec, implicit $sgpr32 :: (store 16 into %stack.1, align 4, addrspace 5)
   ; CHECK:   %71.sub1:sgpr_128 = S_MOV_B32 0
@@ -110,9 +110,9 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x
   ; CHECK:   [[S_LOAD_DWORDX4_IMM7:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %190, 0, 0, 0 :: (load 16 from %ir.123, addrspace 4)
   ; CHECK:   [[S_LOAD_DWORDX4_IMM8:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %200, 0, 0, 0 :: (load 16 from %ir.131, addrspace 4)
   ; CHECK:   [[S_LOAD_DWORDX4_IMM9:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %210, 0, 0, 0 :: (load 16 from %ir.138, addrspace 4)
-  ; CHECK:   [[BUFFER_LOAD_FORMAT_X_IDXEN2:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM2]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
-  ; CHECK:   [[BUFFER_LOAD_FORMAT_X_IDXEN3:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM3]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
-  ; CHECK:   [[BUFFER_LOAD_FORMAT_X_IDXEN4:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM4]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:   [[BUFFER_LOAD_FORMAT_X_IDXEN2:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM2]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:   [[BUFFER_LOAD_FORMAT_X_IDXEN3:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM3]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:   [[BUFFER_LOAD_FORMAT_X_IDXEN4:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM4]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   [[S_BUFFER_LOAD_DWORD_SGPR4:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR undef %364:sgpr_128, [[S_ADD_I32_]], 0, 0 :: (dereferenceable invariant load 4)
   ; CHECK:   [[S_BUFFER_LOAD_DWORD_SGPR5:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR undef %375:sgpr_128, [[S_ADD_I32_1]], 0, 0 :: (dereferenceable invariant load 4)
   ; CHECK:   [[S_ADD_I32_2:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR]], -98, implicit-def dead $scc
@@ -129,17 +129,17 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x
   ; CHECK:   %351.sub1:sreg_64 = S_ADDC_U32 undef %39:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc
   ; CHECK:   [[S_LSHL_B32_3:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY9]], 4, implicit-def dead $scc
   ; CHECK:   [[S_ADD_I32_6:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_LSHL_B32_3]], 16, implicit-def dead $scc
-  ; CHECK:   [[BUFFER_LOAD_FORMAT_X_IDXEN5:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM5]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:   [[BUFFER_LOAD_FORMAT_X_IDXEN5:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM5]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   [[S_BUFFER_LOAD_DWORD_SGPR6:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR undef %396:sgpr_128, [[S_ADD_I32_6]], 0, 0 :: (dereferenceable invariant load 4)
-  ; CHECK:   [[BUFFER_LOAD_FORMAT_X_IDXEN6:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM6]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:   [[BUFFER_LOAD_FORMAT_X_IDXEN6:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM6]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   [[S_LOAD_DWORDX4_IMM10:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %50, 224, 0, 0 :: (load 16 from %ir.155, addrspace 4)
   ; CHECK:   [[S_LOAD_DWORDX4_IMM11:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %217, 0, 0, 0 :: (load 16 from %ir.144, addrspace 4)
   ; CHECK:   [[S_LOAD_DWORDX4_IMM12:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %224, 0, 0, 0 :: (load 16 from %ir.150, addrspace 4)
-  ; CHECK:   [[BUFFER_LOAD_FORMAT_X_IDXEN7:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM7]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:   [[BUFFER_LOAD_FORMAT_X_IDXEN7:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM7]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   [[S_LOAD_DWORDX4_IMM13:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %241, 0, 0, 0 :: (load 16 from %ir.162, addrspace 4)
   ; CHECK:   [[S_LOAD_DWORDX4_IMM14:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %253, 0, 0, 0 :: (load 16 from %ir.170, addrspace 4)
-  ; CHECK:   [[BUFFER_LOAD_FORMAT_X_IDXEN8:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM8]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
-  ; CHECK:   [[BUFFER_LOAD_FORMAT_X_IDXEN9:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM9]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:   [[BUFFER_LOAD_FORMAT_X_IDXEN8:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM8]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:   [[BUFFER_LOAD_FORMAT_X_IDXEN9:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM9]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   [[S_ADD_I32_7:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR4]], -217, implicit-def dead $scc
   ; CHECK:   [[S_ADD_I32_8:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR3]], -233, implicit-def dead $scc
   ; CHECK:   [[S_ADD_I32_9:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR5]], -249, implicit-def dead $scc
@@ -153,48 +153,48 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x
   ; CHECK:   undef %411.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_3]], [[S_LSHL_B32_2]], implicit-def $scc
   ; CHECK:   %411.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_3]], [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc
   ; CHECK:   [[S_LSHL_B32_4:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY10]], 4, implicit-def dead $scc
-  ; CHECK:   [[BUFFER_LOAD_FORMAT_X_IDXEN10:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM11]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:   [[BUFFER_LOAD_FORMAT_X_IDXEN10:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM11]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   [[S_ASHR_I32_4:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_4]], 31, implicit-def dead $scc
   ; CHECK:   undef %425.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_3]], [[S_LSHL_B32_4]], implicit-def $scc
   ; CHECK:   %425.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_3]], [[S_ASHR_I32_4]], implicit-def dead $scc, implicit $scc
   ; CHECK:   [[S_ADD_U32_4:%[0-9]+]]:sreg_32 = S_ADD_U32 %56.sub0, 168, implicit-def $scc
   ; CHECK:   [[S_ADDC_U32_4:%[0-9]+]]:sreg_32 = S_ADDC_U32 undef %57:sreg_32, 0, implicit-def dead $scc, implicit $scc
   ; CHECK:   [[S_LSHL_B32_5:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY3]], 3, implicit-def dead $scc
-  ; CHECK:   [[BUFFER_LOAD_FORMAT_X_IDXEN11:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM12]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:   [[BUFFER_LOAD_FORMAT_X_IDXEN11:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM12]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   [[S_ASHR_I32_5:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_5]], 31, implicit-def dead $scc
   ; CHECK:   undef %441.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_4]], [[S_LSHL_B32_5]], implicit-def $scc
   ; CHECK:   %441.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_4]], [[S_ASHR_I32_5]], implicit-def dead $scc, implicit $scc
-  ; CHECK:   [[BUFFER_LOAD_FORMAT_X_IDXEN12:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM10]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:   [[BUFFER_LOAD_FORMAT_X_IDXEN12:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM10]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   %71.sub0:sgpr_128 = S_LOAD_DWORD_IMM %441, 0, 0, 0 :: (load 4 from %ir..i085.i, align 8, addrspace 4)
   ; CHECK:   [[S_LOAD_DWORDX4_IMM15:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %261, 0, 0, 0 :: (load 16 from %ir.176, addrspace 4)
-  ; CHECK:   [[BUFFER_LOAD_FORMAT_X_IDXEN13:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM13]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:   [[BUFFER_LOAD_FORMAT_X_IDXEN13:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM13]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   [[S_LOAD_DWORDX4_IMM16:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %273, 0, 0, 0 :: (load 16 from %ir.185, addrspace 4)
-  ; CHECK:   [[BUFFER_LOAD_FORMAT_X_IDXEN14:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM14]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:   [[BUFFER_LOAD_FORMAT_X_IDXEN14:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM14]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   [[S_LOAD_DWORDX4_IMM17:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %286, 0, 0, 0 :: (load 16 from %ir.194, addrspace 4)
   ; CHECK:   [[S_BUFFER_LOAD_DWORD_IMM4:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %71, 0, 0, 0 :: (dereferenceable invariant load 4)
-  ; CHECK:   [[BUFFER_LOAD_FORMAT_X_IDXEN15:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM15]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:   [[BUFFER_LOAD_FORMAT_X_IDXEN15:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM15]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   [[S_LOAD_DWORDX4_IMM18:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %293, 0, 0, 0 :: (load 16 from %ir.200, addrspace 4)
-  ; CHECK:   [[BUFFER_LOAD_FORMAT_X_IDXEN16:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM16]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
-  ; CHECK:   [[BUFFER_LOAD_DWORD_OFFSET1:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[S_LOAD_DWORDX4_IMM1]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:   [[BUFFER_LOAD_FORMAT_X_IDXEN16:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM16]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:   [[BUFFER_LOAD_DWORD_OFFSET1:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[S_LOAD_DWORDX4_IMM1]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   [[S_LSHL_B32_6:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY2]], 3, implicit-def dead $scc
-  ; CHECK:   [[BUFFER_LOAD_DWORD_OFFSET2:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[S_LOAD_DWORDX4_IMM17]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:   [[BUFFER_LOAD_DWORD_OFFSET2:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[S_LOAD_DWORDX4_IMM17]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   [[S_ASHR_I32_6:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_6]], 31, implicit-def dead $scc
   ; CHECK:   [[S_ADD_I32_15:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM4]], -467, implicit-def dead $scc
   ; CHECK:   undef %453.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_4]], [[S_LSHL_B32_6]], implicit-def $scc
   ; CHECK:   %453.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_4]], [[S_ASHR_I32_6]], implicit-def dead $scc, implicit $scc
   ; CHECK:   %71.sub0_sub1:sgpr_128 = S_LOAD_DWORDX2_IMM %453, 0, 0, 0 :: (load 8 from %ir.304, addrspace 4)
-  ; CHECK:   [[BUFFER_LOAD_DWORD_OFFSET3:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[S_LOAD_DWORDX4_IMM18]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:   [[BUFFER_LOAD_DWORD_OFFSET3:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[S_LOAD_DWORDX4_IMM18]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   [[S_LOAD_DWORDX4_IMM19:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %327, 0, 0, 0 :: (load 16 from %ir.223, addrspace 4)
   ; CHECK:   [[S_LOAD_DWORDX4_IMM20:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %335, 0, 0, 0 :: (load 16 from %ir.230, addrspace 4)
   ; CHECK:   [[S_LOAD_DWORDX4_IMM21:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %343, 0, 0, 0 :: (load 16 from %ir.236, addrspace 4)
   ; CHECK:   [[S_LOAD_DWORDX4_IMM22:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %351, 0, 0, 0 :: (load 16 from %ir.242, addrspace 4)
   ; CHECK:   %71.sub1:sgpr_128 = S_AND_B32 %71.sub1, [[S_MOV_B32_]], implicit-def dead $scc
   ; CHECK:   [[S_BUFFER_LOAD_DWORD_IMM5:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %71, 0, 0, 0 :: (dereferenceable invariant load 4)
-  ; CHECK:   [[BUFFER_LOAD_FORMAT_X_IDXEN17:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM19]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
-  ; CHECK:   [[BUFFER_LOAD_FORMAT_X_IDXEN18:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM20]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
-  ; CHECK:   [[BUFFER_LOAD_FORMAT_X_IDXEN19:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM21]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:   [[BUFFER_LOAD_FORMAT_X_IDXEN17:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM19]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:   [[BUFFER_LOAD_FORMAT_X_IDXEN18:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM20]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:   [[BUFFER_LOAD_FORMAT_X_IDXEN19:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM21]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   [[S_LSHL_B32_7:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY1]], 3, implicit-def dead $scc
-  ; CHECK:   [[BUFFER_LOAD_FORMAT_X_IDXEN20:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM22]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:   [[BUFFER_LOAD_FORMAT_X_IDXEN20:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM22]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   [[S_ASHR_I32_7:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_7]], 31, implicit-def dead $scc
   ; CHECK:   [[S_ADD_I32_16:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM5]], -468, implicit-def dead $scc
   ; CHECK:   undef %468.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_4]], [[S_LSHL_B32_7]], implicit-def $scc
@@ -214,8 +214,8 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x
   ; CHECK:   %485.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_4]], [[S_ASHR_I32_8]], implicit-def dead $scc, implicit $scc
   ; CHECK:   %71.sub0:sgpr_128 = S_LOAD_DWORD_IMM %485, 0, 0, 0 :: (load 4 from %ir..i0100.i, align 8, addrspace 4)
   ; CHECK:   early-clobber %413:vgpr_32, early-clobber %427:vgpr_32 = BUNDLE [[S_LOAD_DWORDX4_IMM23]], [[S_LOAD_DWORDX4_IMM24]], [[V_MOV_B32_e32_]], implicit $exec {
-  ; CHECK:     [[BUFFER_LOAD_FORMAT_X_IDXEN21:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM23]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
-  ; CHECK:     [[BUFFER_LOAD_FORMAT_X_IDXEN22:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM24]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:     [[BUFFER_LOAD_FORMAT_X_IDXEN21:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM23]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:     [[BUFFER_LOAD_FORMAT_X_IDXEN22:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM24]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   }
   ; CHECK:   %71.sub1:sgpr_128 = S_AND_B32 [[S_LOAD_DWORD_IMM]], [[S_MOV_B32_]], implicit-def dead $scc
   ; CHECK:   [[S_BUFFER_LOAD_DWORD_IMM7:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %71, 0, 0, 0 :: (dereferenceable invariant load 4)
@@ -238,9 +238,9 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x
   ; CHECK:   %530.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_5]], [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc
   ; CHECK:   [[S_LOAD_DWORDX4_IMM27:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %530, 0, 0, 0 :: (load 16 from %ir.359, addrspace 4)
   ; CHECK:   early-clobber %516:vgpr_32, early-clobber %532:vgpr_32, early-clobber %524:vgpr_32 = BUNDLE [[S_LOAD_DWORDX4_IMM26]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM25]], [[S_LOAD_DWORDX4_IMM27]], implicit $exec {
-  ; CHECK:     [[BUFFER_LOAD_FORMAT_X_IDXEN23:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM25]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
-  ; CHECK:     [[BUFFER_LOAD_FORMAT_X_IDXEN24:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM26]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
-  ; CHECK:     [[BUFFER_LOAD_FORMAT_X_IDXEN25:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM27]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:     [[BUFFER_LOAD_FORMAT_X_IDXEN23:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM25]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:     [[BUFFER_LOAD_FORMAT_X_IDXEN24:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM26]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
+  ; CHECK:     [[BUFFER_LOAD_FORMAT_X_IDXEN25:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM27]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   }
   ; CHECK:   [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 -2, [[BUFFER_LOAD_FORMAT_X_IDXEN]], implicit $exec
   ; CHECK:   [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 -1, [[BUFFER_LOAD_FORMAT_X_IDXEN1]], implicit $exec

diff  --git a/llvm/test/CodeGen/AMDGPU/stack-slot-color-sgpr-vgpr-spills.mir b/llvm/test/CodeGen/AMDGPU/stack-slot-color-sgpr-vgpr-spills.mir
index 7dd8a4092a69..09196b7f35a6 100644
--- a/llvm/test/CodeGen/AMDGPU/stack-slot-color-sgpr-vgpr-spills.mir
+++ b/llvm/test/CodeGen/AMDGPU/stack-slot-color-sgpr-vgpr-spills.mir
@@ -4,16 +4,10 @@
 # CHECK-LABEL: name: no_merge_sgpr_vgpr_spill_slot{{$}}
 # CHECK: stack:
 # CHECK:   - { id: 0, name: '', type: spill-slot, offset: 0, size: 4, alignment: 4,
-# CHECK-NEXT: stack-id: default,
-
-# CHECK: - { id: 1, name: '', type: spill-slot, offset: 0, size: 4, alignment: 4,
 # CHECK-NEXT: stack-id: sgpr-spill,
 
-# CHECK: SI_SPILL_V32_SAVE killed $vgpr0, %stack.0, $sgpr32, 0, implicit $exec :: (store 4 into %stack.0, addrspace 5)
-# CHECK: $vgpr0 = SI_SPILL_V32_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load 4 from %stack.0, addrspace 5)
-
-# CHECK: SI_SPILL_S32_SAVE killed renamable $sgpr5, %stack.1, implicit $exec, implicit $sgpr32 :: (store 4 into %stack.1, addrspace 5)
-# CHECK: $sgpr5 = SI_SPILL_S32_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load 4 from %stack.1, addrspace 5)
+# CHECK: SI_SPILL_S32_SAVE killed renamable $sgpr5, %stack.0, implicit $exec, implicit $sgpr32 :: (store 4 into %stack.0, addrspace 5)
+# CHECK: renamable $sgpr5 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sgpr32 :: (load 4 from %stack.0, addrspace 5)
 
 name: no_merge_sgpr_vgpr_spill_slot
 tracksRegLiveness: true
@@ -23,8 +17,8 @@ machineFunctionInfo:
   stackPtrOffsetReg: $sgpr32
 body: |
   bb.0:
-    %0:vgpr_32 = FLAT_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, 0, 0, implicit $flat_scr, implicit $exec
-    %2:vgpr_32 = FLAT_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, 0, 0, implicit $flat_scr, implicit $exec
+    %2:vgpr_32 = FLAT_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $flat_scr, implicit $exec
+    %0:vgpr_32 = FLAT_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $flat_scr, implicit $exec
     S_NOP 0, implicit %0
     %1:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM undef $sgpr0_sgpr1, 0, 0, 0
     %3:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM undef $sgpr0_sgpr1, 0, 0, 0

diff  --git a/llvm/test/CodeGen/AMDGPU/subreg-split-live-in-error.mir b/llvm/test/CodeGen/AMDGPU/subreg-split-live-in-error.mir
index 7b6b658deb9a..53c6139d645b 100644
--- a/llvm/test/CodeGen/AMDGPU/subreg-split-live-in-error.mir
+++ b/llvm/test/CodeGen/AMDGPU/subreg-split-live-in-error.mir
@@ -97,7 +97,7 @@ body: |
     %11.sub5:sgpr_256 = COPY %11.sub0
     %11.sub6:sgpr_256 = COPY %11.sub0
     %11.sub7:sgpr_256 = COPY %11.sub0
-    %12:vreg_128 = IMAGE_SAMPLE_LZ_V4_V2 %9, %11, undef %13:sgpr_128, 15, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from constant-pool, addrspace 4)
+    %12:vreg_128 = IMAGE_SAMPLE_LZ_V4_V2 %9, %11, undef %13:sgpr_128, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from constant-pool, addrspace 4)
     %14:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
     %15:vreg_128 = IMPLICIT_DEF
     S_CBRANCH_SCC1 %bb.8, implicit undef $scc
@@ -268,7 +268,7 @@ body: |
     %62:vgpr_32 = V_MOV_B32_e32 1033100696, implicit $exec
     %63:vgpr_32 = V_MUL_F32_e32 1060575065, %15.sub1, implicit $mode, implicit $exec
     %63:vgpr_32 = V_MAC_F32_e32 1046066128, %15.sub0, %63, implicit $mode, implicit $exec
-    %64:vgpr_32 = IMAGE_LOAD_V1_V2 %60, %61, 1, -1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from constant-pool, addrspace 4)
+    %64:vgpr_32 = IMAGE_LOAD_V1_V2 %60, %61, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from constant-pool, addrspace 4)
     %64:vgpr_32 = V_MAC_F32_e32 target-flags(amdgpu-gotprel) 0, %51.sub0, %64, implicit $mode, implicit $exec
     %65:vgpr_32 = V_MUL_F32_e32 0, %64, implicit $mode, implicit $exec
     %66:vgpr_32 = V_MUL_F32_e32 0, %65, implicit $mode, implicit $exec

diff  --git a/llvm/test/CodeGen/AMDGPU/subvector-test.mir b/llvm/test/CodeGen/AMDGPU/subvector-test.mir
index 8fd27d1c62d3..fd6ddea87f8e 100644
--- a/llvm/test/CodeGen/AMDGPU/subvector-test.mir
+++ b/llvm/test/CodeGen/AMDGPU/subvector-test.mir
@@ -31,6 +31,6 @@ body:             |
 
   bb.2:
 
-    GLOBAL_STORE_DWORD %15, %16, 0, 0, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD %15, %16, 0, 0, 0, 0, 0, implicit $exec
     S_ENDPGM 0
 ...

diff  --git a/llvm/test/CodeGen/AMDGPU/syncscopes.ll b/llvm/test/CodeGen/AMDGPU/syncscopes.ll
index f9e08628c08f..5b5f9843d449 100644
--- a/llvm/test/CodeGen/AMDGPU/syncscopes.ll
+++ b/llvm/test/CodeGen/AMDGPU/syncscopes.ll
@@ -1,9 +1,9 @@
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -stop-after=si-insert-skips < %s | FileCheck --check-prefix=GCN %s
 
 ; GCN-LABEL: name: syncscopes
-; GCN: FLAT_STORE_DWORD killed renamable $vgpr1_vgpr2, killed renamable $vgpr0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store syncscope("agent") seq_cst 4 into %ir.agent_out)
-; GCN: FLAT_STORE_DWORD killed renamable $vgpr4_vgpr5, killed renamable $vgpr3, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store syncscope("workgroup") seq_cst 4 into %ir.workgroup_out)
-; GCN: FLAT_STORE_DWORD killed renamable $vgpr7_vgpr8, killed renamable $vgpr6, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store syncscope("wavefront") seq_cst 4 into %ir.wavefront_out)
+; GCN: FLAT_STORE_DWORD killed renamable $vgpr1_vgpr2, killed renamable $vgpr0, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store syncscope("agent") seq_cst 4 into %ir.agent_out)
+; GCN: FLAT_STORE_DWORD killed renamable $vgpr4_vgpr5, killed renamable $vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store syncscope("workgroup") seq_cst 4 into %ir.workgroup_out)
+; GCN: FLAT_STORE_DWORD killed renamable $vgpr7_vgpr8, killed renamable $vgpr6, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store syncscope("wavefront") seq_cst 4 into %ir.wavefront_out)
 define void @syncscopes(
     i32 %agent,
     i32* %agent_out,

diff  --git a/llvm/test/CodeGen/AMDGPU/tgsplit.ll b/llvm/test/CodeGen/AMDGPU/tgsplit.ll
new file mode 100644
index 000000000000..9480cf5790ac
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/tgsplit.ll
@@ -0,0 +1,11 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GCN,NOTGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit < %s | FileCheck -check-prefixes=GCN,TGSPLIT %s
+
+; GCN-LABEL: .amdhsa_kernel test
+; NOTGSPLIT: .amdhsa_tg_split 0
+; NOTGSPLIT: COMPUTE_PGM_RSRC3_GFX90A:TG_SPLIT: 0
+; TGSPLIT:   .amdhsa_tg_split 1
+; TGSPLIT:   COMPUTE_PGM_RSRC3_GFX90A:TG_SPLIT: 1
+define amdgpu_kernel void @test() {
+  ret void
+}

diff  --git a/llvm/test/CodeGen/AMDGPU/transform-block-with-return-to-epilog.ll b/llvm/test/CodeGen/AMDGPU/transform-block-with-return-to-epilog.ll
index c6bdba306fac..4bff6fdaad7b 100644
--- a/llvm/test/CodeGen/AMDGPU/transform-block-with-return-to-epilog.ll
+++ b/llvm/test/CodeGen/AMDGPU/transform-block-with-return-to-epilog.ll
@@ -25,7 +25,7 @@ define amdgpu_ps float @test_return_to_epilog_into_end_block(i32 inreg %a, float
   ; GCN: bb.2.else:
   ; GCN:   successors:
   ; GCN:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
-  ; GCN:   GLOBAL_STORE_DWORD undef renamable $vgpr0_vgpr1, killed renamable $vgpr0, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
+  ; GCN:   GLOBAL_STORE_DWORD undef renamable $vgpr0_vgpr1, killed renamable $vgpr0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
   ; GCN: bb.3:
 entry:
   %cc = icmp sgt i32 %a, 0
@@ -61,7 +61,7 @@ define amdgpu_ps float @test_unify_return_to_epilog_into_end_block(i32 inreg %a,
   ; GCN: bb.4.else:
   ; GCN:   successors:
   ; GCN:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
-  ; GCN:   GLOBAL_STORE_DWORD undef renamable $vgpr0_vgpr1, killed renamable $vgpr0, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
+  ; GCN:   GLOBAL_STORE_DWORD undef renamable $vgpr0_vgpr1, killed renamable $vgpr0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
   ; GCN: bb.5:
 entry:
   %cc = icmp sgt i32 %a, 0

diff  --git a/llvm/test/CodeGen/AMDGPU/twoaddr-fma-f64.mir b/llvm/test/CodeGen/AMDGPU/twoaddr-fma-f64.mir
new file mode 100644
index 000000000000..73d3fae9d120
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/twoaddr-fma-f64.mir
@@ -0,0 +1,186 @@
+# RUN: llc -march=amdgcn -mcpu=gfx90a %s -run-pass twoaddressinstruction -verify-machineinstrs -o - | FileCheck -check-prefix=GCN %s
+
+# GCN-LABEL: name: test_fmamk_reg_imm_f64
+# GCN: V_FMA_F64_e64 0, killed %0, 0, %2, 0, killed %1, 0, 0, implicit $mode, implicit $exec
+---
+name:            test_fmamk_reg_imm_f64
+registers:
+  - { id: 0, class: vreg_64 }
+  - { id: 1, class: vreg_64 }
+  - { id: 2, class: vreg_64 }
+  - { id: 3, class: vreg_64 }
+body:             |
+  bb.0:
+
+    %0 = IMPLICIT_DEF
+    %1 = COPY %0
+    %2 = V_MOV_B64_PSEUDO 4607182418800017408, implicit $exec
+    %3 = V_FMAC_F64_e32 killed %0, %2, killed %1, implicit $mode, implicit $exec
+
+...
+
+# GCN-LABEL: name: test_fmamk_imm_reg_f64
+# GCN: V_FMA_F64_e64 0, %2, 0, killed %0.sub0_sub1, 0, killed %1, 0, 0, implicit $mode, implicit $exec
+---
+name:            test_fmamk_imm_reg_f64
+registers:
+  - { id: 0, class: vreg_128 }
+  - { id: 1, class: vreg_64 }
+  - { id: 2, class: vreg_64 }
+  - { id: 3, class: vreg_64 }
+body:             |
+  bb.0:
+
+    %0 = IMPLICIT_DEF
+    %1 = COPY %0.sub2_sub3
+    %2 = V_MOV_B64_PSEUDO 4607182418800017408, implicit $exec
+    %3 = V_FMAC_F64_e32 %2, killed %0.sub0_sub1, killed %1, implicit $mode, implicit $exec
+
+...
+
+# GCN-LABEL: name: test_fmaak_f64
+# GCN: V_FMA_F64_e64 0, killed %0.sub0_sub1, 0, %0.sub2_sub3, 0, %1, 0, 0, implicit $mode, implicit $exec
+---
+name:            test_fmaak_f64
+registers:
+  - { id: 0, class: vreg_128 }
+  - { id: 1, class: vreg_64 }
+  - { id: 2, class: vreg_64 }
+body:             |
+  bb.0:
+
+    %0 = IMPLICIT_DEF
+    %1 = V_MOV_B64_PSEUDO 4607182418800017408, implicit $exec
+    %2 = V_FMAC_F64_e32 killed %0.sub0_sub1, %0.sub2_sub3, %1, implicit $mode, implicit $exec
+
+...
+
+# GCN-LABEL: name: test_fmaak_sgpr_src0_f64
+# GCN: V_FMA_F64_e64 0, killed %0, 0, %1, 0, %2:vreg_64, 0, 0, implicit $mode, implicit $exec
+
+---
+name:            test_fmaak_sgpr_src0_f64
+registers:
+  - { id: 0, class: sreg_64 }
+  - { id: 1, class: vreg_64 }
+  - { id: 2, class: vreg_64 }
+  - { id: 3, class: vreg_64 }
+body:             |
+  bb.0:
+
+    %0 = IMPLICIT_DEF
+    %1 = V_MOV_B64_PSEUDO 4607182418800017408, implicit $exec
+    %3 = V_FMAC_F64_e32 killed %0, %1, %2, implicit $mode, implicit $exec
+
+...
+
+# GCN-LABEL: name: test_fmaak_inlineimm_src0_f64
+# GCN: V_FMA_F64_e64 0, 4611686018427387904, 0, %0, 0, %1:vreg_64, 0, 0, implicit $mode, implicit $exec
+
+---
+name:            test_fmaak_inlineimm_src0_f64
+registers:
+  - { id: 0, class: vreg_64 }
+  - { id: 1, class: vreg_64 }
+  - { id: 2, class: vreg_64 }
+body:             |
+  bb.0:
+
+    %0 = V_MOV_B64_PSEUDO 4607182418800017408, implicit $exec
+    %2 = V_FMAC_F64_e32 4611686018427387904, %0, %1, implicit $mode, implicit $exec
+
+...
+
+# GCN-LABEL: name: test_fmaak_otherimm_src0_f64
+# GCN: V_FMA_F64_e64 0, 4611686018427387904, 0, %0, 0, %1:vreg_64, 0, 0, implicit $mode, implicit $exec
+
+---
+name:            test_fmaak_otherimm_src0_f64
+registers:
+  - { id: 0, class: vreg_64 }
+  - { id: 1, class: vreg_64 }
+  - { id: 2, class: vreg_64 }
+body:             |
+  bb.0:
+
+    %0 = V_MOV_B64_PSEUDO 4607182418800017408, implicit $exec
+    %2 = V_FMAC_F64_e32 4611686018427387904, %0, %1, implicit $mode, implicit $exec
+
+...
+
+# GCN-LABEL: name: test_fmaak_other_constantlike_src0_f64
+# GCN: V_FMAC_F64_e32 %stack.0, %0, %2, implicit $mode, implicit $exec
+---
+name:            test_fmaak_other_constantlike_src0_f64
+registers:
+  - { id: 0, class: vreg_64 }
+  - { id: 1, class: vreg_64 }
+  - { id: 2, class: vreg_64 }
+stack:
+  - { id: 0, name: "", type: default, offset: 0, size: 128, alignment: 8,
+      callee-saved-register: '', local-offset: 0, debug-info-variable: '',
+      debug-info-expression: '', debug-info-location: '' }
+body:             |
+  bb.0:
+
+    %0 = V_MOV_B64_PSEUDO 4607182418800017408, implicit $exec
+    %2 = V_FMAC_F64_e32 %stack.0, %0, %1, implicit $mode, implicit $exec
+
+...
+
+# GCN-LABEL: name: test_fmamk_reg_unfoldable_literal_src0_f64
+# GCN: V_FMA_F64_e64 0, %2, 0, killed %0, 0, killed %1, 0, 0, implicit $mode, implicit $exec
+---
+name:            test_fmamk_reg_unfoldable_literal_src0_f64
+registers:
+  - { id: 0, class: vreg_64 }
+  - { id: 1, class: vreg_64 }
+  - { id: 2, class: vreg_64 }
+  - { id: 3, class: vreg_64 }
+body:             |
+  bb.0:
+
+    %0 = IMPLICIT_DEF
+    %1 = COPY %0
+    %2 = V_MOV_B64_PSEUDO 123456, implicit $exec
+    %3 = V_FMAC_F64_e32 %2, killed %0, killed %1, implicit $mode, implicit $exec
+
+...
+
+# GCN-LABEL: name: test_fmamk_reg_unfoldable_literal_src1_f64
+# GCN: V_FMA_F64_e64 0, killed %0, 0, %2, 0, killed %1, 0, 0, implicit $mode, implicit $exec
+---
+name:            test_fmamk_reg_unfoldable_literal_src1_f64
+registers:
+  - { id: 0, class: vreg_64 }
+  - { id: 1, class: vreg_64 }
+  - { id: 2, class: vreg_64 }
+  - { id: 3, class: vreg_64 }
+body:             |
+  bb.0:
+
+    %0 = IMPLICIT_DEF
+    %1 = COPY %0
+    %2 = V_MOV_B64_PSEUDO 123456, implicit $exec
+    %3 = V_FMAC_F64_e32 killed %0, %2, killed %1, implicit $mode, implicit $exec
+
+...
+
+# GCN-LABEL: name: test_fmaak_reg_unfoldable_literal_src2_f64
+# GCN: V_FMA_F64_e64 0, killed %0, 0, killed %1, 0, %2, 0, 0, implicit $mode, implicit $exec
+---
+name:            test_fmaak_reg_unfoldable_literal_src2_f64
+registers:
+  - { id: 0, class: vreg_64 }
+  - { id: 1, class: vreg_64 }
+  - { id: 2, class: vreg_64 }
+  - { id: 3, class: vreg_64 }
+body:             |
+  bb.0:
+
+    %0 = IMPLICIT_DEF
+    %1 = COPY %0
+    %2 = V_MOV_B64_PSEUDO 123456, implicit $exec
+    %3 = V_FMAC_F64_e32 killed %0, killed %1, %2, implicit $mode, implicit $exec
+
+...

diff  --git a/llvm/test/CodeGen/AMDGPU/undefined-physreg-sgpr-spill.mir b/llvm/test/CodeGen/AMDGPU/undefined-physreg-sgpr-spill.mir
index cda6ecd46507..02638a1d7c5f 100644
--- a/llvm/test/CodeGen/AMDGPU/undefined-physreg-sgpr-spill.mir
+++ b/llvm/test/CodeGen/AMDGPU/undefined-physreg-sgpr-spill.mir
@@ -44,7 +44,7 @@ body:             |
     liveins: $vgpr0, $sgpr4_sgpr5, $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr13
 
     $vgpr1_vgpr2 = COPY killed $sgpr4_sgpr5, implicit $exec
-    $vgpr1 = GLOBAL_LOAD_UBYTE killed $vgpr1_vgpr2, 0, 0, 0, 0, implicit $exec :: (non-temporal dereferenceable invariant load 1 from `i1 addrspace(4)* undef`)
+    $vgpr1 = GLOBAL_LOAD_UBYTE killed $vgpr1_vgpr2, 0, 0, 0, 0, 0, implicit $exec :: (non-temporal dereferenceable invariant load 1 from `i1 addrspace(4)* undef`)
     $vcc = V_CMP_NE_U32_e64 0, $vgpr0, implicit $exec
     $sgpr0_sgpr1 = V_CMP_EQ_U32_e64 1, killed $vgpr1, implicit $exec
     $vgpr1 = V_CNDMASK_B32_e64 0, 0, 0, -1, killed $sgpr0_sgpr1, implicit $exec
@@ -109,7 +109,7 @@ body:             |
     liveins: $vgpr0, $sgpr4_sgpr5, $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr13
 
     $vgpr1_vgpr2 = COPY killed $sgpr4_sgpr5, implicit $exec
-    $vgpr1 = GLOBAL_LOAD_UBYTE killed $vgpr1_vgpr2, 0, 0, 0, 0, implicit $exec :: (non-temporal dereferenceable invariant load 1 from `i1 addrspace(4)* undef`)
+    $vgpr1 = GLOBAL_LOAD_UBYTE killed $vgpr1_vgpr2, 0, 0, 0, 0, 0, implicit $exec :: (non-temporal dereferenceable invariant load 1 from `i1 addrspace(4)* undef`)
     $vcc = V_CMP_NE_U32_e64 0, $vgpr0, implicit $exec
     $sgpr0_sgpr1 = V_CMP_EQ_U32_e64 1, killed $vgpr1, implicit $exec
     $vgpr1 = V_CNDMASK_B32_e64 0, 0, 0, -1, killed $sgpr0_sgpr1, implicit $exec

diff  --git a/llvm/test/CodeGen/AMDGPU/v_mov_b64_expansion.mir b/llvm/test/CodeGen/AMDGPU/v_mov_b64_expansion.mir
new file mode 100644
index 000000000000..9560d1c92722
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/v_mov_b64_expansion.mir
@@ -0,0 +1,80 @@
+# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass postrapseudos -verify-machineinstrs %s -o - | FileCheck -check-prefixes=GCN,GFX900 %s
+# RUN: llc -march=amdgcn -mcpu=gfx90a -run-pass postrapseudos -verify-machineinstrs %s -o - | FileCheck -check-prefixes=GCN,GFX90A %s
+
+# GCN-LABEL: name: v_mov_b64_from_vgpr
+# GFX900: $vgpr0 = V_MOV_B32_e32 $vgpr2, implicit $exec, implicit-def $vgpr0_vgpr1
+# GFX900: $vgpr1 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit-def $vgpr0_vgpr1
+# GFX90A: $vgpr0_vgpr1 = V_PK_MOV_B32 8, $vgpr2_vgpr3, 12, $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec
+name: v_mov_b64_from_vgpr
+body: |
+  bb.0:
+    $vgpr0_vgpr1 = V_MOV_B64_PSEUDO $vgpr2_vgpr3, implicit $exec
+...
+
+# GCN-LABEL: name: v_mov_b64_from_sgpr
+# GFX900: $vgpr0 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit-def $vgpr0_vgpr1
+# GFX900: $vgpr1 = V_MOV_B32_e32 $sgpr3, implicit $exec, implicit-def $vgpr0_vgpr1
+# GFX90A: $vgpr0_vgpr1 = V_PK_MOV_B32 8, $sgpr2_sgpr3, 12, $sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
+name: v_mov_b64_from_sgpr
+body: |
+  bb.0:
+    $vgpr0_vgpr1 = V_MOV_B64_PSEUDO $sgpr2_sgpr3, implicit $exec
+...
+
+# GCN-LABEL: name: v_mov_b64_from_sext_inline_imm
+# GFX900: $vgpr0 = V_MOV_B32_e32 4294967294, implicit $exec, implicit-def $vgpr0_vgpr1
+# GFX900: $vgpr1 = V_MOV_B32_e32 4294967295, implicit $exec, implicit-def $vgpr0_vgpr1
+# GFX90A: $vgpr0 = V_MOV_B32_e32 4294967294, implicit $exec, implicit-def $vgpr0_vgpr1
+# GFX90A: $vgpr1 = V_MOV_B32_e32 4294967295, implicit $exec, implicit-def $vgpr0_vgpr1
+name: v_mov_b64_from_sext_inline_imm
+body: |
+  bb.0:
+    $vgpr0_vgpr1 = V_MOV_B64_PSEUDO 18446744073709551614, implicit $exec
+...
+
+# GCN-LABEL: name: v_mov_b64_from_lit
+# GCN: $vgpr0 = V_MOV_B32_e32 1430494974, implicit $exec, implicit-def $vgpr0_vgpr1
+# GCN: $vgpr1 = V_MOV_B32_e32 4294734465, implicit $exec, implicit-def $vgpr0_vgpr1
+name: v_mov_b64_from_lit
+body: |
+  bb.0:
+    $vgpr0_vgpr1 = V_MOV_B64_PSEUDO 18445744073609551614, implicit $exec
+...
+
+# GCN-LABEL: name: v_mov_b64_from_first_inline_imm
+# GCN: $vgpr0 = V_MOV_B32_e32 4294967295, implicit $exec, implicit-def $vgpr0_vgpr1
+# GCN: $vgpr1 = V_MOV_B32_e32 268435455, implicit $exec, implicit-def $vgpr0_vgpr1
+name: v_mov_b64_from_first_inline_imm
+body: |
+  bb.0:
+    $vgpr0_vgpr1 = V_MOV_B64_PSEUDO 1152921504606846975, implicit $exec
+...
+
+# GCN-LABEL: name: v_mov_b64_from_second_inline_imm
+# GCN: $vgpr0 = V_MOV_B32_e32 268435455, implicit $exec, implicit-def $vgpr0_vgpr1
+# GCN: $vgpr1 = V_MOV_B32_e32 4294967295, implicit $exec, implicit-def $vgpr0_vgpr1
+name: v_mov_b64_from_second_inline_imm
+body: |
+  bb.0:
+    $vgpr0_vgpr1 = V_MOV_B64_PSEUDO 18446744069683019775, implicit $exec
+...
+
+# GCN-LABEL: name: v_mov_b64_from_same_sext_inline_imm
+# GFX900: $vgpr0 = V_MOV_B32_e32 4294967295, implicit $exec, implicit-def $vgpr0_vgpr1
+# GFX900: $vgpr1 = V_MOV_B32_e32 4294967295, implicit $exec, implicit-def $vgpr0_vgpr1
+# GFX90A: $vgpr0_vgpr1 = V_PK_MOV_B32 8, -1, 8, -1, 0, 0, 0, 0, 0, implicit $exec
+name: v_mov_b64_from_same_sext_inline_imm
+body: |
+  bb.0:
+    $vgpr0_vgpr1 = V_MOV_B64_PSEUDO 18446744073709551615, implicit $exec
+...
+
+# GCN-LABEL: name: v_mov_b64_from_same_fp_inline_imm
+# GFX900: $vgpr0 = V_MOV_B32_e32 1065353216, implicit $exec, implicit-def $vgpr0_vgpr1
+# GFX900: $vgpr1 = V_MOV_B32_e32 1065353216, implicit $exec, implicit-def $vgpr0_vgpr1
+# GFX90A: $vgpr0_vgpr1 = V_PK_MOV_B32 8, 1065353216, 8, 1065353216, 0, 0, 0, 0, 0, implicit $exec
+name: v_mov_b64_from_same_fp_inline_imm
+body: |
+  bb.0:
+    $vgpr0_vgpr1 = V_MOV_B64_PSEUDO 4575657222473777152, implicit $exec
+...

diff  --git a/llvm/test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir b/llvm/test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir
index a4b62d568138..a5525ee15fbe 100644
--- a/llvm/test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir
+++ b/llvm/test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir
@@ -24,7 +24,7 @@ body: |
     liveins: $sgpr6, $sgpr7, $sgpr0_sgpr1_sgpr2_sgpr3:0x00000003
 
     $vgpr0 = V_MOV_B32_e32 9, implicit $exec
-    BUFFER_STORE_DWORD_OFFSET killed $vgpr0, killed $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    BUFFER_STORE_DWORD_OFFSET killed $vgpr0, killed $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     $vgpr0 = V_MOV_B32_e32 0, implicit $exec
     S_BRANCH %bb.3
 
@@ -32,7 +32,7 @@ body: |
     liveins: $sgpr6, $sgpr7, $sgpr0_sgpr1_sgpr2_sgpr3:0x00000003
 
     $vgpr0 = V_MOV_B32_e32 100, implicit $exec
-    BUFFER_STORE_DWORD_OFFSET killed $vgpr0, killed $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    BUFFER_STORE_DWORD_OFFSET killed $vgpr0, killed $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     $vgpr0 = V_MOV_B32_e32 1, implicit $exec
 
   bb.3:
@@ -40,7 +40,7 @@ body: |
 
     $sgpr3 = S_MOV_B32 61440
     $sgpr2 = S_MOV_B32 -1
-    BUFFER_STORE_DWORD_OFFSET killed $vgpr0, killed $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    BUFFER_STORE_DWORD_OFFSET killed $vgpr0, killed $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     S_ENDPGM 0
 
 ...
@@ -65,7 +65,7 @@ body: |
     liveins: $sgpr6, $sgpr7, $sgpr0_sgpr1_sgpr2_sgpr3:0x00000003
 
     $vgpr0 = V_MOV_B32_e32 9, implicit $exec
-    BUFFER_STORE_DWORD_OFFSET killed $vgpr0, killed $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    BUFFER_STORE_DWORD_OFFSET killed $vgpr0, killed $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     $vgpr0 = V_MOV_B32_e32 0, implicit $exec
     S_BRANCH %bb.3
 
@@ -73,7 +73,7 @@ body: |
     liveins: $sgpr6, $sgpr7, $sgpr0_sgpr1_sgpr2_sgpr3:0x00000003
 
     $vgpr0 = V_MOV_B32_e32 100, implicit $exec
-    BUFFER_STORE_DWORD_OFFSET killed $vgpr0, killed $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    BUFFER_STORE_DWORD_OFFSET killed $vgpr0, killed $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     $vgpr0 = V_MOV_B32_e32 1, implicit $exec
 
   bb.3:
@@ -81,7 +81,7 @@ body: |
 
     $sgpr3 = S_MOV_B32 61440
     $sgpr2 = S_MOV_B32 -1
-    BUFFER_STORE_DWORD_OFFSET killed $vgpr0, killed $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    BUFFER_STORE_DWORD_OFFSET killed $vgpr0, killed $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     S_ENDPGM 0
 
 ...
@@ -110,9 +110,9 @@ body: |
 # instructions to fix vccz.
 
 # CHECK-LABEL: name: reload_vcc_from_mem
-# CHECK: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, 0, 0, 0, implicit $exec
+# CHECK: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, 0, 0, 0, 0, implicit $exec
 # CHECK: $vcc_lo = V_READFIRSTLANE_B32 killed $vgpr0, implicit $exec, implicit-def $vcc
-# CHECK: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 8, 0, 0, 0, 0, 0, implicit $exec
+# CHECK: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 8, 0, 0, 0, 0, 0, 0, implicit $exec
 # CHECK: $vcc_hi = V_READFIRSTLANE_B32 killed $vgpr0, implicit $exec, implicit-def $vcc
 # SI:    $vcc = S_MOV_B64 $vcc
 # GFX9:  $vcc = S_MOV_B64 $vcc
@@ -121,9 +121,9 @@ body: |
 name: reload_vcc_from_mem
 body: |
   bb.0:
-    $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, 0, 0, 0, 0, implicit $exec
     $vcc_lo = V_READFIRSTLANE_B32 killed $vgpr0, implicit $exec, implicit-def $vcc
-    $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 8, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 8, 0, 0, 0, 0, 0, 0, implicit $exec
     $vcc_hi = V_READFIRSTLANE_B32 killed $vgpr0, implicit $exec, implicit-def $vcc
     S_CBRANCH_VCCNZ %bb.1, implicit killed $vcc
   bb.1:

diff  --git a/llvm/test/CodeGen/AMDGPU/vgpr-spill.mir b/llvm/test/CodeGen/AMDGPU/vgpr-spill.mir
index c356e666ab14..b629e98c27cd 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-spill.mir
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-spill.mir
@@ -16,7 +16,7 @@ body:             |
 
     ; CHECK-LABEL: name: spill_v32
     ; CHECK: liveins: $vgpr0
-    ; CHECK: BUFFER_STORE_DWORD_OFFSET $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %stack.0, addrspace 5)
+    ; CHECK: BUFFER_STORE_DWORD_OFFSET $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %stack.0, addrspace 5)
     ; CHECK: S_NOP 0, implicit $vgpr0
     SI_SPILL_V32_SAVE $vgpr0, %stack.0, $sgpr32, 0, implicit $exec :: (store 4 into %stack.0, addrspace 5)
     S_NOP 0, implicit $vgpr0
@@ -37,7 +37,7 @@ body:             |
 
     ; CHECK-LABEL: name: spill_v32_kill
     ; CHECK: liveins: $vgpr0
-    ; CHECK: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %stack.0, addrspace 5)
+    ; CHECK: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %stack.0, addrspace 5)
     SI_SPILL_V32_SAVE killed $vgpr0, %stack.0, $sgpr32, 0, implicit $exec :: (store 4 into %stack.0, addrspace 5)
 ...
 
@@ -56,8 +56,8 @@ body:             |
 
     ; CHECK-LABEL: name: spill_v64
     ; CHECK: liveins: $vgpr0_vgpr1
-    ; CHECK: BUFFER_STORE_DWORD_OFFSET $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1 :: (store 4 into %stack.0, addrspace 5)
-    ; CHECK: BUFFER_STORE_DWORD_OFFSET $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1 :: (store 4 into %stack.0 + 4, addrspace 5)
+    ; CHECK: BUFFER_STORE_DWORD_OFFSET $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1 :: (store 4 into %stack.0, addrspace 5)
+    ; CHECK: BUFFER_STORE_DWORD_OFFSET $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1 :: (store 4 into %stack.0 + 4, addrspace 5)
     ; CHECK: S_NOP 0, implicit $vgpr0_vgpr1
     SI_SPILL_V64_SAVE $vgpr0_vgpr1, %stack.0, $sgpr32, 0, implicit $exec :: (store 8 into %stack.0, addrspace 5)
     S_NOP 0, implicit $vgpr0_vgpr1
@@ -78,8 +78,8 @@ body:             |
 
     ; CHECK-LABEL: name: spill_v64_kill
     ; CHECK: liveins: $vgpr0_vgpr1
-    ; CHECK: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1 :: (store 4 into %stack.0, addrspace 5)
-    ; CHECK: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, implicit $exec, implicit killed $vgpr0_vgpr1 :: (store 4 into %stack.0 + 4, addrspace 5)
+    ; CHECK: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1 :: (store 4 into %stack.0, addrspace 5)
+    ; CHECK: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec, implicit killed $vgpr0_vgpr1 :: (store 4 into %stack.0 + 4, addrspace 5)
     SI_SPILL_V64_SAVE killed $vgpr0_vgpr1, %stack.0, $sgpr32, 0, implicit $exec :: (store 8 into %stack.0, addrspace 5)
 ...
 
@@ -100,8 +100,8 @@ body:             |
 
     ; CHECK-LABEL: name: spill_v64_undef_sub1_killed
     ; CHECK: liveins: $vgpr0
-    ; CHECK: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1 :: (store 4 into %stack.0, addrspace 5)
-    ; CHECK: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, implicit $exec, implicit killed $vgpr0_vgpr1 :: (store 4 into %stack.0 + 4, addrspace 5)
+    ; CHECK: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1 :: (store 4 into %stack.0, addrspace 5)
+    ; CHECK: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec, implicit killed $vgpr0_vgpr1 :: (store 4 into %stack.0 + 4, addrspace 5)
     SI_SPILL_V64_SAVE killed $vgpr0_vgpr1, %stack.0, $sgpr32, 0, implicit $exec :: (store 8 into %stack.0, addrspace 5)
 ...
 
@@ -120,8 +120,8 @@ body:             |
 
     ; CHECK-LABEL: name: spill_v64_undef_sub0_killed
     ; CHECK: liveins: $vgpr1
-    ; CHECK: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1 :: (store 4 into %stack.0, addrspace 5)
-    ; CHECK: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, implicit $exec, implicit killed $vgpr0_vgpr1 :: (store 4 into %stack.0 + 4, addrspace 5)
+    ; CHECK: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1 :: (store 4 into %stack.0, addrspace 5)
+    ; CHECK: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec, implicit killed $vgpr0_vgpr1 :: (store 4 into %stack.0 + 4, addrspace 5)
     SI_SPILL_V64_SAVE killed $vgpr0_vgpr1, %stack.0, $sgpr32, 0, implicit $exec :: (store 8 into %stack.0, addrspace 5)
 ...
 
@@ -140,9 +140,9 @@ body:             |
 
     ; CHECK-LABEL: name: spill_v128_kill
     ; CHECK: liveins: $vgpr0_vgpr1_vgpr2_vgpr3
-    ; CHECK: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr0_vgpr1_vgpr2_vgpr3 :: (store 4 into %stack.0, addrspace 5)
-    ; CHECK: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 :: (store 4 into %stack.0 + 4, addrspace 5)
-    ; CHECK: BUFFER_STORE_DWORD_OFFSET killed $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 :: (store 4 into %stack.0 + 8, addrspace 5)
-    ; CHECK: BUFFER_STORE_DWORD_OFFSET killed $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3 :: (store 4 into %stack.0 + 12, addrspace 5)
+    ; CHECK: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr0_vgpr1_vgpr2_vgpr3 :: (store 4 into %stack.0, addrspace 5)
+    ; CHECK: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 :: (store 4 into %stack.0 + 4, addrspace 5)
+    ; CHECK: BUFFER_STORE_DWORD_OFFSET killed $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 :: (store 4 into %stack.0 + 8, addrspace 5)
+    ; CHECK: BUFFER_STORE_DWORD_OFFSET killed $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, 0, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3 :: (store 4 into %stack.0 + 12, addrspace 5)
     SI_SPILL_V128_SAVE killed $vgpr0_vgpr1_vgpr2_vgpr3, %stack.0, $sgpr32, 0, implicit $exec :: (store 16 into %stack.0, addrspace 5)
 ...

diff  --git a/llvm/test/CodeGen/AMDGPU/virtregrewrite-undef-identity-copy.mir b/llvm/test/CodeGen/AMDGPU/virtregrewrite-undef-identity-copy.mir
index d1766d1c0f9d..8db8ee3db357 100644
--- a/llvm/test/CodeGen/AMDGPU/virtregrewrite-undef-identity-copy.mir
+++ b/llvm/test/CodeGen/AMDGPU/virtregrewrite-undef-identity-copy.mir
@@ -29,7 +29,7 @@ machineFunctionInfo:
 body:             |
   bb.0:
     ; CHECK-LABEL: name: undef_identity_copy
-    ; CHECK: renamable $vgpr40_vgpr41_vgpr42_vgpr43 = FLAT_LOAD_DWORDX4 undef renamable $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 16, addrspace 1)
+    ; CHECK: renamable $vgpr40_vgpr41_vgpr42_vgpr43 = FLAT_LOAD_DWORDX4 undef renamable $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 16, addrspace 1)
     ; CHECK: renamable $sgpr6_sgpr7 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @foo + 4, target-flags(amdgpu-rel32-hi) @foo + 4, implicit-def dead $scc
     ; CHECK: ADJCALLSTACKUP 0, 0, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr95
     ; CHECK: $sgpr4 = COPY $sgpr95
@@ -44,9 +44,9 @@ body:             |
     ; CHECK: $vgpr3 = KILL undef renamable $vgpr3
     ; CHECK: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr6_sgpr7, @bar, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4, implicit $vgpr0, implicit killed $vgpr1, implicit killed $vgpr2, implicit killed $vgpr3, implicit-def $vgpr0
     ; CHECK: ADJCALLSTACKDOWN 0, 4, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr95
-    ; CHECK: FLAT_STORE_DWORD undef renamable $vgpr0_vgpr1, killed renamable $vgpr0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
+    ; CHECK: FLAT_STORE_DWORD undef renamable $vgpr0_vgpr1, killed renamable $vgpr0, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
     ; CHECK: S_ENDPGM 0
-    %0:vreg_128 = FLAT_LOAD_DWORDX4 undef %1:vreg_64, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 16, addrspace 1)
+    %0:vreg_128 = FLAT_LOAD_DWORDX4 undef %1:vreg_64, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 16, addrspace 1)
     %2:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @foo + 4, target-flags(amdgpu-rel32-hi) @foo + 4, implicit-def dead $scc
     ADJCALLSTACKUP 0, 0, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr95, implicit-def $scc
     $sgpr4 = COPY $sgpr95
@@ -62,7 +62,7 @@ body:             |
     dead $sgpr30_sgpr31 = SI_CALL %3, @bar, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4, implicit $vgpr0, implicit killed $vgpr1, implicit killed $vgpr2, implicit killed $vgpr3, implicit-def $vgpr0
     %5:vgpr_32 = COPY $vgpr0
     ADJCALLSTACKDOWN 0, 4, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr95
-    FLAT_STORE_DWORD undef %6:vreg_64, %5, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
+    FLAT_STORE_DWORD undef %6:vreg_64, %5, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
     S_ENDPGM 0
 
 ...

diff  --git a/llvm/test/CodeGen/AMDGPU/vmem-to-salu-hazard.mir b/llvm/test/CodeGen/AMDGPU/vmem-to-salu-hazard.mir
index e730b5d7aa41..ccd9b1af043b 100644
--- a/llvm/test/CodeGen/AMDGPU/vmem-to-salu-hazard.mir
+++ b/llvm/test/CodeGen/AMDGPU/vmem-to-salu-hazard.mir
@@ -11,7 +11,7 @@ body:             |
     $sgpr0_sgpr1_sgpr2_sgpr3 = IMPLICIT_DEF
     $sgpr4 = IMPLICIT_DEF
     $vgpr0 = IMPLICIT_DEF
-    $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     $sgpr0 = S_MOV_B32 0
 ...
 # GCN-LABEL: name: vmem_write_exec
@@ -26,7 +26,7 @@ body:             |
     $sgpr4 = IMPLICIT_DEF
     $vgpr0 = IMPLICIT_DEF
     $vgpr1 = IMPLICIT_DEF
-    BUFFER_STORE_DWORD_OFFEN_exact killed renamable $vgpr0, renamable $vgpr1, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    BUFFER_STORE_DWORD_OFFEN_exact killed renamable $vgpr0, renamable $vgpr1, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     $exec_lo = S_MOV_B32 -1
 ...
 # GCN-LABEL: name: vmem_write_sgpr_chain
@@ -45,7 +45,7 @@ body:             |
     $sgpr0_sgpr1_sgpr2_sgpr3 = IMPLICIT_DEF
     $sgpr4 = IMPLICIT_DEF
     $vgpr0 = IMPLICIT_DEF
-    $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     $sgpr5 = S_MOV_B32 $sgpr0
     $sgpr6 = S_MOV_B32 $sgpr1
     $sgpr7 = S_MOV_B32 $sgpr2
@@ -63,7 +63,7 @@ body:             |
     $sgpr0_sgpr1_sgpr2_sgpr3 = IMPLICIT_DEF
     $sgpr4 = IMPLICIT_DEF
     $vgpr0 = IMPLICIT_DEF
-    $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     $sgpr0 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0, 0
 ...
 # GCN-LABEL: name: vmem_snop_write_sgpr
@@ -78,7 +78,7 @@ body:             |
     $sgpr0_sgpr1_sgpr2_sgpr3 = IMPLICIT_DEF
     $sgpr4 = IMPLICIT_DEF
     $vgpr0 = IMPLICIT_DEF
-    $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     S_NOP 0
     $sgpr0 = S_MOV_B32 0
 ...
@@ -93,7 +93,7 @@ body:             |
     $sgpr0_sgpr1_sgpr2_sgpr3 = IMPLICIT_DEF
     $sgpr4 = IMPLICIT_DEF
     $vgpr0 = IMPLICIT_DEF
-    $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     $vgpr2 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
     $sgpr0 = S_MOV_B32 0
 ...
@@ -108,7 +108,7 @@ body:             |
     $sgpr0_sgpr1_sgpr2_sgpr3 = IMPLICIT_DEF
     $sgpr4 = IMPLICIT_DEF
     $vgpr0 = IMPLICIT_DEF
-    $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     S_WAITCNT 0
     $sgpr0 = S_MOV_B32 0
 ...
@@ -124,7 +124,7 @@ body:             |
     $sgpr0_sgpr1_sgpr2_sgpr3 = IMPLICIT_DEF
     $sgpr4 = IMPLICIT_DEF
     $vgpr0 = IMPLICIT_DEF
-    $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     S_WAITCNT 1
     $sgpr0 = S_MOV_B32 0
 ...
@@ -139,7 +139,7 @@ body:             |
     $sgpr0_sgpr1_sgpr2_sgpr3 = IMPLICIT_DEF
     $sgpr4 = IMPLICIT_DEF
     $vgpr0 = IMPLICIT_DEF
-    $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     $exec = S_MOV_B64 7
 ...
 # GCN-LABEL: name: vmem_write_exec_expread
@@ -152,7 +152,7 @@ body:             |
   bb.0:
     $sgpr0_sgpr1_sgpr2_sgpr3 = IMPLICIT_DEF
     $vgpr0 = IMPLICIT_DEF
-    $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $exec_lo, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $exec_lo, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     $exec = S_MOV_B64 7
 ...
 # GCN-LABEL: name: ds_write_m0
@@ -181,7 +181,7 @@ body:             |
     $sgpr0_sgpr1_sgpr2_sgpr3 = IMPLICIT_DEF
     $sgpr4 = IMPLICIT_DEF
     $vgpr0 = IMPLICIT_DEF
-    $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, 0, implicit $exec
 
   bb.1:
     $sgpr0 = S_MOV_B32 0
@@ -199,7 +199,7 @@ body:             |
     $sgpr0_sgpr1_sgpr2_sgpr3 = IMPLICIT_DEF
     $sgpr4 = IMPLICIT_DEF
     $vgpr0 = IMPLICIT_DEF
-    $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     S_BRANCH %bb.1
 
   bb.1:
@@ -219,7 +219,7 @@ body:             |
     $sgpr0_sgpr1_sgpr2_sgpr3 = IMPLICIT_DEF
     $sgpr4 = IMPLICIT_DEF
     $vgpr0 = IMPLICIT_DEF
-    $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     S_BRANCH %bb.2
 
   bb.1:
@@ -247,7 +247,7 @@ body:             |
     $sgpr0_sgpr1_sgpr2_sgpr3 = IMPLICIT_DEF
     $sgpr4 = IMPLICIT_DEF
     $vgpr0 = IMPLICIT_DEF
-    $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     S_CBRANCH_SCC0 %bb.2, implicit $scc
     S_BRANCH %bb.1
 
@@ -275,7 +275,7 @@ body:             |
     $sgpr0 = S_MOV_B32 0
 
   bb.1:
-    $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     S_BRANCH %bb.0
 ...
 # GCN-LABEL: name: ds_write_exec
@@ -300,7 +300,7 @@ name:            vmem_scratch_exec
 body:             |
   bb.0:
     $vgpr0 = IMPLICIT_DEF
-    $vgpr1 = SCRATCH_LOAD_DWORD undef $vgpr0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    $vgpr1 = SCRATCH_LOAD_DWORD undef $vgpr0, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
     $exec_lo = S_MOV_B32 -1
 ...
 # GCN-LABEL: name: vmem_flat_exec
@@ -313,7 +313,7 @@ body:             |
   bb.0:
     $vgpr0 = IMPLICIT_DEF
     $vgpr1 = IMPLICIT_DEF
-    $vgpr2 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    $vgpr2 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
     $exec_lo = S_MOV_B32 -1
 ...
 # GCN-LABEL: name: vmem_global_exec
@@ -326,7 +326,7 @@ body:             |
   bb.0:
     $vgpr0 = IMPLICIT_DEF
     $vgpr1 = IMPLICIT_DEF
-    $vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec
+    $vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $exec
     $exec_lo = S_MOV_B32 -1
 ...
 # GCN-LABEL: name: vmem_global_atomic_exec
@@ -340,6 +340,6 @@ body:             |
     $vgpr0 = IMPLICIT_DEF
     $vgpr1 = IMPLICIT_DEF
     $vgpr2 = IMPLICIT_DEF
-    $vgpr3 = GLOBAL_ATOMIC_ADD_RTN $vgpr0_vgpr1, $vgpr2, 0, 1, 0, implicit $exec :: (load store syncscope("agent") seq_cst 4, addrspace 1)
+    $vgpr3 = GLOBAL_ATOMIC_ADD_RTN $vgpr0_vgpr1, $vgpr2, 0, 1, 0, 0, implicit $exec :: (load store syncscope("agent") seq_cst 4, addrspace 1)
     $exec_lo = S_MOV_B32 -1
 ...

diff  --git a/llvm/test/CodeGen/AMDGPU/vmem-vcc-hazard.mir b/llvm/test/CodeGen/AMDGPU/vmem-vcc-hazard.mir
index 2efbe582fbb3..8c05c7041aa4 100644
--- a/llvm/test/CodeGen/AMDGPU/vmem-vcc-hazard.mir
+++ b/llvm/test/CodeGen/AMDGPU/vmem-vcc-hazard.mir
@@ -15,7 +15,7 @@ body:             |
     $vgpr1 = V_ADDC_U32_e32 $vgpr0, $vgpr0, implicit-def $vcc, implicit $vcc, implicit $exec
 
   bb.1:
-    $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, 0, 0, 0, implicit $exec
 ...
 # GCN-LABEL: name: vmem_vcc_branch_to_next
 # GCN:      bb.1:
@@ -33,7 +33,7 @@ body:             |
     S_BRANCH %bb.1
 
   bb.1:
-    $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, 0, 0, 0, implicit $exec
 ...
 # GCN-LABEL: name: vmem_vcc_fallthrough_no_hazard_too_far
 # GCN:      bb.1:
@@ -54,7 +54,7 @@ body:             |
     $sgpr0 = S_MOV_B32 0
 
   bb.1:
-    $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, 0, 0, 0, implicit $exec
 ...
 # GCN-LABEL: name: vmem_vcc_fallthrough_no_hazard_nops
 # GCN:      bb.1:
@@ -71,7 +71,7 @@ body:             |
     S_NOP 4
 
   bb.1:
-    $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, 0, 0, 0, implicit $exec
 ...
 # GCN-LABEL: name: vmem_vcc_branch_around
 # GCN:      bb.2:
@@ -97,7 +97,7 @@ body:             |
     S_NOP 0
 
   bb.2:
-    $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, 0, 0, 0, implicit $exec
 ...
 # GCN-LABEL: name: vmem_vcc_branch_backedge
 # GCN:      S_NOP 3
@@ -110,7 +110,7 @@ body:             |
 
     $vgpr0 = IMPLICIT_DEF
     $sgpr0_sgpr1_sgpr2_sgpr3 = IMPLICIT_DEF
-    $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, 0, 0, 0, implicit $exec
 
   bb.1:
     $vgpr0 = IMPLICIT_DEF
@@ -139,7 +139,7 @@ body:             |
     $vgpr1 = V_ADDC_U32_e32 $vgpr0, $vgpr0, implicit-def $vcc, implicit $vcc, implicit $exec
 
   bb.2:
-    $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, 0, 0, 0, implicit $exec
 ...
 # GCN-LABEL: name: vmem_vcc_self_loop
 # GCN:      S_NOP 3
@@ -152,7 +152,7 @@ body:             |
 
     $vgpr0 = IMPLICIT_DEF
     $sgpr0_sgpr1_sgpr2_sgpr3 = IMPLICIT_DEF
-    $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     $vgpr1 = V_ADDC_U32_e32 $vgpr0, $vgpr0, implicit-def $vcc, implicit $vcc, implicit $exec
     S_BRANCH %bb.0
 ...
@@ -175,7 +175,7 @@ body:             |
     successors: %bb.1
 
     $sgpr0 = S_MOV_B32 0
-    $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     $vgpr1 = V_ADDC_U32_e32 $vgpr1, $vgpr1, implicit-def $vcc, implicit $vcc, implicit $exec
     S_BRANCH %bb.1
 ...
@@ -199,7 +199,7 @@ body:             |
     successors: %bb.1
 
     $sgpr0 = S_MOV_B32 0
-    $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     $vgpr1 = V_ADDC_U32_e32 $vgpr1, $vgpr1, implicit-def $vcc, implicit $vcc, implicit $exec
     S_BRANCH %bb.1
 ...

diff  --git a/llvm/test/CodeGen/AMDGPU/waitcnt-agpr.mir b/llvm/test/CodeGen/AMDGPU/waitcnt-agpr.mir
new file mode 100644
index 000000000000..0b2f02e79f37
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/waitcnt-agpr.mir
@@ -0,0 +1,316 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -march=amdgcn -mcpu=gfx90a -run-pass si-insert-waitcnts -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s
+
+--- |
+  define amdgpu_kernel void @flat_zero_waitcnt(i32 addrspace(1)* %global4,
+                                 <4 x i32> addrspace(1)* %global16,
+                                 i32* %flat4,
+                                 <4 x i32>* %flat16) {
+    ret void
+  }
+
+  define amdgpu_kernel void @single_fallthrough_successor_no_end_block_wait() {
+    ret void
+  }
+
+  define amdgpu_kernel void @single_branch_successor_not_next_block() {
+    ret void
+  }
+
+  define amdgpu_kernel void @preexisting_waitcnt() {
+    ret void
+  }
+
+  define amdgpu_kernel void @bundle_no_waitcnt() {
+    ret void
+  }
+
+  define amdgpu_kernel void @preexisting_waitcnt_in_bundle() {
+    ret void
+  }
+
+  define amdgpu_kernel void @insert_in_bundle() {
+    ret void
+  }
+
+  define amdgpu_kernel void @exit_bundle() {
+    ret void
+  }
+
+  define amdgpu_kernel void @cross_bundle() {
+    ret void
+  }
+
+...
+---
+
+
+# Global loads will return in order so we should:
+# s_waitcnt vmcnt(1)
+
+# s_waitcnt vmcnt(0)
+
+# s_waitcnt vmcnt(0)
+
+name: flat_zero_waitcnt
+
+body: |
+  ; GCN-LABEL: name: flat_zero_waitcnt
+  ; GCN: bb.0:
+  ; GCN:   successors: %bb.1(0x80000000)
+  ; GCN:   S_WAITCNT 0
+  ; GCN:   $agpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %ir.global4, addrspace 1)
+  ; GCN:   $agpr3_agpr4_agpr5_agpr6 = FLAT_LOAD_DWORDX4 $vgpr7_vgpr8, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 16 from %ir.global16, addrspace 1)
+  ; GCN:   S_WAITCNT 3953
+  ; GCN:   $agpr0 = V_ACCVGPR_MOV_B32 $agpr1, implicit $exec
+  ; GCN:   S_BRANCH %bb.1
+  ; GCN: bb.1:
+  ; GCN:   successors: %bb.2(0x80000000)
+  ; GCN:   $agpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+  ; GCN:   S_WAITCNT 3952
+  ; GCN:   $agpr3_agpr4_agpr5_agpr6 = FLAT_LOAD_DWORDX4 $vgpr7_vgpr8, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 16 from %ir.global16, addrspace 1)
+  ; GCN:   $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec
+  ; GCN:   S_BRANCH %bb.2
+  ; GCN: bb.2:
+  ; GCN:   S_WAITCNT 49279
+  ; GCN:   $agpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %ir.flat4)
+  ; GCN:   S_WAITCNT 3952
+  ; GCN:   $agpr3_agpr4_agpr5_agpr6 = FLAT_LOAD_DWORDX4 $vgpr7_vgpr8, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 16 from %ir.flat16)
+  ; GCN:   $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec
+  ; GCN:   S_ENDPGM 0
+  bb.0:
+    successors: %bb.1
+    $agpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %ir.global4)
+    $agpr3_agpr4_agpr5_agpr6 = FLAT_LOAD_DWORDX4 $vgpr7_vgpr8, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 16 from %ir.global16)
+    $agpr0 = V_ACCVGPR_MOV_B32 $agpr1, implicit $exec
+    S_BRANCH %bb.1
+
+  bb.1:
+    successors: %bb.2
+    $agpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    $agpr3_agpr4_agpr5_agpr6 = FLAT_LOAD_DWORDX4 $vgpr7_vgpr8, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 16 from %ir.global16)
+    $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec
+    S_BRANCH %bb.2
+
+  bb.2:
+    $agpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %ir.flat4)
+    $agpr3_agpr4_agpr5_agpr6 = FLAT_LOAD_DWORDX4 $vgpr7_vgpr8, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 16 from %ir.flat16)
+    $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec
+    S_ENDPGM 0
+...
+---
+# There is only a single fallthrough successor block, so there's no
+# need to wait immediately.
+
+
+name: single_fallthrough_successor_no_end_block_wait
+
+body: |
+  ; GCN-LABEL: name: single_fallthrough_successor_no_end_block_wait
+  ; GCN: bb.0:
+  ; GCN:   successors: %bb.1(0x80000000)
+  ; GCN:   S_WAITCNT 0
+  ; GCN:   $agpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+  ; GCN: bb.1:
+  ; GCN:   $vgpr3_vgpr4 = V_LSHLREV_B64_e64 4, $vgpr7_vgpr8, implicit $exec
+  ; GCN:   S_WAITCNT 112
+  ; GCN:   FLAT_STORE_DWORD $vgpr3_vgpr4, $agpr0, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+  ; GCN:   S_ENDPGM 0
+  bb.0:
+    successors: %bb.1
+    $agpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+
+  bb.1:
+    $vgpr3_vgpr4 = V_LSHLREV_B64_e64 4, $vgpr7_vgpr8, implicit $exec
+    FLAT_STORE_DWORD $vgpr3_vgpr4, $agpr0, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    S_ENDPGM 0
+...
+---
+# The block has a single predecessor with a single successor, but it
+# is not the next block so it's non-obvious that the wait is not needed.
+
+
+
+
+name: single_branch_successor_not_next_block
+
+body: |
+  ; GCN-LABEL: name: single_branch_successor_not_next_block
+  ; GCN: bb.0:
+  ; GCN:   successors: %bb.2(0x80000000)
+  ; GCN:   S_WAITCNT 0
+  ; GCN:   $agpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+  ; GCN:   S_BRANCH %bb.2
+  ; GCN: bb.1:
+  ; GCN:   FLAT_STORE_DWORD $vgpr8_vgpr9, $agpr10, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+  ; GCN:   S_ENDPGM 0
+  ; GCN: bb.2:
+  ; GCN:   $vgpr3_vgpr4 = V_LSHLREV_B64_e64 4, $vgpr7_vgpr8, implicit $exec
+  ; GCN:   S_WAITCNT 112
+  ; GCN:   FLAT_STORE_DWORD $vgpr3_vgpr4, $agpr0, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+  ; GCN:   S_ENDPGM 0
+  bb.0:
+    successors: %bb.2
+    $agpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+   S_BRANCH %bb.2
+
+  bb.1:
+    FLAT_STORE_DWORD $vgpr8_vgpr9, $agpr10, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    S_ENDPGM 0
+
+  bb.2:
+    $vgpr3_vgpr4 = V_LSHLREV_B64_e64 4, $vgpr7_vgpr8, implicit $exec
+    FLAT_STORE_DWORD $vgpr3_vgpr4, $agpr0, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    S_ENDPGM 0
+...
+
+# GCN-LABEL: name: preexisting_waitcnt{{$}}
+# GCN: FLAT_LOAD_DWORD
+# GCN-NEXT: S_WAITCNT 0
+# GCN-NOT: S_WAITCNT
+name: preexisting_waitcnt
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+body: |
+  bb.0:
+    liveins: $vgpr1_vgpr2
+    $agpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    S_WAITCNT 0
+    FLAT_STORE_DWORD $vgpr1_vgpr2, $agpr0, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+
+...
+
+---
+
+name: bundle_no_waitcnt
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+body: |
+  bb.0:
+    liveins: $vgpr1_vgpr2
+    ; GCN-LABEL: name: bundle_no_waitcnt
+    ; GCN: liveins: $vgpr1_vgpr2
+    ; GCN: $agpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    ; GCN: BUNDLE {
+    ; GCN:   S_NOP 0
+    ; GCN:   S_NOP 0
+    ; GCN: }
+    ; GCN: S_WAITCNT 112
+    ; GCN: FLAT_STORE_DWORD $vgpr1_vgpr2, $agpr0, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    $agpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    BUNDLE {
+      S_NOP 0
+      S_NOP 0
+    }
+    FLAT_STORE_DWORD $vgpr1_vgpr2, $agpr0, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+
+...
+
+---
+
+# See the waitcnt inside the bundle and don't insert an extra
+name: preexisting_waitcnt_in_bundle
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+body: |
+  bb.0:
+    liveins: $vgpr1_vgpr2
+    ; GCN-LABEL: name: preexisting_waitcnt_in_bundle
+    ; GCN: liveins: $vgpr1_vgpr2
+    ; GCN: $agpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    ; GCN: BUNDLE {
+    ; GCN:   S_NOP 0
+    ; GCN:   S_WAITCNT 0
+    ; GCN: }
+    ; GCN: FLAT_STORE_DWORD $vgpr1_vgpr2, $agpr0, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    $agpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    BUNDLE {
+      S_NOP 0
+      S_WAITCNT 0
+    }
+    FLAT_STORE_DWORD $vgpr1_vgpr2, $agpr0, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+
+...
+
+---
+
+# Def and use inside bundle
+
+name: insert_in_bundle
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+body: |
+  bb.0:
+    liveins: $vgpr1_vgpr2
+    ; GCN-LABEL: name: insert_in_bundle
+    ; GCN: liveins: $vgpr1_vgpr2
+    ; GCN: BUNDLE implicit-def $agpr0, implicit $vgpr1_vgpr2 {
+    ; GCN:   $agpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    ; GCN:   S_WAITCNT 112
+    ; GCN:   FLAT_STORE_DWORD $vgpr1_vgpr2, internal $agpr0, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    ; GCN: }
+    BUNDLE implicit-def $agpr0, implicit $vgpr1_vgpr2 {
+    $agpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    FLAT_STORE_DWORD $vgpr1_vgpr2, internal $agpr0, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    }
+...
+
+---
+
+# Def is last instruction in bundle, use is outside bundle
+
+
+name: exit_bundle
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+body: |
+  bb.0:
+    liveins: $vgpr1_vgpr2
+    ; GCN-LABEL: name: exit_bundle
+    ; GCN: liveins: $vgpr1_vgpr2
+    ; GCN: BUNDLE implicit-def $agpr0, implicit $vgpr1_vgpr2 {
+    ; GCN:   $agpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    ; GCN: }
+    ; GCN: S_WAITCNT 112
+    ; GCN: FLAT_STORE_DWORD $vgpr1_vgpr2, $agpr0, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    BUNDLE implicit-def $agpr0, implicit $vgpr1_vgpr2 {
+    $agpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    }
+
+    FLAT_STORE_DWORD $vgpr1_vgpr2, $agpr0, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+
+...
+
+---
+
+# Def is in bundle, use is in another bundle
+
+
+name: cross_bundle
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+body: |
+  bb.0:
+    liveins: $vgpr1_vgpr2
+    ; GCN-LABEL: name: cross_bundle
+    ; GCN: liveins: $vgpr1_vgpr2
+    ; GCN: BUNDLE implicit-def $agpr0, implicit $vgpr1_vgpr2 {
+    ; GCN:   $agpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    ; GCN: }
+    ; GCN: S_WAITCNT 112
+    ; GCN: BUNDLE implicit $agpr0, implicit $vgpr1_vgpr2 {
+    ; GCN:   FLAT_STORE_DWORD $vgpr1_vgpr2, $agpr0, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    ; GCN: }
+    BUNDLE implicit-def $agpr0, implicit $vgpr1_vgpr2 {
+    $agpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    }
+    BUNDLE implicit $agpr0, implicit $vgpr1_vgpr2 {
+      FLAT_STORE_DWORD $vgpr1_vgpr2, $agpr0, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    }
+...

diff  --git a/llvm/test/CodeGen/AMDGPU/waitcnt-back-edge-loop.mir b/llvm/test/CodeGen/AMDGPU/waitcnt-back-edge-loop.mir
index e6abdba43642..795f0974f480 100644
--- a/llvm/test/CodeGen/AMDGPU/waitcnt-back-edge-loop.mir
+++ b/llvm/test/CodeGen/AMDGPU/waitcnt-back-edge-loop.mir
@@ -13,8 +13,8 @@ body:             |
 
     $vgpr1 = V_MOV_B32_e32 0, implicit $exec, implicit-def $vgpr1_vgpr2
     $vgpr2 = V_MOV_B32_e32 0, implicit $exec, implicit-def $vgpr1_vgpr2
-    $vgpr4 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load 4 from `float addrspace(1)* null`, addrspace 1)
-    $vgpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load 4 from `float addrspace(1)* null`, addrspace 1)
+    $vgpr4 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load 4 from `float addrspace(1)* null`, addrspace 1)
+    $vgpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load 4 from `float addrspace(1)* null`, addrspace 1)
     $sgpr0_sgpr1 = V_CMP_EQ_U32_e64 3, killed $sgpr4, implicit $exec
     $vgpr3 = V_CNDMASK_B32_e64 0, -1082130432, 0, 1065353216, killed $sgpr0_sgpr1, implicit $exec
     $vgpr5 = V_MOV_B32_e32 $vgpr0, implicit $exec, implicit $exec
@@ -23,7 +23,7 @@ body:             |
   bb.3:
     successors: %bb.1
 
-    $vgpr5 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load 4 from `float addrspace(1)* null`, addrspace 1)
+    $vgpr5 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load 4 from `float addrspace(1)* null`, addrspace 1)
 
   bb.1:
     successors: %bb.5, %bb.2
@@ -43,7 +43,7 @@ body:             |
   bb.4:
     successors: %bb.3, %bb.1
 
-    $vgpr5 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load 4 from `float addrspace(1)* null`, addrspace 1)
+    $vgpr5 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load 4 from `float addrspace(1)* null`, addrspace 1)
     $vgpr4 = V_CVT_I32_F32_e32 $vgpr5, implicit $mode, implicit $exec
     V_CMP_EQ_U32_e32 2, killed $vgpr4, implicit-def $vcc, implicit $exec
     $vcc = S_AND_B64 $exec, killed $vcc, implicit-def dead $scc

diff  --git a/llvm/test/CodeGen/AMDGPU/waitcnt-loop-irreducible.mir b/llvm/test/CodeGen/AMDGPU/waitcnt-loop-irreducible.mir
index f2f6462ad48c..ee9978abe848 100644
--- a/llvm/test/CodeGen/AMDGPU/waitcnt-loop-irreducible.mir
+++ b/llvm/test/CodeGen/AMDGPU/waitcnt-loop-irreducible.mir
@@ -78,7 +78,7 @@ body: |
 
   bb.1:
     successors: %bb.2
-    BUFFER_STORE_DWORD_OFFEN_exact killed renamable $vgpr3, renamable $vgpr2, renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    BUFFER_STORE_DWORD_OFFEN_exact killed renamable $vgpr3, renamable $vgpr2, renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
 
   bb.2:
     successors: %bb.3, %bb.6
@@ -86,7 +86,7 @@ body: |
 
   bb.3:
     successors: %bb.4, %bb.5
-    BUFFER_STORE_DWORD_OFFEN_exact killed renamable $vgpr3, killed renamable $vgpr2, killed renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    BUFFER_STORE_DWORD_OFFEN_exact killed renamable $vgpr3, killed renamable $vgpr2, killed renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     S_CBRANCH_VCCNZ %bb.5, implicit $vcc
 
   bb.4:
@@ -98,6 +98,6 @@ body: |
     successors: %bb.6
 
   bb.6:
-    FLAT_STORE_DWORD $vgpr3_vgpr4, $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    FLAT_STORE_DWORD $vgpr3_vgpr4, $vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
     S_ENDPGM 0
 ...

diff  --git a/llvm/test/CodeGen/AMDGPU/waitcnt-loop-single-basic-block.mir b/llvm/test/CodeGen/AMDGPU/waitcnt-loop-single-basic-block.mir
index 19158b6ec254..5e261bfa352c 100644
--- a/llvm/test/CodeGen/AMDGPU/waitcnt-loop-single-basic-block.mir
+++ b/llvm/test/CodeGen/AMDGPU/waitcnt-loop-single-basic-block.mir
@@ -15,11 +15,11 @@ body: |
   bb.0:
     S_BRANCH %bb.1
   bb.1:
-    GLOBAL_STORE_DWORD $vgpr7_vgpr8, $vgpr11, 0, 0, 0, 0, implicit $exec
-    $vgpr21 = GLOBAL_LOAD_DWORD $vgpr4_vgpr5, 0, 0, 0, 0, implicit $exec
-    $vgpr10 = GLOBAL_LOAD_DWORD $vgpr10_vgpr11, 0, 0, 0, 0, implicit $exec
-    GLOBAL_STORE_DWORD $vgpr14_vgpr15, $vgpr21, 0, 0, 0, 0, implicit $exec
-    $vgpr11 = GLOBAL_LOAD_DWORD $vgpr11_vgpr12, 0, 0, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD $vgpr7_vgpr8, $vgpr11, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr21 = GLOBAL_LOAD_DWORD $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr10 = GLOBAL_LOAD_DWORD $vgpr10_vgpr11, 0, 0, 0, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD $vgpr14_vgpr15, $vgpr21, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr11 = GLOBAL_LOAD_DWORD $vgpr11_vgpr12, 0, 0, 0, 0, 0, implicit $exec
     S_CBRANCH_SCC1 %bb.1, implicit $scc
   bb.2:
     S_ENDPGM 0

diff  --git a/llvm/test/CodeGen/AMDGPU/waitcnt-meta-instructions.mir b/llvm/test/CodeGen/AMDGPU/waitcnt-meta-instructions.mir
index 4905bcc06c62..63dcb67cd706 100644
--- a/llvm/test/CodeGen/AMDGPU/waitcnt-meta-instructions.mir
+++ b/llvm/test/CodeGen/AMDGPU/waitcnt-meta-instructions.mir
@@ -12,9 +12,9 @@ body:             |
     liveins: $vgpr0_vgpr1
     ; GCN-LABEL: name: waitcnt_kill
     ; GCN: S_WAITCNT 0
-    ; GCN: $vgpr0 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec
+    ; GCN: $vgpr0 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $exec
     ; GCN: KILL $vgpr0
-    $vgpr0 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec
+    $vgpr0 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $exec
     KILL $vgpr0
 ...
 
@@ -27,9 +27,9 @@ body:             |
     liveins: $vgpr0_vgpr1
     ; GCN-LABEL: name: waitcnt_implicit_def
     ; GCN: S_WAITCNT 0
-    ; GCN: $vgpr0 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec
+    ; GCN: $vgpr0 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $exec
     ; GCN: $vgpr0 = IMPLICIT_DEF
-    $vgpr0 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec
+    $vgpr0 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $exec
     $vgpr0 = IMPLICIT_DEF
 ...
 
@@ -42,9 +42,9 @@ body:             |
     liveins: $vgpr0_vgpr1, $vgpr2
     ; GCN-LABEL: name: waitcnt_eh_label
     ; GCN: S_WAITCNT 0
-    ; GCN: $vgpr0 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec
+    ; GCN: $vgpr0 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $exec
     ; GCN: EH_LABEL <mcsymbol Ltmp0>, implicit $vgpr0
-    $vgpr0 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec
+    $vgpr0 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $exec
     EH_LABEL <mcsymbol Ltmp0>, implicit $vgpr0
 
 ...
@@ -58,9 +58,9 @@ body:             |
     liveins: $vgpr0_vgpr1, $vgpr2
     ; GCN-LABEL: name: waitcnt_cfi
     ; GCN: S_WAITCNT 0
-    ; GCN: $vgpr0 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec
+    ; GCN: $vgpr0 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $exec
     ; GCN: CFI_INSTRUCTION offset $vgpr0_lo16, 16
-    $vgpr0 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec
+    $vgpr0 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $exec
     CFI_INSTRUCTION offset $vgpr0, 16
 
 ...

diff  --git a/llvm/test/CodeGen/AMDGPU/waitcnt-no-redundant.mir b/llvm/test/CodeGen/AMDGPU/waitcnt-no-redundant.mir
index 92e73f6db39d..ecaf9cd43cba 100644
--- a/llvm/test/CodeGen/AMDGPU/waitcnt-no-redundant.mir
+++ b/llvm/test/CodeGen/AMDGPU/waitcnt-no-redundant.mir
@@ -17,7 +17,7 @@ body: |
 
   bb.1:
     S_WAITCNT 3952
-    FLAT_ATOMIC_CMPSWAP undef renamable $vgpr0_vgpr1, renamable $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
+    FLAT_ATOMIC_CMPSWAP undef renamable $vgpr0_vgpr1, renamable $vgpr0_vgpr1, 0, 0, 0, implicit $exec, implicit $flat_scr
     S_WAITCNT 3952
     BUFFER_WBINVL1 implicit $exec
     S_BRANCH %bb.1

diff  --git a/llvm/test/CodeGen/AMDGPU/waitcnt-overflow.mir b/llvm/test/CodeGen/AMDGPU/waitcnt-overflow.mir
index f61056917c82..0fcdfc6c2da6 100644
--- a/llvm/test/CodeGen/AMDGPU/waitcnt-overflow.mir
+++ b/llvm/test/CodeGen/AMDGPU/waitcnt-overflow.mir
@@ -1,6 +1,6 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s | FileCheck -check-prefixes=GFX9,GFX9_10 %s
-# RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s | FileCheck -check-prefixes=GFX10,GFX9_10 %s
+# RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s | FileCheck -check-prefixes=GFX9 %s
+# RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s | FileCheck -check-prefixes=GFX10 %s
 
 --- |
     define amdgpu_kernel void @max-counter-lgkmcnt() #0 { ret void }
@@ -21,25 +21,60 @@ body:             |
 
     ; GFX9-LABEL: name: max-counter-lgkmcnt
     ; GFX9: S_WAITCNT 0
+    ; GFX9: $vgpr0_vgpr1 = DS_READ2_B32_gfx9 renamable $vgpr99, 0, 1, 0, implicit $exec
+    ; GFX9: $vgpr2_vgpr3 = DS_READ2_B32_gfx9 renamable $vgpr99, 2, 3, 0, implicit $exec
+    ; GFX9: $vgpr4_vgpr5 = DS_READ2_B32_gfx9 renamable $vgpr99, 4, 5, 0, implicit $exec
+    ; GFX9: $vgpr6_vgpr7 = DS_READ2_B32_gfx9 renamable $vgpr99, 6, 7, 0, implicit $exec
+    ; GFX9: $vgpr8_vgpr9 = DS_READ2_B32_gfx9 renamable $vgpr99, 8, 9, 0, implicit $exec
+    ; GFX9: $vgpr10_vgpr11 = DS_READ2_B32_gfx9 renamable $vgpr99, 10, 11, 0, implicit $exec
+    ; GFX9: $vgpr12_vgpr13 = DS_READ2_B32_gfx9 renamable $vgpr99, 12, 13, 0, implicit $exec
+    ; GFX9: $vgpr14_vgpr15 = DS_READ2_B32_gfx9 renamable $vgpr99, 14, 15, 0, implicit $exec
+    ; GFX9: $vgpr16_vgpr17 = DS_READ2_B32_gfx9 renamable $vgpr99, 16, 17, 0, implicit $exec
+    ; GFX9: $vgpr18_vgpr19 = DS_READ2_B32_gfx9 renamable $vgpr99, 18, 19, 0, implicit $exec
+    ; GFX9: $vgpr20_vgpr21 = DS_READ2_B32_gfx9 renamable $vgpr99, 20, 21, 0, implicit $exec
+    ; GFX9: $vgpr22_vgpr23 = DS_READ2_B32_gfx9 renamable $vgpr99, 22, 23, 0, implicit $exec
+    ; GFX9: $vgpr24_vgpr25 = DS_READ2_B32_gfx9 renamable $vgpr99, 24, 25, 0, implicit $exec
+    ; GFX9: $vgpr26_vgpr27 = DS_READ2_B32_gfx9 renamable $vgpr99, 26, 27, 0, implicit $exec
+    ; GFX9: $vgpr28_vgpr29 = DS_READ2_B32_gfx9 renamable $vgpr99, 28, 29, 0, implicit $exec
+    ; GFX9: $vgpr30_vgpr31 = DS_READ2_B32_gfx9 renamable $vgpr99, 30, 31, 0, implicit $exec
+    ; GFX9: $vgpr32_vgpr33 = DS_READ2_B32_gfx9 renamable $vgpr99, 32, 33, 0, implicit $exec
     ; GFX9: $vgpr34_vgpr35 = DS_READ2_B32_gfx9 renamable $vgpr99, 34, 35, 0, implicit $exec
-    ; GFX9-NOT: S_WAITCNT 53119
-    ; GFX9-NEXT: S_WAITCNT 52863
-    ; GFX9-NEXT: $vgpr0 = V_MAC_F32_e32 0, $vgpr1, $vgpr0, implicit $mode, implicit $exec
-    ; GFX9-NEXT: $vgpr2 = V_MAC_F32_e32 0, $vgpr3, $vgpr2, implicit $mode, implicit $exec
-    ; GFX9-NEXT: $vgpr4 = V_MAC_F32_e32 0, $vgpr5, $vgpr4, implicit $mode, implicit $exec
-    ; GFX9-NEXT: $vgpr6 = V_MAC_F32_e32 0, $vgpr7, $vgpr6, implicit $mode, implicit $exec
-    ; GFX9-NEXT: S_ENDPGM 0
+    ; GFX9: S_WAITCNT 52863
+    ; GFX9: $vgpr0 = V_MAC_F32_e32 0, $vgpr1, $vgpr0, implicit $mode, implicit $exec
+    ; GFX9: $vgpr2 = V_MAC_F32_e32 0, $vgpr3, $vgpr2, implicit $mode, implicit $exec
+    ; GFX9: $vgpr4 = V_MAC_F32_e32 0, $vgpr5, $vgpr4, implicit $mode, implicit $exec
+    ; GFX9: $vgpr6 = V_MAC_F32_e32 0, $vgpr7, $vgpr6, implicit $mode, implicit $exec
+    ; GFX9: S_ENDPGM 0
     ; GFX10-LABEL: name: max-counter-lgkmcnt
+    ; GFX10: S_WAITCNT 0
+    ; GFX10: S_WAITCNT_VSCNT undef $sgpr_null, 0
+    ; GFX10: $vgpr0_vgpr1 = DS_READ2_B32_gfx9 renamable $vgpr99, 0, 1, 0, implicit $exec
+    ; GFX10: $vgpr2_vgpr3 = DS_READ2_B32_gfx9 renamable $vgpr99, 2, 3, 0, implicit $exec
+    ; GFX10: $vgpr4_vgpr5 = DS_READ2_B32_gfx9 renamable $vgpr99, 4, 5, 0, implicit $exec
+    ; GFX10: $vgpr6_vgpr7 = DS_READ2_B32_gfx9 renamable $vgpr99, 6, 7, 0, implicit $exec
+    ; GFX10: $vgpr8_vgpr9 = DS_READ2_B32_gfx9 renamable $vgpr99, 8, 9, 0, implicit $exec
+    ; GFX10: $vgpr10_vgpr11 = DS_READ2_B32_gfx9 renamable $vgpr99, 10, 11, 0, implicit $exec
+    ; GFX10: $vgpr12_vgpr13 = DS_READ2_B32_gfx9 renamable $vgpr99, 12, 13, 0, implicit $exec
+    ; GFX10: $vgpr14_vgpr15 = DS_READ2_B32_gfx9 renamable $vgpr99, 14, 15, 0, implicit $exec
+    ; GFX10: $vgpr16_vgpr17 = DS_READ2_B32_gfx9 renamable $vgpr99, 16, 17, 0, implicit $exec
+    ; GFX10: $vgpr18_vgpr19 = DS_READ2_B32_gfx9 renamable $vgpr99, 18, 19, 0, implicit $exec
+    ; GFX10: $vgpr20_vgpr21 = DS_READ2_B32_gfx9 renamable $vgpr99, 20, 21, 0, implicit $exec
+    ; GFX10: $vgpr22_vgpr23 = DS_READ2_B32_gfx9 renamable $vgpr99, 22, 23, 0, implicit $exec
+    ; GFX10: $vgpr24_vgpr25 = DS_READ2_B32_gfx9 renamable $vgpr99, 24, 25, 0, implicit $exec
+    ; GFX10: $vgpr26_vgpr27 = DS_READ2_B32_gfx9 renamable $vgpr99, 26, 27, 0, implicit $exec
+    ; GFX10: $vgpr28_vgpr29 = DS_READ2_B32_gfx9 renamable $vgpr99, 28, 29, 0, implicit $exec
+    ; GFX10: $vgpr30_vgpr31 = DS_READ2_B32_gfx9 renamable $vgpr99, 30, 31, 0, implicit $exec
+    ; GFX10: $vgpr32_vgpr33 = DS_READ2_B32_gfx9 renamable $vgpr99, 32, 33, 0, implicit $exec
     ; GFX10: $vgpr34_vgpr35 = DS_READ2_B32_gfx9 renamable $vgpr99, 34, 35, 0, implicit $exec
-    ; GFX10-NEXT: S_WAITCNT 53631
-    ; GFX10-NEXT: $vgpr0 = V_MAC_F32_e32 0, $vgpr1, $vgpr0, implicit $mode, implicit $exec
-    ; GFX10-NEXT: S_WAITCNT 53375
-    ; GFX10-NEXT: $vgpr2 = V_MAC_F32_e32 0, $vgpr3, $vgpr2, implicit $mode, implicit $exec
-    ; GFX10-NEXT: S_WAITCNT 53119
-    ; GFX10-NEXT: $vgpr4 = V_MAC_F32_e32 0, $vgpr5, $vgpr4, implicit $mode, implicit $exec
-    ; GFX10-NEXT: S_WAITCNT 52863
-    ; GFX10-NEXT: $vgpr6 = V_MAC_F32_e32 0, $vgpr7, $vgpr6, implicit $mode, implicit $exec
-    ; GFX10-NEXT: S_ENDPGM 0
+    ; GFX10: S_WAITCNT 53631
+    ; GFX10: $vgpr0 = V_MAC_F32_e32 0, $vgpr1, $vgpr0, implicit $mode, implicit $exec
+    ; GFX10: S_WAITCNT 53375
+    ; GFX10: $vgpr2 = V_MAC_F32_e32 0, $vgpr3, $vgpr2, implicit $mode, implicit $exec
+    ; GFX10: S_WAITCNT 53119
+    ; GFX10: $vgpr4 = V_MAC_F32_e32 0, $vgpr5, $vgpr4, implicit $mode, implicit $exec
+    ; GFX10: S_WAITCNT 52863
+    ; GFX10: $vgpr6 = V_MAC_F32_e32 0, $vgpr7, $vgpr6, implicit $mode, implicit $exec
+    ; GFX10: S_ENDPGM 0
     $vgpr0_vgpr1 = DS_READ2_B32_gfx9 renamable $vgpr99, 0, 1, 0, implicit $exec
     $vgpr2_vgpr3 = DS_READ2_B32_gfx9 renamable $vgpr99, 2, 3, 0, implicit $exec
     $vgpr4_vgpr5 = DS_READ2_B32_gfx9 renamable $vgpr99, 4, 5, 0, implicit $exec
@@ -72,84 +107,224 @@ body:             |
   bb.0:
     liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4
 
-    ; GFX9_10-LABEL: name: max-counter-vmcnt
-    ; GFX9_10: $vgpr66 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 264, 0, 0, 0, 0, 0, implicit $exec
-    ; GFX9-NOT: S_WAITCNT 53119
-    ; GFX10-NOT: S_WAITCNT 65407
-    ; GFX9-NEXT: S_WAITCNT 53118
-    ; GFX10-NEXT: S_WAITCNT 65406
-    ; GFX9_10-NEXT: $vgpr0 = V_MAC_F32_e32 0, $vgpr1, $vgpr0, implicit $mode, implicit $exec
-    ; GFX9_10-NEXT: $vgpr1 = V_MAC_F32_e32 0, $vgpr2, $vgpr1, implicit $mode, implicit $exec
-    ; GFX9_10-NEXT: $vgpr2 = V_MAC_F32_e32 0, $vgpr3, $vgpr2, implicit $mode, implicit $exec
-    ; GFX9_10-NEXT: $vgpr3 = V_MAC_F32_e32 0, $vgpr4, $vgpr3, implicit $mode, implicit $exec
-    ; GFX9_10-NEXT: S_ENDPGM 0
-    $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 8, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 12, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr4 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 16, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr5 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 20, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr6 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 24, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr7 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 28, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr8 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 32, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr9 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 36, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr10 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 40, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr11 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 44, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr12 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 48, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr13 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 52, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr14 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 56, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr15 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 60, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr16 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 64, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr17 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 68, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr18 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 72, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr19 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 76, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr20 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 80, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr21 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 84, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr22 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 88, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr23 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 92, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr24 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 96, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr25 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 100, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr26 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 104, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr27 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 108, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr28 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 112, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr29 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 116, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr30 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 120, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr31 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 124, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr32 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 128, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr33 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 132, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr34 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 136, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr35 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 140, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr36 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 144, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr37 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 148, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr38 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 152, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr39 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 156, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr40 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 160, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr41 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 164, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr42 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 168, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr43 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 172, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr44 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 176, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr45 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 180, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr46 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 184, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr47 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 188, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr48 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 192, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr49 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 196, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr50 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 200, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr51 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 204, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr52 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 208, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr53 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 212, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr54 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 216, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr55 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 220, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr56 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 224, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr57 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 228, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr58 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 232, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr59 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 236, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr60 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 240, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr61 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 244, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr62 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 248, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr63 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 252, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr64 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 256, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr65 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 260, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr66 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 264, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX9-LABEL: name: max-counter-vmcnt
+    ; GFX9: S_WAITCNT 0
+    ; GFX9: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX9: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX9: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 8, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX9: $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 12, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX9: $vgpr4 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 16, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX9: $vgpr5 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 20, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX9: $vgpr6 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 24, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX9: $vgpr7 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 28, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX9: $vgpr8 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 32, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX9: $vgpr9 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 36, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX9: $vgpr10 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 40, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX9: $vgpr11 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 44, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX9: $vgpr12 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 48, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX9: $vgpr13 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 52, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX9: $vgpr14 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 56, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX9: $vgpr15 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 60, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX9: $vgpr16 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 64, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX9: $vgpr17 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 68, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX9: $vgpr18 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 72, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX9: $vgpr19 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 76, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX9: $vgpr20 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 80, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX9: $vgpr21 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 84, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX9: $vgpr22 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 88, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX9: $vgpr23 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 92, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX9: $vgpr24 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 96, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX9: $vgpr25 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 100, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX9: $vgpr26 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 104, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX9: $vgpr27 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 108, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX9: $vgpr28 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 112, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX9: $vgpr29 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 116, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX9: $vgpr30 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 120, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX9: $vgpr31 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 124, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX9: $vgpr32 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 128, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX9: $vgpr33 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 132, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX9: $vgpr34 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 136, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX9: $vgpr35 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 140, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX9: $vgpr36 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 144, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX9: $vgpr37 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 148, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX9: $vgpr38 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 152, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX9: $vgpr39 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 156, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX9: $vgpr40 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 160, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX9: $vgpr41 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 164, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX9: $vgpr42 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 168, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX9: $vgpr43 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 172, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX9: $vgpr44 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 176, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX9: $vgpr45 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 180, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX9: $vgpr46 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 184, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX9: $vgpr47 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 188, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX9: $vgpr48 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 192, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX9: $vgpr49 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 196, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX9: $vgpr50 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 200, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX9: $vgpr51 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 204, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX9: $vgpr52 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 208, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX9: $vgpr53 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 212, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX9: $vgpr54 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 216, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX9: $vgpr55 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 220, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX9: $vgpr56 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 224, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX9: $vgpr57 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 228, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX9: $vgpr58 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 232, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX9: $vgpr59 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 236, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX9: $vgpr60 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 240, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX9: $vgpr61 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 244, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX9: $vgpr62 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 248, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX9: $vgpr63 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 252, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX9: $vgpr64 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 256, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX9: $vgpr65 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 260, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX9: $vgpr66 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 264, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX9: S_WAITCNT 53118
+    ; GFX9: $vgpr0 = V_MAC_F32_e32 0, $vgpr1, $vgpr0, implicit $mode, implicit $exec
+    ; GFX9: $vgpr1 = V_MAC_F32_e32 0, $vgpr2, $vgpr1, implicit $mode, implicit $exec
+    ; GFX9: $vgpr2 = V_MAC_F32_e32 0, $vgpr3, $vgpr2, implicit $mode, implicit $exec
+    ; GFX9: $vgpr3 = V_MAC_F32_e32 0, $vgpr4, $vgpr3, implicit $mode, implicit $exec
+    ; GFX9: S_ENDPGM 0
+    ; GFX10-LABEL: name: max-counter-vmcnt
+    ; GFX10: S_WAITCNT 0
+    ; GFX10: S_WAITCNT_VSCNT undef $sgpr_null, 0
+    ; GFX10: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX10: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX10: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 8, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX10: $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 12, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX10: $vgpr4 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 16, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX10: $vgpr5 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 20, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX10: $vgpr6 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 24, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX10: $vgpr7 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 28, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX10: $vgpr8 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 32, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX10: $vgpr9 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 36, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX10: $vgpr10 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 40, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX10: $vgpr11 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 44, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX10: $vgpr12 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 48, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX10: $vgpr13 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 52, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX10: $vgpr14 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 56, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX10: $vgpr15 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 60, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX10: $vgpr16 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 64, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX10: $vgpr17 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 68, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX10: $vgpr18 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 72, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX10: $vgpr19 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 76, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX10: $vgpr20 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 80, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX10: $vgpr21 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 84, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX10: $vgpr22 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 88, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX10: $vgpr23 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 92, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX10: $vgpr24 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 96, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX10: $vgpr25 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 100, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX10: $vgpr26 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 104, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX10: $vgpr27 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 108, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX10: $vgpr28 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 112, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX10: $vgpr29 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 116, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX10: $vgpr30 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 120, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX10: $vgpr31 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 124, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX10: $vgpr32 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 128, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX10: $vgpr33 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 132, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX10: $vgpr34 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 136, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX10: $vgpr35 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 140, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX10: $vgpr36 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 144, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX10: $vgpr37 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 148, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX10: $vgpr38 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 152, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX10: $vgpr39 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 156, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX10: $vgpr40 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 160, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX10: $vgpr41 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 164, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX10: $vgpr42 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 168, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX10: $vgpr43 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 172, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX10: $vgpr44 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 176, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX10: $vgpr45 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 180, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX10: $vgpr46 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 184, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX10: $vgpr47 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 188, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX10: $vgpr48 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 192, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX10: $vgpr49 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 196, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX10: $vgpr50 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 200, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX10: $vgpr51 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 204, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX10: $vgpr52 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 208, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX10: $vgpr53 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 212, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX10: $vgpr54 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 216, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX10: $vgpr55 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 220, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX10: $vgpr56 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 224, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX10: $vgpr57 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 228, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX10: $vgpr58 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 232, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX10: $vgpr59 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 236, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX10: $vgpr60 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 240, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX10: $vgpr61 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 244, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX10: $vgpr62 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 248, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX10: $vgpr63 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 252, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX10: $vgpr64 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 256, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX10: $vgpr65 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 260, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX10: $vgpr66 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 264, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX10: S_WAITCNT 65406
+    ; GFX10: $vgpr0 = V_MAC_F32_e32 0, $vgpr1, $vgpr0, implicit $mode, implicit $exec
+    ; GFX10: $vgpr1 = V_MAC_F32_e32 0, $vgpr2, $vgpr1, implicit $mode, implicit $exec
+    ; GFX10: $vgpr2 = V_MAC_F32_e32 0, $vgpr3, $vgpr2, implicit $mode, implicit $exec
+    ; GFX10: $vgpr3 = V_MAC_F32_e32 0, $vgpr4, $vgpr3, implicit $mode, implicit $exec
+    ; GFX10: S_ENDPGM 0
+    $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 8, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 12, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr4 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 16, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr5 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 20, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr6 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 24, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr7 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 28, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr8 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 32, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr9 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 36, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr10 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 40, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr11 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 44, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr12 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 48, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr13 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 52, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr14 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 56, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr15 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 60, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr16 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 64, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr17 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 68, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr18 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 72, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr19 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 76, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr20 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 80, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr21 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 84, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr22 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 88, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr23 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 92, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr24 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 96, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr25 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 100, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr26 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 104, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr27 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 108, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr28 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 112, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr29 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 116, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr30 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 120, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr31 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 124, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr32 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 128, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr33 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 132, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr34 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 136, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr35 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 140, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr36 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 144, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr37 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 148, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr38 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 152, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr39 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 156, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr40 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 160, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr41 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 164, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr42 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 168, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr43 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 172, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr44 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 176, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr45 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 180, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr46 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 184, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr47 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 188, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr48 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 192, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr49 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 196, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr50 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 200, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr51 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 204, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr52 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 208, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr53 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 212, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr54 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 216, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr55 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 220, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr56 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 224, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr57 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 228, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr58 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 232, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr59 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 236, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr60 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 240, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr61 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 244, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr62 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 248, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr63 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 252, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr64 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 256, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr65 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 260, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr66 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 264, 0, 0, 0, 0, 0, 0, implicit $exec
     $vgpr0 = V_MAC_F32_e32 0, $vgpr1, $vgpr0, implicit $mode, implicit $exec
     $vgpr1 = V_MAC_F32_e32 0, $vgpr2, $vgpr1, implicit $mode, implicit $exec
     $vgpr2 = V_MAC_F32_e32 0, $vgpr3, $vgpr2, implicit $mode, implicit $exec
@@ -164,9 +339,31 @@ body:             |
   bb.0:
     liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, $vgpr0, $vgpr1
 
-    ; GFX9_10-LABEL: name: max-counter-expcnt
-    ; GFX9_10: EXP
-    ; GFX9_10-NOT: S_WAITCNT
+    ; GFX9-LABEL: name: max-counter-expcnt
+    ; GFX9: S_WAITCNT 0
+    ; GFX9: EXP 0, $vgpr0, $vgpr0, $vgpr0, $vgpr0, -1, -1, 15, implicit $exec
+    ; GFX9: EXP 0, $vgpr1, $vgpr1, $vgpr1, $vgpr1, -1, -1, 15, implicit $exec
+    ; GFX9: EXP 0, $vgpr1, $vgpr1, $vgpr1, $vgpr1, -1, -1, 15, implicit $exec
+    ; GFX9: EXP 0, $vgpr1, $vgpr1, $vgpr1, $vgpr1, -1, -1, 15, implicit $exec
+    ; GFX9: EXP 0, $vgpr1, $vgpr1, $vgpr1, $vgpr1, -1, -1, 15, implicit $exec
+    ; GFX9: EXP 0, $vgpr1, $vgpr1, $vgpr1, $vgpr1, -1, -1, 15, implicit $exec
+    ; GFX9: EXP 0, $vgpr1, $vgpr1, $vgpr1, $vgpr1, -1, -1, 15, implicit $exec
+    ; GFX9: EXP 0, $vgpr1, $vgpr1, $vgpr1, $vgpr1, -1, -1, 15, implicit $exec
+    ; GFX9: $vgpr0 = V_MAC_F32_e32 0, $vgpr1, $vgpr0, implicit $mode, implicit $exec
+    ; GFX9: S_ENDPGM 0
+    ; GFX10-LABEL: name: max-counter-expcnt
+    ; GFX10: S_WAITCNT 0
+    ; GFX10: S_WAITCNT_VSCNT undef $sgpr_null, 0
+    ; GFX10: EXP 0, $vgpr0, $vgpr0, $vgpr0, $vgpr0, -1, -1, 15, implicit $exec
+    ; GFX10: EXP 0, $vgpr1, $vgpr1, $vgpr1, $vgpr1, -1, -1, 15, implicit $exec
+    ; GFX10: EXP 0, $vgpr1, $vgpr1, $vgpr1, $vgpr1, -1, -1, 15, implicit $exec
+    ; GFX10: EXP 0, $vgpr1, $vgpr1, $vgpr1, $vgpr1, -1, -1, 15, implicit $exec
+    ; GFX10: EXP 0, $vgpr1, $vgpr1, $vgpr1, $vgpr1, -1, -1, 15, implicit $exec
+    ; GFX10: EXP 0, $vgpr1, $vgpr1, $vgpr1, $vgpr1, -1, -1, 15, implicit $exec
+    ; GFX10: EXP 0, $vgpr1, $vgpr1, $vgpr1, $vgpr1, -1, -1, 15, implicit $exec
+    ; GFX10: EXP 0, $vgpr1, $vgpr1, $vgpr1, $vgpr1, -1, -1, 15, implicit $exec
+    ; GFX10: $vgpr0 = V_MAC_F32_e32 0, $vgpr1, $vgpr0, implicit $mode, implicit $exec
+    ; GFX10: S_ENDPGM 0
     EXP 0, $vgpr0, $vgpr0, $vgpr0, $vgpr0, -1, -1, 15, implicit $exec
     EXP 0, $vgpr1, $vgpr1, $vgpr1, $vgpr1, -1, -1, 15, implicit $exec
     EXP 0, $vgpr1, $vgpr1, $vgpr1, $vgpr1, -1, -1, 15, implicit $exec

diff  --git a/llvm/test/CodeGen/AMDGPU/waitcnt-preexisting.mir b/llvm/test/CodeGen/AMDGPU/waitcnt-preexisting.mir
index e51b6ea24b33..16f527114d7f 100644
--- a/llvm/test/CodeGen/AMDGPU/waitcnt-preexisting.mir
+++ b/llvm/test/CodeGen/AMDGPU/waitcnt-preexisting.mir
@@ -32,6 +32,6 @@ body:             |
     $vgpr0 = V_MOV_B32_e32 $vgpr1, implicit $exec, implicit $exec
     $vgpr2 = V_MOV_B32_e32 $vgpr1, implicit $exec, implicit $exec
     $vgpr3 = V_MOV_B32_e32 $vgpr1, implicit $exec, implicit $exec
-    IMAGE_STORE_V4_V2 killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $vgpr0_vgpr1, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 15, -1, 1, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16)
+    IMAGE_STORE_V4_V2 killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $vgpr0_vgpr1, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 15, -1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16)
     S_ENDPGM 0
 ...

diff  --git a/llvm/test/CodeGen/AMDGPU/waitcnt-vmem-waw.mir b/llvm/test/CodeGen/AMDGPU/waitcnt-vmem-waw.mir
index c72a4a95c87a..f56a913da359 100644
--- a/llvm/test/CodeGen/AMDGPU/waitcnt-vmem-waw.mir
+++ b/llvm/test/CodeGen/AMDGPU/waitcnt-vmem-waw.mir
@@ -11,10 +11,10 @@ body: |
     ; GFX9-LABEL: name: buffer_buffer
     ; GFX9: liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, $sgpr5
     ; GFX9: S_WAITCNT 0
-    ; GFX9: $vgpr0_vgpr1_vgpr2_vgpr3 = BUFFER_LOAD_DWORDX4_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec
-    ; GFX9: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr5, 0, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr0_vgpr1_vgpr2_vgpr3 = BUFFER_LOAD_DWORDX4_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr5, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX9: $vgpr0_vgpr1_vgpr2_vgpr3 = BUFFER_LOAD_DWORDX4_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX9: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr5, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr0_vgpr1_vgpr2_vgpr3 = BUFFER_LOAD_DWORDX4_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr5, 0, 0, 0, 0, 0, 0, 0, implicit $exec
 ...
 
 # Two tbuffer loads with overlapping outputs. No waitcnt required.
@@ -27,10 +27,10 @@ body: |
     ; GFX9-LABEL: name: tbuffer_tbuffer
     ; GFX9: liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, $sgpr5
     ; GFX9: S_WAITCNT 0
-    ; GFX9: $vgpr0_vgpr1_vgpr2 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 125, 0, 0, 0, 0, 0, implicit $exec
-    ; GFX9: $vgpr0 = TBUFFER_LOAD_FORMAT_X_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 16, 116, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr0_vgpr1_vgpr2 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 125, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr0 = TBUFFER_LOAD_FORMAT_X_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 16, 116, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX9: $vgpr0_vgpr1_vgpr2 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 125, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX9: $vgpr0 = TBUFFER_LOAD_FORMAT_X_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 16, 116, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr0_vgpr1_vgpr2 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 125, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr0 = TBUFFER_LOAD_FORMAT_X_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 16, 116, 0, 0, 0, 0, 0, 0, implicit $exec
 ...
 
 # Two gathers with overlapping outputs. (Note gathers can't be trimmed because
@@ -44,10 +44,10 @@ body: |
     ; GFX9-LABEL: name: gather_gather
     ; GFX9: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, $vgpr0_vgpr1_vgpr2, $vgpr3_vgpr4_vgpr5
     ; GFX9: S_WAITCNT 0
-    ; GFX9: $vgpr10_vgpr11_vgpr12_vgpr13 = IMAGE_GATHER4_LZ_O_V4_V3 $vgpr0_vgpr1_vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
-    ; GFX9: $vgpr13_vgpr14_vgpr15_vgpr16 = IMAGE_GATHER4_LZ_O_V4_V3 $vgpr3_vgpr4_vgpr5, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr10_vgpr11_vgpr12_vgpr13 = IMAGE_GATHER4_LZ_O_V4_V3 $vgpr0_vgpr1_vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 16)
-    $vgpr13_vgpr14_vgpr15_vgpr16 = IMAGE_GATHER4_LZ_O_V4_V3 $vgpr3_vgpr4_vgpr5, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 16)
+    ; GFX9: $vgpr10_vgpr11_vgpr12_vgpr13 = IMAGE_GATHER4_LZ_O_V4_V3 $vgpr0_vgpr1_vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 16)
+    ; GFX9: $vgpr13_vgpr14_vgpr15_vgpr16 = IMAGE_GATHER4_LZ_O_V4_V3 $vgpr3_vgpr4_vgpr5, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 16)
+    $vgpr10_vgpr11_vgpr12_vgpr13 = IMAGE_GATHER4_LZ_O_V4_V3 $vgpr0_vgpr1_vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 16)
+    $vgpr13_vgpr14_vgpr15_vgpr16 = IMAGE_GATHER4_LZ_O_V4_V3 $vgpr3_vgpr4_vgpr5, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 16)
 ...
 
 # Image load vs image sample. Waitcnt required because they are not guaranteed
@@ -62,9 +62,9 @@ body: |
     ; GFX9-LABEL: name: nosampler_sampler
     ; GFX9: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, $vgpr0_vgpr1_vgpr2_vgpr3
     ; GFX9: S_WAITCNT 0
-    ; GFX9: $vgpr4 = IMAGE_LOAD_V1_V4 $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 2, -1, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX9: $vgpr4 = IMAGE_LOAD_V1_V4 $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 2, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 16)
     ; GFX9: S_WAITCNT 3952
-    ; GFX9: $vgpr4 = IMAGE_SAMPLE_L_V1_V4 $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 8, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec
-    $vgpr4 = IMAGE_LOAD_V1_V4 $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 2, -1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 16)
-    $vgpr4 = IMAGE_SAMPLE_L_V1_V4 $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 8, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (load 16)
+    ; GFX9: $vgpr4 = IMAGE_SAMPLE_L_V1_V4 $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 8, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (load 16)
+    $vgpr4 = IMAGE_LOAD_V1_V4 $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 2, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 16)
+    $vgpr4 = IMAGE_SAMPLE_L_V1_V4 $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 8, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (load 16)
 ...

diff  --git a/llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.mir b/llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.mir
index f7110bdd8c0d..c2b3e54fbe35 100644
--- a/llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.mir
+++ b/llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.mir
@@ -12,6 +12,6 @@ body: |
     liveins: $sgpr0_sgpr1
     $sgpr4 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 4, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`)
     S_WAITCNT_VSCNT undef $sgpr_null, 0
-    $vgpr0 = GLOBAL_ATOMIC_ADD_RTN $vgpr0_vgpr1, $vgpr2, 0, 1, 0, implicit $exec :: (load store syncscope("agent") seq_cst 4, addrspace 1)
+    $vgpr0 = GLOBAL_ATOMIC_ADD_RTN $vgpr0_vgpr1, $vgpr2, 0, 1, 0, 0, implicit $exec :: (load store syncscope("agent") seq_cst 4, addrspace 1)
     S_CMP_LG_U32 killed $sgpr4, 0, implicit-def $scc
 ...

diff  --git a/llvm/test/CodeGen/AMDGPU/waitcnt.mir b/llvm/test/CodeGen/AMDGPU/waitcnt.mir
index 9c7e4ac22308..a0cc9464d5ae 100644
--- a/llvm/test/CodeGen/AMDGPU/waitcnt.mir
+++ b/llvm/test/CodeGen/AMDGPU/waitcnt.mir
@@ -87,34 +87,34 @@ name: flat_zero_waitcnt
 body: |
   bb.0:
     successors: %bb.1
-    $vgpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %ir.global4)
-    $vgpr3_vgpr4_vgpr5_vgpr6 = FLAT_LOAD_DWORDX4 $vgpr7_vgpr8, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 16 from %ir.global16)
+    $vgpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %ir.global4)
+    $vgpr3_vgpr4_vgpr5_vgpr6 = FLAT_LOAD_DWORDX4 $vgpr7_vgpr8, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 16 from %ir.global16)
     $vgpr0 = V_MOV_B32_e32 $vgpr1, implicit $exec
     S_BRANCH %bb.1
 
   bb.1:
     successors: %bb.2
-    $vgpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
-    $vgpr3_vgpr4_vgpr5_vgpr6 = FLAT_LOAD_DWORDX4 $vgpr7_vgpr8, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 16 from %ir.global16)
+    $vgpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    $vgpr3_vgpr4_vgpr5_vgpr6 = FLAT_LOAD_DWORDX4 $vgpr7_vgpr8, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 16 from %ir.global16)
     $vgpr0 = V_MOV_B32_e32 $vgpr1, implicit $exec
     S_BRANCH %bb.2
 
   bb.2:
     successors: %bb.3
-    $vgpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %ir.flat4)
-    $vgpr3_vgpr4_vgpr5_vgpr6 = FLAT_LOAD_DWORDX4 $vgpr7_vgpr8, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 16 from %ir.flat16)
+    $vgpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %ir.flat4)
+    $vgpr3_vgpr4_vgpr5_vgpr6 = FLAT_LOAD_DWORDX4 $vgpr7_vgpr8, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 16 from %ir.flat16)
     $vgpr0 = V_MOV_B32_e32 $vgpr1, implicit $exec
     S_BRANCH %bb.3
 
   bb.3:
     successors: %bb.4
-    $vgpr3 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %ir.flat4)
-    $vgpr4 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %ir.global4)
+    $vgpr3 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %ir.flat4)
+    $vgpr4 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %ir.global4)
     $vgpr0 = V_MOV_B32_e32 $vgpr3, implicit $exec
     S_BRANCH %bb.4
 
   bb.4:
-    $vgpr5 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %ir.flat4)
+    $vgpr5 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %ir.flat4)
     $vgpr0 = V_MOV_B32_e32 $vgpr5, implicit $exec
     S_ENDPGM 0
 ...
@@ -135,11 +135,11 @@ name: single_fallthrough_successor_no_end_block_wait
 body: |
   bb.0:
     successors: %bb.1
-    $vgpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    $vgpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
 
   bb.1:
     $vgpr3_vgpr4 = V_LSHLREV_B64_e64 4, $vgpr7_vgpr8, implicit $exec
-    FLAT_STORE_DWORD $vgpr3_vgpr4, $vgpr0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    FLAT_STORE_DWORD $vgpr3_vgpr4, $vgpr0, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
     S_ENDPGM 0
 ...
 ---
@@ -162,16 +162,16 @@ name: single_branch_successor_not_next_block
 body: |
   bb.0:
     successors: %bb.2
-    $vgpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    $vgpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
    S_BRANCH %bb.2
 
   bb.1:
-    FLAT_STORE_DWORD $vgpr8_vgpr9, $vgpr10, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    FLAT_STORE_DWORD $vgpr8_vgpr9, $vgpr10, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
     S_ENDPGM 0
 
   bb.2:
      $vgpr3_vgpr4 = V_LSHLREV_B64_e64 4, $vgpr7_vgpr8, implicit $exec
-    FLAT_STORE_DWORD $vgpr3_vgpr4, $vgpr0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    FLAT_STORE_DWORD $vgpr3_vgpr4, $vgpr0, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
     S_ENDPGM 0
 ...
 
@@ -186,9 +186,9 @@ machineFunctionInfo:
 body: |
   bb.0:
     liveins: $vgpr1_vgpr2
-    $vgpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    $vgpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
     S_WAITCNT 0
-    FLAT_STORE_DWORD $vgpr1_vgpr2, $vgpr0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    FLAT_STORE_DWORD $vgpr1_vgpr2, $vgpr0, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
 
 ...
 
@@ -208,12 +208,12 @@ machineFunctionInfo:
 body: |
   bb.0:
     liveins: $vgpr1_vgpr2
-    $vgpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    $vgpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
     BUNDLE {
       S_NOP 0
       S_NOP 0
     }
-    FLAT_STORE_DWORD $vgpr1_vgpr2, $vgpr0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    FLAT_STORE_DWORD $vgpr1_vgpr2, $vgpr0, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
 
 ...
 
@@ -231,12 +231,12 @@ machineFunctionInfo:
 body: |
   bb.0:
     liveins: $vgpr1_vgpr2
-    $vgpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    $vgpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
     BUNDLE {
       S_NOP 0
       S_WAITCNT 0
     }
-    FLAT_STORE_DWORD $vgpr1_vgpr2, $vgpr0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    FLAT_STORE_DWORD $vgpr1_vgpr2, $vgpr0, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
 
 ...
 
@@ -245,9 +245,9 @@ body: |
 # Def and use inside bundle
 # CHECK-LABEL: name: insert_in_bundle{{$}}
 # CHECK: BUNDLE implicit-def $vgpr0, implicit $vgpr1_vgpr2 {
-# CHECK-NEXT: $vgpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+# CHECK-NEXT: $vgpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
 # CHECK-NEXT: S_WAITCNT 112
-# CHECK-NEXT: FLAT_STORE_DWORD $vgpr1_vgpr2, internal $vgpr0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+# CHECK-NEXT: FLAT_STORE_DWORD $vgpr1_vgpr2, internal $vgpr0, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
 # CHECK-NEXT: }
 
 name: insert_in_bundle
@@ -258,8 +258,8 @@ body: |
   bb.0:
     liveins: $vgpr1_vgpr2
     BUNDLE implicit-def $vgpr0, implicit $vgpr1_vgpr2 {
-    $vgpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
-    FLAT_STORE_DWORD $vgpr1_vgpr2, internal $vgpr0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    $vgpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    FLAT_STORE_DWORD $vgpr1_vgpr2, internal $vgpr0, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
     }
 ...
 
@@ -269,10 +269,10 @@ body: |
 
 # CHECK-LABEL: name: exit_bundle{{$}}
 # CHECK: BUNDLE implicit-def $vgpr0, implicit $vgpr1_vgpr2 {
-# CHECK-NEXT: $vgpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+# CHECK-NEXT: $vgpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
 # CHECK-NEXT: }
 # CHECK-NEXT: S_WAITCNT 112
-# CHECK-NEXT: FLAT_STORE_DWORD $vgpr1_vgpr2, $vgpr0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+# CHECK-NEXT: FLAT_STORE_DWORD $vgpr1_vgpr2, $vgpr0, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
 
 name: exit_bundle
 tracksRegLiveness: true
@@ -282,10 +282,10 @@ body: |
   bb.0:
     liveins: $vgpr1_vgpr2
     BUNDLE implicit-def $vgpr0, implicit $vgpr1_vgpr2 {
-    $vgpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    $vgpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
     }
 
-    FLAT_STORE_DWORD $vgpr1_vgpr2, $vgpr0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    FLAT_STORE_DWORD $vgpr1_vgpr2, $vgpr0, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
 
 ...
 
@@ -295,11 +295,11 @@ body: |
 
 # CHECK-LABEL: name: cross_bundle{{$}}
 # CHECK: BUNDLE implicit-def $vgpr0, implicit $vgpr1_vgpr2 {
-# CHECK-NEXT: $vgpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+# CHECK-NEXT: $vgpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
 # CHECK-NEXT: }
 # CHECK-NEXT: S_WAITCNT 112
 # CHECK-NEXT: BUNDLE implicit $vgpr0, implicit $vgpr1_vgpr2 {
-# CHECK-NEXT: FLAT_STORE_DWORD $vgpr1_vgpr2, $vgpr0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+# CHECK-NEXT: FLAT_STORE_DWORD $vgpr1_vgpr2, $vgpr0, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
 # CHECK-NEXT: }
 
 name: cross_bundle
@@ -310,10 +310,10 @@ body: |
   bb.0:
     liveins: $vgpr1_vgpr2
     BUNDLE implicit-def $vgpr0, implicit $vgpr1_vgpr2 {
-    $vgpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+    $vgpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
     }
     BUNDLE implicit $vgpr0, implicit $vgpr1_vgpr2 {
-      FLAT_STORE_DWORD $vgpr1_vgpr2, $vgpr0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+      FLAT_STORE_DWORD $vgpr1_vgpr2, $vgpr0, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
     }
 ...
 
@@ -328,7 +328,7 @@ machineFunctionInfo:
 body: |
   bb.0:
     liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $vgpr4
-      $vgpr0 = FLAT_LOAD_USHORT killed $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
-      $vgpr1 = FLAT_LOAD_USHORT killed $vgpr2_vgpr3, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+      $vgpr0 = FLAT_LOAD_USHORT killed $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+      $vgpr1 = FLAT_LOAD_USHORT killed $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
       V_NOP_e32 implicit $exec, implicit $vgpr0_lo16, implicit $vgpr1_lo16
 ...

diff  --git a/llvm/test/CodeGen/AMDGPU/wqm.mir b/llvm/test/CodeGen/AMDGPU/wqm.mir
index bed8092a3dd0..effac7e9290c 100644
--- a/llvm/test/CodeGen/AMDGPU/wqm.mir
+++ b/llvm/test/CodeGen/AMDGPU/wqm.mir
@@ -73,7 +73,7 @@ body:             |
 
   bb.1:
     S_CMP_LT_I32 0, %0:sgpr_32, implicit-def $scc
-    %10:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %3:vgpr_32, %13:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    %10:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %3:vgpr_32, %13:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     %12:vgpr_32 = V_ADD_CO_U32_e32 %3:vgpr_32, %3:vgpr_32, implicit-def $vcc, implicit $exec
     %5:sgpr_32 = S_CSELECT_B32 %2:sgpr_32, %1:sgpr_32, implicit $scc
     %11:vgpr_32 = V_ADD_CO_U32_e32 %5:sgpr_32, %12:vgpr_32, implicit-def $vcc, implicit $exec
@@ -130,14 +130,14 @@ body:             |
     %6:sgpr_128 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1, %2, %subreg.sub2, %3, %subreg.sub3
     %5:sgpr_128 = COPY %6
     %7:sreg_32 = S_MOV_B32 0
-    %8:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %6, %7, 0, 0, 0, 0, 0, 0, implicit $exec
+    %8:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %6, %7, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     %16:vgpr_32 = COPY %8.sub1
     %11:vgpr_32 = COPY %16
     %10:vgpr_32 = V_SET_INACTIVE_B32 %11, undef %12:sreg_32, implicit $exec, implicit-def $scc
     %14:vgpr_32 = COPY %7
     %13:vgpr_32 = V_MOV_B32_dpp %14, killed %10, 323, 12, 15, 0, implicit $exec
     early-clobber %15:vgpr_32 = WWM killed %13, implicit $exec
-    BUFFER_STORE_DWORD_OFFSET_exact killed %15, %6, %7, 4, 0, 0, 0, 0, 0, implicit $exec
+    BUFFER_STORE_DWORD_OFFSET_exact killed %15, %6, %7, 4, 0, 0, 0, 0, 0, 0, implicit $exec
     S_ENDPGM 0
 
 ...
@@ -162,7 +162,7 @@ body:             |
     %0:sgpr_32 = COPY $sgpr0
     %4:sgpr_128 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1, %2, %subreg.sub2, %3, %subreg.sub3
     %5:sreg_32 = S_MOV_B32 0
-    %6:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %4, %5, 0, 0, 0, 0, 0, 0, implicit $exec
+    %6:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %4, %5, 0, 0, 0, 0, 0, 0, 0, implicit $exec
 
     %8:sreg_64 = COPY $exec
     %9:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
@@ -172,7 +172,7 @@ body:             |
     early-clobber %13:sreg_32 = WWM %9:vgpr_32, implicit $exec
 
     %14:vgpr_32 = COPY %13
-    BUFFER_STORE_DWORD_OFFSET_exact killed %14, %4, %5, 4, 0, 0, 0, 0, 0, implicit $exec
+    BUFFER_STORE_DWORD_OFFSET_exact killed %14, %4, %5, 4, 0, 0, 0, 0, 0, 0, implicit $exec
     S_ENDPGM 0
 
 ...
@@ -210,20 +210,20 @@ body:             |
     undef %7.sub0:vreg_64 = COPY %2:vgpr_32
     %7.sub1:vreg_64 = COPY %3:vgpr_32
 
-    %4:vreg_128 = IMAGE_SAMPLE_V4_V2 %7:vreg_64, %100:sgpr_256, %101:sgpr_128, 15, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4)
+    %4:vreg_128 = IMAGE_SAMPLE_V4_V2 %7:vreg_64, %100:sgpr_256, %101:sgpr_128, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4)
     S_CMP_EQ_U32 %8:sgpr_32, 0, implicit-def $scc
 
     undef %5.sub0:vreg_64 = nsz arcp nofpexcept V_ADD_F32_e64 0, %4.sub0:vreg_128, 0, %3:vgpr_32, 1, 0, implicit $mode, implicit $exec
     %5.sub1:vreg_64 = nsz arcp nofpexcept V_MUL_F32_e32 %2, %3, implicit $mode, implicit $exec
     %6:vgpr_32 = nsz arcp nofpexcept V_ADD_F32_e64 0, %2:vgpr_32, 0, %3:vgpr_32, 1, 0, implicit $mode, implicit $exec
 
-    %9:vreg_128 = IMAGE_SAMPLE_V4_V2 %5:vreg_64, %100:sgpr_256, %101:sgpr_128, 15, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4)
+    %9:vreg_128 = IMAGE_SAMPLE_V4_V2 %5:vreg_64, %100:sgpr_256, %101:sgpr_128, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4)
 
     S_CBRANCH_SCC0 %bb.2, implicit $scc
 
   bb.1:
     %10:sreg_32 = S_MOV_B32 0
-    BUFFER_STORE_DWORD_OFFSET_exact %6:vgpr_32, %101:sgpr_128, %10:sreg_32, 4, 0, 0, 0, 0, 0, implicit $exec
+    BUFFER_STORE_DWORD_OFFSET_exact %6:vgpr_32, %101:sgpr_128, %10:sreg_32, 4, 0, 0, 0, 0, 0, 0, implicit $exec
     S_ENDPGM 0
 
   bb.2:

diff  --git a/llvm/test/CodeGen/MIR/AMDGPU/custom-pseudo-source-values.ll b/llvm/test/CodeGen/MIR/AMDGPU/custom-pseudo-source-values.ll
index 9166ebe314f8..a0818486af94 100644
--- a/llvm/test/CodeGen/MIR/AMDGPU/custom-pseudo-source-values.ll
+++ b/llvm/test/CodeGen/MIR/AMDGPU/custom-pseudo-source-values.ll
@@ -4,7 +4,7 @@
 ; Test that custom pseudo source values can be round trip serialized through MIR.
 
 ; CHECK-LABEL: {{^}}name: shader
-; CHECK: %[[#]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET killed %17, %18, 4, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource" + 4, align 1, addrspace 4)
+; CHECK: %[[#]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET killed %17, %18, 4, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource" + 4, align 1, addrspace 4)
 ; CHECK: IMAGE_STORE_V4_V3_nsa_gfx10 killed %[[#]], %[[#]], %[[#]], %[[#]], killed %[[#]], 15, 2, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "ImageResource")
 ; CHECK: DS_GWS_BARRIER %[[#]], 63, implicit $m0, implicit $exec :: (load 4 from custom "GWSResource")
 define amdgpu_cs void @shader(i32 %arg0, i32 %arg1, <8 x i32> inreg %arg2, <4 x i32> inreg %arg3) {

diff  --git a/llvm/test/CodeGen/MIR/AMDGPU/load-store-opt-dlc.mir b/llvm/test/CodeGen/MIR/AMDGPU/load-store-opt-dlc.mir
index 8ad50b72c284..d96659c9dee1 100644
--- a/llvm/test/CodeGen/MIR/AMDGPU/load-store-opt-dlc.mir
+++ b/llvm/test/CodeGen/MIR/AMDGPU/load-store-opt-dlc.mir
@@ -32,7 +32,7 @@
   }
 ...
 
-# CHECK: BUFFER_STORE_DWORDX2_OFFSET killed %{{[0-9]+}}, %{{[0-9]+}}, 0, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 8 into %ir.out.gep.1, align 4, addrspace 1)
+# CHECK: BUFFER_STORE_DWORDX2_OFFSET killed %{{[0-9]+}}, %{{[0-9]+}}, 0, 4, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 8 into %ir.out.gep.1, align 4, addrspace 1)
 ---
 name: test1
 liveins:
@@ -56,14 +56,14 @@ body: |
     %5:vgpr_32 = COPY $vgpr0
     %6:vgpr_32 = COPY $vgpr1
 
-    BUFFER_STORE_DWORD_OFFSET %5, %4, 0, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.out.gep.1, addrspace 1)
-    BUFFER_STORE_DWORD_OFFSET %6, %4, 0, 8, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.out.gep.1, addrspace 1)
+    BUFFER_STORE_DWORD_OFFSET %5, %4, 0, 4, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.out.gep.1, addrspace 1)
+    BUFFER_STORE_DWORD_OFFSET %6, %4, 0, 8, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.out.gep.1, addrspace 1)
 
     S_ENDPGM 0
 ...
 
-# CHECK: BUFFER_STORE_DWORD_OFFSET %{{[0-9]+}}, %{{[0-9]+}}, 0, 4, 0, 0, 0, 1, 0, implicit $exec :: (store 4 into %ir.out.gep.1, addrspace 1)
-# CHECK: BUFFER_STORE_DWORD_OFFSET %{{[0-9]+}}, %{{[0-9]+}}, 0, 8, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.out.gep.1, addrspace 1)
+# CHECK: BUFFER_STORE_DWORD_OFFSET %{{[0-9]+}}, %{{[0-9]+}}, 0, 4, 0, 0, 0, 1, 0, 0, implicit $exec :: (store 4 into %ir.out.gep.1, addrspace 1)
+# CHECK: BUFFER_STORE_DWORD_OFFSET %{{[0-9]+}}, %{{[0-9]+}}, 0, 8, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.out.gep.1, addrspace 1)
 ---
 name: test2
 liveins:
@@ -87,14 +87,14 @@ body: |
     %5:vgpr_32 = COPY $vgpr0
     %6:vgpr_32 = COPY $vgpr1
 
-    BUFFER_STORE_DWORD_OFFSET %5, %4, 0, 4, 0, 0, 0, 1, 0, implicit $exec :: (store 4 into %ir.out.gep.1, addrspace 1)
-    BUFFER_STORE_DWORD_OFFSET %6, %4, 0, 8, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.out.gep.1, addrspace 1)
+    BUFFER_STORE_DWORD_OFFSET %5, %4, 0, 4, 0, 0, 0, 1, 0, 0, implicit $exec :: (store 4 into %ir.out.gep.1, addrspace 1)
+    BUFFER_STORE_DWORD_OFFSET %6, %4, 0, 8, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.out.gep.1, addrspace 1)
 
     S_ENDPGM 0
 ...
 
-# CHECK: BUFFER_STORE_DWORD_OFFSET %{{[0-9]+}}, %{{[0-9]+}}, 0, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.out.gep.1, addrspace 1)
-# CHECK: BUFFER_STORE_DWORD_OFFSET %{{[0-9]+}}, %{{[0-9]+}}, 0, 8, 0, 0, 0, 1, 0, implicit $exec :: (store 4 into %ir.out.gep.1, addrspace 1)
+# CHECK: BUFFER_STORE_DWORD_OFFSET %{{[0-9]+}}, %{{[0-9]+}}, 0, 4, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.out.gep.1, addrspace 1)
+# CHECK: BUFFER_STORE_DWORD_OFFSET %{{[0-9]+}}, %{{[0-9]+}}, 0, 8, 0, 0, 0, 1, 0, 0, implicit $exec :: (store 4 into %ir.out.gep.1, addrspace 1)
 ---
 name: test3
 liveins:
@@ -118,13 +118,13 @@ body: |
     %5:vgpr_32 = COPY $vgpr0
     %6:vgpr_32 = COPY $vgpr1
 
-    BUFFER_STORE_DWORD_OFFSET %5, %4, 0, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.out.gep.1, addrspace 1)
-    BUFFER_STORE_DWORD_OFFSET %6, %4, 0, 8, 0, 0, 0, 1, 0, implicit $exec :: (store 4 into %ir.out.gep.1, addrspace 1)
+    BUFFER_STORE_DWORD_OFFSET %5, %4, 0, 4, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.out.gep.1, addrspace 1)
+    BUFFER_STORE_DWORD_OFFSET %6, %4, 0, 8, 0, 0, 0, 1, 0, 0, implicit $exec :: (store 4 into %ir.out.gep.1, addrspace 1)
 
     S_ENDPGM 0
 ...
 
-# CHECK: BUFFER_STORE_DWORDX2_OFFSET killed %{{[0-9]+}}, %{{[0-9]+}}, 0, 4, 0, 0, 0, 1, 0, implicit $exec :: (store 8 into %ir.out.gep.1, align 4, addrspace 1)
+# CHECK: BUFFER_STORE_DWORDX2_OFFSET killed %{{[0-9]+}}, %{{[0-9]+}}, 0, 4, 0, 0, 0, 1, 0, 0, implicit $exec :: (store 8 into %ir.out.gep.1, align 4, addrspace 1)
 ---
 name: test4
 liveins:
@@ -148,8 +148,8 @@ body: |
     %5:vgpr_32 = COPY $vgpr0
     %6:vgpr_32 = COPY $vgpr1
 
-    BUFFER_STORE_DWORD_OFFSET %5, %4, 0, 4, 0, 0, 0, 1, 0, implicit $exec :: (store 4 into %ir.out.gep.1, addrspace 1)
-    BUFFER_STORE_DWORD_OFFSET %6, %4, 0, 8, 0, 0, 0, 1, 0, implicit $exec :: (store 4 into %ir.out.gep.1, addrspace 1)
+    BUFFER_STORE_DWORD_OFFSET %5, %4, 0, 4, 0, 0, 0, 1, 0, 0, implicit $exec :: (store 4 into %ir.out.gep.1, addrspace 1)
+    BUFFER_STORE_DWORD_OFFSET %6, %4, 0, 8, 0, 0, 0, 1, 0, 0, implicit $exec :: (store 4 into %ir.out.gep.1, addrspace 1)
 
     S_ENDPGM 0
 ...

diff  --git a/llvm/test/CodeGen/MIR/AMDGPU/load-store-opt-scc.mir b/llvm/test/CodeGen/MIR/AMDGPU/load-store-opt-scc.mir
new file mode 100644
index 000000000000..9a889083564a
--- /dev/null
+++ b/llvm/test/CodeGen/MIR/AMDGPU/load-store-opt-scc.mir
@@ -0,0 +1,155 @@
+# RUN: llc -march=amdgcn -mcpu=gfx90a -run-pass=si-load-store-opt -verify-machineinstrs -o - %s | FileCheck %s
+
+# The purpose of this test is to make sure we are combining relevant memory
+# operations correctly with/without SCC bit.
+
+--- |
+  define amdgpu_kernel void @test1(i32 addrspace(1)* %out) {
+    %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
+    store i32 123, i32 addrspace(1)* %out.gep.1
+    store i32 456, i32 addrspace(1)* %out
+    ret void
+  }
+
+  define amdgpu_kernel void @test2(i32 addrspace(1)* %out) {
+    %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
+    store i32 123, i32 addrspace(1)* %out.gep.1
+    store i32 456, i32 addrspace(1)* %out
+    ret void
+  }
+
+  define amdgpu_kernel void @test3(i32 addrspace(1)* %out) {
+    %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
+    store i32 123, i32 addrspace(1)* %out.gep.1
+    store i32 456, i32 addrspace(1)* %out
+    ret void
+  }
+  define amdgpu_kernel void @test4(i32 addrspace(1)* %out) {
+    %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
+    store i32 123, i32 addrspace(1)* %out.gep.1
+    store i32 456, i32 addrspace(1)* %out
+    ret void
+  }
+...
+
+# CHECK: BUFFER_STORE_DWORDX2_OFFSET killed %{{[0-9]+}}, %{{[0-9]+}}, 0, 4, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 8 into %ir.out.gep.1, align 4, addrspace 1)
+---
+name: test1
+liveins:
+  - { reg: '$sgpr0_sgpr1', virtual-reg: '' }
+body: |
+  bb.0 (%ir-block.0):
+    liveins: $sgpr0_sgpr1
+
+    $vgpr0 = V_MOV_B32_e32 123, implicit $exec
+    $vgpr1 = V_MOV_B32_e32 456, implicit $exec
+
+    $sgpr2 = S_MOV_B32 -1
+    $sgpr3 = S_MOV_B32 61440
+
+    %0:sgpr_64 = COPY $sgpr0_sgpr1
+    %1:sgpr_64 = S_LOAD_DWORDX2_IMM %1, 36, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, addrspace 4)
+    %2:sgpr_32 = COPY $sgpr2
+    %3:sgpr_32 = COPY $sgpr3
+    %4:sgpr_128 = REG_SEQUENCE %1, %2, %3
+
+    %5:vgpr_32 = COPY $vgpr0
+    %6:vgpr_32 = COPY $vgpr1
+
+    BUFFER_STORE_DWORD_OFFSET %5, %4, 0, 4, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.out.gep.1, addrspace 1)
+    BUFFER_STORE_DWORD_OFFSET %6, %4, 0, 8, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.out.gep.1, addrspace 1)
+
+    S_ENDPGM 0
+...
+
+# CHECK: BUFFER_STORE_DWORD_OFFSET %{{[0-9]+}}, %{{[0-9]+}}, 0, 4, 0, 0, 0, 0, 0, 1, implicit $exec :: (store 4 into %ir.out.gep.1, addrspace 1)
+# CHECK: BUFFER_STORE_DWORD_OFFSET %{{[0-9]+}}, %{{[0-9]+}}, 0, 8, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.out.gep.1, addrspace 1)
+---
+name: test2
+liveins:
+  - { reg: '$sgpr0_sgpr1', virtual-reg: '' }
+body: |
+  bb.0 (%ir-block.0):
+    liveins: $sgpr0_sgpr1
+
+    $vgpr0 = V_MOV_B32_e32 123, implicit $exec
+    $vgpr1 = V_MOV_B32_e32 456, implicit $exec
+
+    $sgpr2 = S_MOV_B32 -1
+    $sgpr3 = S_MOV_B32 61440
+
+    %0:sgpr_64 = COPY $sgpr0_sgpr1
+    %1:sgpr_64 = S_LOAD_DWORDX2_IMM %1, 36, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, addrspace 4)
+    %2:sgpr_32 = COPY $sgpr2
+    %3:sgpr_32 = COPY $sgpr3
+    %4:sgpr_128 = REG_SEQUENCE %1, %2, %3
+
+    %5:vgpr_32 = COPY $vgpr0
+    %6:vgpr_32 = COPY $vgpr1
+
+    BUFFER_STORE_DWORD_OFFSET %5, %4, 0, 4, 0, 0, 0, 0, 0, 1, implicit $exec :: (store 4 into %ir.out.gep.1, addrspace 1)
+    BUFFER_STORE_DWORD_OFFSET %6, %4, 0, 8, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.out.gep.1, addrspace 1)
+
+    S_ENDPGM 0
+...
+
+# CHECK: BUFFER_STORE_DWORD_OFFSET %{{[0-9]+}}, %{{[0-9]+}}, 0, 4, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.out.gep.1, addrspace 1)
+# CHECK: BUFFER_STORE_DWORD_OFFSET %{{[0-9]+}}, %{{[0-9]+}}, 0, 8, 0, 0, 0, 0, 0, 1, implicit $exec :: (store 4 into %ir.out.gep.1, addrspace 1)
+---
+name: test3
+liveins:
+  - { reg: '$sgpr0_sgpr1', virtual-reg: '' }
+body: |
+  bb.0 (%ir-block.0):
+    liveins: $sgpr0_sgpr1
+
+    $vgpr0 = V_MOV_B32_e32 123, implicit $exec
+    $vgpr1 = V_MOV_B32_e32 456, implicit $exec
+
+    $sgpr2 = S_MOV_B32 -1
+    $sgpr3 = S_MOV_B32 61440
+
+    %0:sgpr_64 = COPY $sgpr0_sgpr1
+    %1:sgpr_64 = S_LOAD_DWORDX2_IMM %1, 36, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, addrspace 4)
+    %2:sgpr_32 = COPY $sgpr2
+    %3:sgpr_32 = COPY $sgpr3
+    %4:sgpr_128 = REG_SEQUENCE %1, %2, %3
+
+    %5:vgpr_32 = COPY $vgpr0
+    %6:vgpr_32 = COPY $vgpr1
+
+    BUFFER_STORE_DWORD_OFFSET %5, %4, 0, 4, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.out.gep.1, addrspace 1)
+    BUFFER_STORE_DWORD_OFFSET %6, %4, 0, 8, 0, 0, 0, 0, 0, 1, implicit $exec :: (store 4 into %ir.out.gep.1, addrspace 1)
+
+    S_ENDPGM 0
+...
+
+# CHECK: BUFFER_STORE_DWORDX2_OFFSET killed %{{[0-9]+}}, %{{[0-9]+}}, 0, 4, 0, 0, 0, 0, 0, 1, implicit $exec :: (store 8 into %ir.out.gep.1, align 4, addrspace 1)
+---
+name: test4
+liveins:
+  - { reg: '$sgpr0_sgpr1', virtual-reg: '' }
+body: |
+  bb.0 (%ir-block.0):
+    liveins: $sgpr0_sgpr1
+
+    $vgpr0 = V_MOV_B32_e32 123, implicit $exec
+    $vgpr1 = V_MOV_B32_e32 456, implicit $exec
+
+    $sgpr2 = S_MOV_B32 -1
+    $sgpr3 = S_MOV_B32 61440
+
+    %0:sgpr_64 = COPY $sgpr0_sgpr1
+    %1:sgpr_64 = S_LOAD_DWORDX2_IMM %1, 36, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, addrspace 4)
+    %2:sgpr_32 = COPY $sgpr2
+    %3:sgpr_32 = COPY $sgpr3
+    %4:sgpr_128 = REG_SEQUENCE %1, %2, %3
+
+    %5:vgpr_32 = COPY $vgpr0
+    %6:vgpr_32 = COPY $vgpr1
+
+    BUFFER_STORE_DWORD_OFFSET %5, %4, 0, 4, 0, 0, 0, 0, 0, 1, implicit $exec :: (store 4 into %ir.out.gep.1, addrspace 1)
+    BUFFER_STORE_DWORD_OFFSET %6, %4, 0, 8, 0, 0, 0, 0, 0, 1, implicit $exec :: (store 4 into %ir.out.gep.1, addrspace 1)
+
+    S_ENDPGM 0
+...

diff  --git a/llvm/test/CodeGen/MIR/AMDGPU/mir-canon-multi.mir b/llvm/test/CodeGen/MIR/AMDGPU/mir-canon-multi.mir
index 6bde24eb3319..24193e93751d 100644
--- a/llvm/test/CodeGen/MIR/AMDGPU/mir-canon-multi.mir
+++ b/llvm/test/CodeGen/MIR/AMDGPU/mir-canon-multi.mir
@@ -19,7 +19,7 @@ body:             |
     ; CHECK: %bb0_{{[0-9]+}}__2:vgpr_32 = COPY %bb0_{{[0-9]+}}__1
     ; CHECK: %bb0_{{[0-9]+}}__1:vreg_64 = REG_SEQUENCE %bb0_{{[0-9]+}}__1, %subreg.sub0, %bb0_{{[0-9]+}}__1, %subreg.sub1
     ; CHECK: %bb0_{{[0-9]+}}__1:sgpr_128 = REG_SEQUENCE %bb0_{{[0-9]+}}__1, %subreg.sub0, %bb0_{{[0-9]+}}__1, %subreg.sub1, %bb0_{{[0-9]+}}__1, %subreg.sub2, %bb0_{{[0-9]+}}__2, %subreg.sub3
-    ; CHECK: BUFFER_STORE_DWORD_ADDR64 %bb0_{{[0-9]+}}__1, %bb0_{{[0-9]+}}__1, %bb0_{{[0-9]+}}__1, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; CHECK: BUFFER_STORE_DWORD_ADDR64 %bb0_{{[0-9]+}}__1, %bb0_{{[0-9]+}}__1, %bb0_{{[0-9]+}}__1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     ; CHECK: S_ENDPGM 0
     %10:sreg_32_xm0 = S_MOV_B32 61440
     %11:sreg_32_xm0 = S_MOV_B32 0
@@ -35,7 +35,7 @@ body:             |
     %vreg123_3:vgpr_32 = COPY %5
     %16:sgpr_128 = REG_SEQUENCE killed %vreg123_0, %subreg.sub0, %vreg123_1, %subreg.sub1, %vreg123_2, %subreg.sub2, %vreg123_3, %subreg.sub3
 
-    BUFFER_STORE_DWORD_ADDR64 %vreg123_1, %27, killed %16, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    BUFFER_STORE_DWORD_ADDR64 %vreg123_1, %27, killed %16, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     S_ENDPGM 0
 
 ...

diff  --git a/llvm/test/CodeGen/MIR/AMDGPU/parse-order-reserved-regs.mir b/llvm/test/CodeGen/MIR/AMDGPU/parse-order-reserved-regs.mir
index 80230a59928d..8a34e84d2d1a 100644
--- a/llvm/test/CodeGen/MIR/AMDGPU/parse-order-reserved-regs.mir
+++ b/llvm/test/CodeGen/MIR/AMDGPU/parse-order-reserved-regs.mir
@@ -12,7 +12,7 @@
 # CHECK: isEntryFunction: true
 # CHECK: scratchRSrcReg:  '$sgpr0_sgpr1_sgpr2_sgpr3'
 # CHECK: frameOffsetReg:  '$sgpr50'
-# CHECK: renamable $vgpr0 = BUFFER_LOAD_DWORD_OFFEN %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr50, 4, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5)
+# CHECK: renamable $vgpr0 = BUFFER_LOAD_DWORD_OFFEN %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr50, 4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5)
 name: reserve_correct_register
 tracksRegLiveness: true
 machineFunctionInfo:
@@ -24,6 +24,6 @@ stack:
 
 body:             |
   bb.0:
-    renamable $vgpr0 = BUFFER_LOAD_DWORD_OFFEN %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr50, 4, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5)
+    renamable $vgpr0 = BUFFER_LOAD_DWORD_OFFEN %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr50, 4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5)
     S_ENDPGM 0
 ...

diff  --git a/llvm/test/CodeGen/MIR/AMDGPU/syncscopes.mir b/llvm/test/CodeGen/MIR/AMDGPU/syncscopes.mir
index 44b45ec31b1e..47ecb1552789 100644
--- a/llvm/test/CodeGen/MIR/AMDGPU/syncscopes.mir
+++ b/llvm/test/CodeGen/MIR/AMDGPU/syncscopes.mir
@@ -42,9 +42,9 @@
   !0 = !{i32 1}
 
 # GCN-LABEL: name: syncscopes
-# GCN: FLAT_STORE_DWORD killed $vgpr0_vgpr1, killed $vgpr2, 0, -1, 0, 0, implicit $exec, implicit $flat_scr :: (volatile non-temporal store syncscope("agent") seq_cst 4 into %ir.agent_out, addrspace 4)
-# GCN: FLAT_STORE_DWORD killed $vgpr0_vgpr1, killed $vgpr2, 0, -1, 0, 0, implicit $exec, implicit $flat_scr :: (volatile non-temporal store syncscope("workgroup") seq_cst 4 into %ir.workgroup_out, addrspace 4)
-# GCN: FLAT_STORE_DWORD killed $vgpr0_vgpr1, killed $vgpr2, 0, -1, 0, 0, implicit $exec, implicit $flat_scr :: (volatile non-temporal store syncscope("wavefront") seq_cst 4 into %ir.wavefront_out, addrspace 4)
+# GCN: FLAT_STORE_DWORD killed $vgpr0_vgpr1, killed $vgpr2, 0, -1, 0, 0, 0, implicit $exec, implicit $flat_scr :: (volatile non-temporal store syncscope("agent") seq_cst 4 into %ir.agent_out, addrspace 4)
+# GCN: FLAT_STORE_DWORD killed $vgpr0_vgpr1, killed $vgpr2, 0, -1, 0, 0, 0, implicit $exec, implicit $flat_scr :: (volatile non-temporal store syncscope("workgroup") seq_cst 4 into %ir.workgroup_out, addrspace 4)
+# GCN: FLAT_STORE_DWORD killed $vgpr0_vgpr1, killed $vgpr2, 0, -1, 0, 0, 0, implicit $exec, implicit $flat_scr :: (volatile non-temporal store syncscope("wavefront") seq_cst 4 into %ir.wavefront_out, addrspace 4)
 ...
 ---
 name:            syncscopes
@@ -84,17 +84,17 @@ body:             |
     $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed $sgpr4_sgpr5, 40, 0, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(4)* undef`)
     $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit killed $sgpr0_sgpr1, implicit $sgpr0_sgpr1, implicit $exec
     $vgpr2 = V_MOV_B32_e32 killed $sgpr6, implicit $exec, implicit $exec
-    FLAT_STORE_DWORD killed $vgpr0_vgpr1, killed $vgpr2, 0, -1, 0, 0, implicit $exec, implicit $flat_scr :: (volatile non-temporal store syncscope("agent") seq_cst 4 into %ir.agent_out)
+    FLAT_STORE_DWORD killed $vgpr0_vgpr1, killed $vgpr2, 0, -1, 0, 0, 0, implicit $exec, implicit $flat_scr :: (volatile non-temporal store syncscope("agent") seq_cst 4 into %ir.agent_out)
     S_WAITCNT 112
     $vgpr0 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr2_sgpr3
     $vgpr1 = V_MOV_B32_e32 killed $sgpr3, implicit $exec, implicit killed $sgpr2_sgpr3, implicit $sgpr2_sgpr3, implicit $exec
     $vgpr2 = V_MOV_B32_e32 killed $sgpr7, implicit $exec, implicit $exec
-    FLAT_STORE_DWORD killed $vgpr0_vgpr1, killed $vgpr2, 0, -1, 0, 0, implicit $exec, implicit $flat_scr :: (volatile non-temporal store syncscope("workgroup") seq_cst 4 into %ir.workgroup_out)
+    FLAT_STORE_DWORD killed $vgpr0_vgpr1, killed $vgpr2, 0, -1, 0, 0, 0, implicit $exec, implicit $flat_scr :: (volatile non-temporal store syncscope("workgroup") seq_cst 4 into %ir.workgroup_out)
     S_WAITCNT 112
     $vgpr0 = V_MOV_B32_e32 $sgpr4, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr4_sgpr5
     $vgpr1 = V_MOV_B32_e32 killed $sgpr5, implicit $exec, implicit killed $sgpr4_sgpr5, implicit $sgpr4_sgpr5, implicit $exec
     $vgpr2 = V_MOV_B32_e32 killed $sgpr8, implicit $exec, implicit $exec
-    FLAT_STORE_DWORD killed $vgpr0_vgpr1, killed $vgpr2, 0, -1, 0, 0, implicit $exec, implicit $flat_scr :: (volatile non-temporal store syncscope("wavefront") seq_cst 4 into %ir.wavefront_out)
+    FLAT_STORE_DWORD killed $vgpr0_vgpr1, killed $vgpr2, 0, -1, 0, 0, 0, implicit $exec, implicit $flat_scr :: (volatile non-temporal store syncscope("wavefront") seq_cst 4 into %ir.wavefront_out)
     S_ENDPGM 0
 
 ...

diff  --git a/llvm/test/CodeGen/MIR/AMDGPU/target-index-operands.mir b/llvm/test/CodeGen/MIR/AMDGPU/target-index-operands.mir
index 1864f1cbec2f..f6a608ec9b31 100644
--- a/llvm/test/CodeGen/MIR/AMDGPU/target-index-operands.mir
+++ b/llvm/test/CodeGen/MIR/AMDGPU/target-index-operands.mir
@@ -52,7 +52,7 @@ body: |
     $sgpr7 = S_MOV_B32 61440
     $sgpr6 = S_MOV_B32 -1
     $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec
-    BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     S_ENDPGM 0
 ...
 ---
@@ -82,6 +82,6 @@ body: |
     $sgpr7 = S_MOV_B32 61440
     $sgpr6 = S_MOV_B32 -1
     $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec
-    BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     S_ENDPGM 0
 ...

diff  --git a/llvm/test/MC/AMDGPU/atomic-fadd-insts.s b/llvm/test/MC/AMDGPU/atomic-fadd-insts.s
index 70014c6fafc4..3b60a0ed2d08 100644
--- a/llvm/test/MC/AMDGPU/atomic-fadd-insts.s
+++ b/llvm/test/MC/AMDGPU/atomic-fadd-insts.s
@@ -41,7 +41,7 @@ buffer_atomic_add_f32 v5, off, s[8:11], s3 offset:7
 // GFX908: encoding: [0x07,0x00,0x34,0xe1,0x00,0x05,0x02,0x03]
 
 buffer_atomic_add_f32 v5, off, s[8:11], s3 offset:4095 glc
-// GFX908-ERR: error: invalid operand for instruction
+// GFX908-ERR: error: operands are not valid for this GPU or mode
 
 buffer_atomic_add_f32 v5, off, s[8:11], s3 offset:4095 slc
 // GFX908: encoding: [0xff,0x0f,0x36,0xe1,0x00,0x05,0x02,0x03]
@@ -86,7 +86,7 @@ buffer_atomic_pk_add_f16 v5, off, s[8:11], s3 offset:7
 // GFX908: encoding: [0x07,0x00,0x38,0xe1,0x00,0x05,0x02,0x03]
 
 buffer_atomic_pk_add_f16 v5, off, s[8:11], s3 offset:4095 glc
-// GFX908-ERR: error: invalid operand for instruction
+// GFX908-ERR: error: operands are not valid for this GPU or mode
 
 buffer_atomic_pk_add_f16 v5, off, s[8:11], s3 offset:4095 slc
 // GFX908: encoding: [0xff,0x0f,0x3a,0xe1,0x00,0x05,0x02,0x03]

diff  --git a/llvm/test/MC/AMDGPU/dpp64.s b/llvm/test/MC/AMDGPU/dpp64.s
new file mode 100644
index 000000000000..c037bca8b16d
--- /dev/null
+++ b/llvm/test/MC/AMDGPU/dpp64.s
@@ -0,0 +1,58 @@
+// RUN: llvm-mc -arch=amdgcn -mcpu=gfx90a -show-encoding %s | FileCheck %s --check-prefix=GFX90A
+// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx900 %s 2>&1 | FileCheck %s --check-prefix=GFX900 --implicit-check-not=error:
+
+// GFX90A: v_ceil_f64_dpp v[0:1], v[2:3]  row_newbcast:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x30,0x00,0x7e,0x02,0x51,0x01,0xff]
+// GFX900: error: not a valid operand.
+v_ceil_f64 v[0:1], v[2:3] row_newbcast:1
+
+// GFX90A: v_fmac_f64_dpp v[0:1], v[2:3], v[4:5]  row_newbcast:2 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x08,0x00,0x08,0x02,0x52,0x01,0xff]
+// GFX900: error: instruction not supported on this GPU
+v_fmac_f64 v[0:1], v[2:3], v[4:5] row_newbcast:2
+
+// GFX90A: v_cvt_f32_f64_dpp v5, v[0:1]  row_newbcast:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x1e,0x0a,0x7e,0x00,0x5f,0x01,0xff]
+// GFX900: error: not a valid operand.
+v_cvt_f32_f64 v5, v[0:1] row_newbcast:15
+
+// GFX90A: v_cvt_i32_f64_dpp v5, v[0:1]  row_newbcast:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x06,0x0a,0x7e,0x00,0x51,0x01,0xff]
+// GFX900: error: not a valid operand.
+v_cvt_i32_f64 v5, v[0:1] row_newbcast:1
+
+// GFX90A: v_cvt_u32_f64_dpp v5, v[0:1]  row_newbcast:2 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x2a,0x0a,0x7e,0x00,0x52,0x01,0xff]
+// GFX900: error: not a valid operand.
+v_cvt_u32_f64 v5, v[0:1] row_newbcast:2
+
+// GFX90A: v_floor_f64_dpp v[4:5], v[0:1]  row_newbcast:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x34,0x08,0x7e,0x00,0x5f,0x01,0xff]
+// GFX900: error: not a valid operand.
+v_floor_f64 v[4:5], v[0:1] row_newbcast:15
+
+// GFX90A: v_fract_f64_dpp v[4:5], v[0:1]  row_newbcast:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x64,0x08,0x7e,0x00,0x51,0x01,0xff]
+// GFX900: error: not a valid operand.
+v_fract_f64 v[4:5], v[0:1] row_newbcast:1
+
+// GFX90A: v_frexp_exp_i32_f64_dpp v5, v[0:1]  row_newbcast:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x60,0x0a,0x7e,0x00,0x51,0x01,0xff]
+// GFX900: error: not a valid operand.
+v_frexp_exp_i32_f64 v5, v[0:1] row_newbcast:1
+
+// GFX90A: v_frexp_mant_f64_dpp v[4:5], v[0:1]  row_newbcast:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x62,0x08,0x7e,0x00,0x51,0x01,0xff]
+// GFX900: error: not a valid operand.
+v_frexp_mant_f64 v[4:5], v[0:1] row_newbcast:1
+
+// GFX90A: v_rcp_f64_dpp v[4:5], v[0:1]  row_newbcast:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x4a,0x08,0x7e,0x00,0x51,0x01,0xff]
+// GFX900: error: not a valid operand.
+v_rcp_f64 v[4:5], v[0:1] row_newbcast:1
+
+// GFX90A: v_rndne_f64_dpp v[4:5], v[0:1]  row_newbcast:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x32,0x08,0x7e,0x00,0x51,0x01,0xff]
+// GFX900: error: not a valid operand.
+v_rndne_f64 v[4:5], v[0:1] row_newbcast:1
+
+// GFX90A: v_rsq_f64_dpp v[4:5], v[0:1]  row_newbcast:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x4c,0x08,0x7e,0x00,0x51,0x01,0xff]
+// GFX900: error: not a valid operand.
+v_rsq_f64 v[4:5], v[0:1] row_newbcast:1
+
+// GFX90A: v_sqrt_f64_dpp v[4:5], v[0:1]  row_newbcast:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x50,0x08,0x7e,0x00,0x51,0x01,0xff]
+// GFX900: error: not a valid operand.
+v_sqrt_f64 v[4:5], v[0:1] row_newbcast:1
+
+// GFX90A: v_trunc_f64_dpp v[4:5], v[0:1]  row_newbcast:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x2e,0x08,0x7e,0x00,0x51,0x01,0xff]
+// GFX900: error: not a valid operand.
+v_trunc_f64 v[4:5], v[0:1] row_newbcast:1

diff  --git a/llvm/test/MC/AMDGPU/gfx90a_asm_features.s b/llvm/test/MC/AMDGPU/gfx90a_asm_features.s
new file mode 100644
index 000000000000..fecf23b084de
--- /dev/null
+++ b/llvm/test/MC/AMDGPU/gfx90a_asm_features.s
@@ -0,0 +1,1030 @@
+// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx908 %s 2>&1 | FileCheck --check-prefixes=NOT-GFX908,NOT-GFX90A --implicit-check-not=error: %s
+// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx1010 %s 2>&1 | FileCheck --check-prefixes=NOT-GFX1010,NOT-GFX90A --implicit-check-not=error: %s
+// RUN: llvm-mc -arch=amdgcn -mcpu=gfx90a -show-encoding %s | FileCheck --check-prefix=GFX90A %s
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: v_pk_fma_f32 v[8:9], v[0:1], s[0:1], v[4:5] ; encoding: [0x08,0x40,0xb0,0xd3,0x00,0x01,0x10,0x1c]
+v_pk_fma_f32 v[8:9], v[0:1], s[0:1], v[4:5]
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: v_pk_fma_f32 v[8:9], v[0:1], s[0:1], v[4:5] ; encoding: [0x08,0x40,0xb0,0xd3,0x00,0x01,0x10,0x1c]
+v_pk_fma_f32 v[8:9], v[0:1], s[0:1], v[4:5]
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: v_pk_fma_f32 v[8:9], v[0:1], s[0:1], v[4:5] ; encoding: [0x08,0x40,0xb0,0xd3,0x00,0x01,0x10,0x1c]
+v_pk_fma_f32 v[8:9], v[0:1], s[0:1], v[4:5]
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: v_pk_fma_f32 v[8:9], v[0:1], s[0:1], v[4:5] ; encoding: [0x08,0x40,0xb0,0xd3,0x00,0x01,0x10,0x1c]
+v_pk_fma_f32 v[8:9], v[0:1], s[0:1], v[4:5]
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: v_pk_fma_f32 v[8:9], v[0:1], s[0:1], v[4:5] op_sel_hi:[0,0,0] ; encoding: [0x08,0x00,0xb0,0xd3,0x00,0x01,0x10,0x04]
+v_pk_fma_f32 v[8:9], v[0:1], s[0:1], v[4:5] op_sel_hi:[0,0,0]
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: v_pk_fma_f32 v[8:9], v[0:1], s[0:1], v[4:5] op_sel:[0,0,1] op_sel_hi:[0,0,1] ; encoding: [0x08,0x60,0xb0,0xd3,0x00,0x01,0x10,0x04]
+v_pk_fma_f32 v[8:9], v[0:1], s[0:1], v[4:5] op_sel:[0,0,1] op_sel_hi:[0,0,1]
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: v_pk_fma_f32 v[8:9], v[0:1], s[0:1], v[4:5] neg_lo:[1,1,1] ; encoding: [0x08,0x40,0xb0,0xd3,0x00,0x01,0x10,0xfc]
+v_pk_fma_f32 v[8:9], v[0:1], s[0:1], v[4:5] neg_lo:[1,1,1]
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: v_pk_fma_f32 v[8:9], v[0:1], s[0:1], v[4:5] neg_hi:[1,1,1] ; encoding: [0x08,0x47,0xb0,0xd3,0x00,0x01,0x10,0x1c]
+v_pk_fma_f32 v[8:9], v[0:1], s[0:1], v[4:5] neg_hi:[1,1,1]
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: v_pk_fma_f32 v[8:9], v[0:1], s[0:1], v[4:5] neg_lo:[1,1,1] neg_hi:[1,1,1] ; encoding: [0x08,0x47,0xb0,0xd3,0x00,0x01,0x10,0xfc]
+v_pk_fma_f32 v[8:9], v[0:1], s[0:1], v[4:5] neg_lo:[1,1,1] neg_hi:[1,1,1]
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: v_pk_fma_f32 v[8:9], v[0:1], s[0:1], v[4:5] neg_lo:[1,0,0] ; encoding: [0x08,0x40,0xb0,0xd3,0x00,0x01,0x10,0x3c]
+v_pk_fma_f32 v[8:9], v[0:1], s[0:1], v[4:5] neg_lo:[1,0,0]
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: v_pk_fma_f32 v[8:9], v[0:1], s[0:1], v[4:5] neg_lo:[0,1,0] ; encoding: [0x08,0x40,0xb0,0xd3,0x00,0x01,0x10,0x5c]
+v_pk_fma_f32 v[8:9], v[0:1], s[0:1], v[4:5] neg_lo:[0,1,0]
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: v_pk_fma_f32 v[8:9], v[0:1], s[0:1], v[4:5] neg_lo:[0,0,1] ; encoding: [0x08,0x40,0xb0,0xd3,0x00,0x01,0x10,0x9c]
+v_pk_fma_f32 v[8:9], v[0:1], s[0:1], v[4:5] neg_lo:[0,0,1]
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: v_pk_fma_f32 v[8:9], v[0:1], s[0:1], v[4:5] neg_hi:[1,0,0] ; encoding: [0x08,0x41,0xb0,0xd3,0x00,0x01,0x10,0x1c]
+v_pk_fma_f32 v[8:9], v[0:1], s[0:1], v[4:5] neg_hi:[1,0,0]
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: v_pk_fma_f32 v[8:9], v[0:1], s[0:1], v[4:5] neg_hi:[0,1,0] ; encoding: [0x08,0x42,0xb0,0xd3,0x00,0x01,0x10,0x1c]
+v_pk_fma_f32 v[8:9], v[0:1], s[0:1], v[4:5] neg_hi:[0,1,0]
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: v_pk_fma_f32 v[8:9], v[0:1], s[0:1], v[4:5] neg_hi:[0,0,1] ; encoding: [0x08,0x44,0xb0,0xd3,0x00,0x01,0x10,0x1c]
+v_pk_fma_f32 v[8:9], v[0:1], s[0:1], v[4:5] neg_hi:[0,0,1]
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: v_pk_fma_f32 v[8:9], v[0:1], s[0:1], v[4:5] clamp ; encoding: [0x08,0xc0,0xb0,0xd3,0x00,0x01,0x10,0x1c]
+v_pk_fma_f32 v[8:9], v[0:1], s[0:1], v[4:5] clamp
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: v_pk_fma_f32 v[0:1], v[4:5], v[8:9], v[16:17] ; encoding: [0x00,0x40,0xb0,0xd3,0x04,0x11,0x42,0x1c]
+v_pk_fma_f32 v[0:1], v[4:5], v[8:9], v[16:17]
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: v_pk_mul_f32 v[254:255], v[8:9], v[16:17] ; encoding: [0xfe,0x00,0xb1,0xd3,0x08,0x21,0x02,0x18]
+v_pk_mul_f32 v[254:255], v[8:9], v[16:17]
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: v_pk_mul_f32 v[4:5], v[254:255], v[16:17] ; encoding: [0x04,0x00,0xb1,0xd3,0xfe,0x21,0x02,0x18]
+v_pk_mul_f32 v[4:5], v[254:255], v[16:17]
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: v_pk_mul_f32 v[4:5], s[2:3], v[16:17] ; encoding: [0x04,0x00,0xb1,0xd3,0x02,0x20,0x02,0x18]
+v_pk_mul_f32 v[4:5], s[2:3], v[16:17]
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: v_pk_mul_f32 v[4:5], s[100:101], v[16:17] ; encoding: [0x04,0x00,0xb1,0xd3,0x64,0x20,0x02,0x18]
+v_pk_mul_f32 v[4:5], s[100:101], v[16:17]
+
+// GFX90A: v_pk_mul_f32 v[4:5], flat_scratch, v[16:17] ; encoding: [0x04,0x00,0xb1,0xd3,0x66,0x20,0x02,0x18]
+// NOT-GFX1010: error: instruction not supported on this GPU
+// NOT-GFX908: error: instruction not supported on this GPU
+v_pk_mul_f32 v[4:5], flat_scratch, v[16:17]
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: v_pk_mul_f32 v[4:5], vcc, v[16:17] ; encoding: [0x04,0x00,0xb1,0xd3,0x6a,0x20,0x02,0x18]
+v_pk_mul_f32 v[4:5], vcc, v[16:17]
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: v_pk_mul_f32 v[4:5], exec, v[16:17] ; encoding: [0x04,0x00,0xb1,0xd3,0x7e,0x20,0x02,0x18]
+v_pk_mul_f32 v[4:5], exec, v[16:17]
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: v_pk_mul_f32 v[4:5], v[8:9], v[254:255] ; encoding: [0x04,0x00,0xb1,0xd3,0x08,0xfd,0x03,0x18]
+v_pk_mul_f32 v[4:5], v[8:9], v[254:255]
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: v_pk_mul_f32 v[4:5], v[8:9], s[2:3] ; encoding: [0x04,0x00,0xb1,0xd3,0x08,0x05,0x00,0x18]
+v_pk_mul_f32 v[4:5], v[8:9], s[2:3]
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: v_pk_mul_f32 v[4:5], v[8:9], s[100:101] ; encoding: [0x04,0x00,0xb1,0xd3,0x08,0xc9,0x00,0x18]
+v_pk_mul_f32 v[4:5], v[8:9], s[100:101]
+
+// GFX90A: v_pk_mul_f32 v[4:5], v[8:9], flat_scratch ; encoding: [0x04,0x00,0xb1,0xd3,0x08,0xcd,0x00,0x18]
+// NOT-GFX1010: error: instruction not supported on this GPU
+// NOT-GFX908: error: instruction not supported on this GPU
+v_pk_mul_f32 v[4:5], v[8:9], flat_scratch
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: v_pk_mul_f32 v[4:5], v[8:9], vcc ; encoding: [0x04,0x00,0xb1,0xd3,0x08,0xd5,0x00,0x18]
+v_pk_mul_f32 v[4:5], v[8:9], vcc
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: v_pk_mul_f32 v[4:5], v[8:9], exec ; encoding: [0x04,0x00,0xb1,0xd3,0x08,0xfd,0x00,0x18]
+v_pk_mul_f32 v[4:5], v[8:9], exec
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: v_pk_mul_f32 v[4:5], v[8:9], v[16:17] ; encoding: [0x04,0x00,0xb1,0xd3,0x08,0x21,0x02,0x18]
+v_pk_mul_f32 v[4:5], v[8:9], v[16:17]
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: v_pk_mul_f32 v[4:5], v[8:9], v[16:17] op_sel:[1,0] ; encoding: [0x04,0x08,0xb1,0xd3,0x08,0x21,0x02,0x18]
+v_pk_mul_f32 v[4:5], v[8:9], v[16:17] op_sel:[1,0]
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: v_pk_mul_f32 v[4:5], v[8:9], v[16:17] op_sel:[0,1] ; encoding: [0x04,0x10,0xb1,0xd3,0x08,0x21,0x02,0x18]
+v_pk_mul_f32 v[4:5], v[8:9], v[16:17] op_sel:[0,1]
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: v_pk_mul_f32 v[4:5], v[8:9], v[16:17] op_sel:[1,1] ; encoding: [0x04,0x18,0xb1,0xd3,0x08,0x21,0x02,0x18]
+v_pk_mul_f32 v[4:5], v[8:9], v[16:17] op_sel:[1,1]
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: v_pk_mul_f32 v[4:5], v[8:9], v[16:17] ; encoding: [0x04,0x00,0xb1,0xd3,0x08,0x21,0x02,0x18]
+v_pk_mul_f32 v[4:5], v[8:9], v[16:17]
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: v_pk_mul_f32 v[4:5], v[8:9], v[16:17] op_sel_hi:[0,0] ; encoding: [0x04,0x00,0xb1,0xd3,0x08,0x21,0x02,0x00]
+v_pk_mul_f32 v[4:5], v[8:9], v[16:17] op_sel_hi:[0,0]
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: v_pk_mul_f32 v[4:5], v[8:9], v[16:17] op_sel_hi:[1,0] ; encoding: [0x04,0x00,0xb1,0xd3,0x08,0x21,0x02,0x08]
+v_pk_mul_f32 v[4:5], v[8:9], v[16:17] op_sel_hi:[1,0]
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: v_pk_mul_f32 v[4:5], v[8:9], v[16:17] op_sel_hi:[0,1] ; encoding: [0x04,0x00,0xb1,0xd3,0x08,0x21,0x02,0x10]
+v_pk_mul_f32 v[4:5], v[8:9], v[16:17] op_sel_hi:[0,1]
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: v_pk_mul_f32 v[4:5], v[8:9], v[16:17] neg_lo:[1,0] ; encoding: [0x04,0x00,0xb1,0xd3,0x08,0x21,0x02,0x38]
+v_pk_mul_f32 v[4:5], v[8:9], v[16:17] neg_lo:[1,0]
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: v_pk_mul_f32 v[4:5], v[8:9], v[16:17] neg_lo:[0,1] ; encoding: [0x04,0x00,0xb1,0xd3,0x08,0x21,0x02,0x58]
+v_pk_mul_f32 v[4:5], v[8:9], v[16:17] neg_lo:[0,1]
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: v_pk_mul_f32 v[4:5], v[8:9], v[16:17] neg_lo:[1,1] ; encoding: [0x04,0x00,0xb1,0xd3,0x08,0x21,0x02,0x78]
+v_pk_mul_f32 v[4:5], v[8:9], v[16:17] neg_lo:[1,1]
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: v_pk_mul_f32 v[4:5], v[8:9], v[16:17] neg_hi:[1,0] ; encoding: [0x04,0x01,0xb1,0xd3,0x08,0x21,0x02,0x18]
+v_pk_mul_f32 v[4:5], v[8:9], v[16:17] neg_hi:[1,0]
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: v_pk_mul_f32 v[4:5], v[8:9], v[16:17] neg_hi:[0,1] ; encoding: [0x04,0x02,0xb1,0xd3,0x08,0x21,0x02,0x18]
+v_pk_mul_f32 v[4:5], v[8:9], v[16:17] neg_hi:[0,1]
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: v_pk_mul_f32 v[4:5], v[8:9], v[16:17] neg_hi:[1,1] ; encoding: [0x04,0x03,0xb1,0xd3,0x08,0x21,0x02,0x18]
+v_pk_mul_f32 v[4:5], v[8:9], v[16:17] neg_hi:[1,1]
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: v_pk_mul_f32 v[4:5], v[8:9], v[16:17] clamp ; encoding: [0x04,0x80,0xb1,0xd3,0x08,0x21,0x02,0x18]
+v_pk_mul_f32 v[4:5], v[8:9], v[16:17] clamp
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: v_pk_add_f32 v[254:255], v[8:9], v[16:17] ; encoding: [0xfe,0x00,0xb2,0xd3,0x08,0x21,0x02,0x18]
+v_pk_add_f32 v[254:255], v[8:9], v[16:17]
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: v_pk_add_f32 v[4:5], v[254:255], v[16:17] ; encoding: [0x04,0x00,0xb2,0xd3,0xfe,0x21,0x02,0x18]
+v_pk_add_f32 v[4:5], v[254:255], v[16:17]
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: v_pk_add_f32 v[4:5], s[2:3], v[16:17] ; encoding: [0x04,0x00,0xb2,0xd3,0x02,0x20,0x02,0x18]
+v_pk_add_f32 v[4:5], s[2:3], v[16:17]
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: v_pk_add_f32 v[4:5], s[100:101], v[16:17] ; encoding: [0x04,0x00,0xb2,0xd3,0x64,0x20,0x02,0x18]
+v_pk_add_f32 v[4:5], s[100:101], v[16:17]
+
+// GFX90A: v_pk_add_f32 v[4:5], flat_scratch, v[16:17] ; encoding: [0x04,0x00,0xb2,0xd3,0x66,0x20,0x02,0x18]
+// NOT-GFX1010: error: instruction not supported on this GPU
+// NOT-GFX908: error: instruction not supported on this GPU
+v_pk_add_f32 v[4:5], flat_scratch, v[16:17]
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: v_pk_add_f32 v[4:5], vcc, v[16:17] ; encoding: [0x04,0x00,0xb2,0xd3,0x6a,0x20,0x02,0x18]
+v_pk_add_f32 v[4:5], vcc, v[16:17]
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: v_pk_add_f32 v[4:5], exec, v[16:17] ; encoding: [0x04,0x00,0xb2,0xd3,0x7e,0x20,0x02,0x18]
+v_pk_add_f32 v[4:5], exec, v[16:17]
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: v_pk_add_f32 v[4:5], v[8:9], v[254:255] ; encoding: [0x04,0x00,0xb2,0xd3,0x08,0xfd,0x03,0x18]
+v_pk_add_f32 v[4:5], v[8:9], v[254:255]
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: v_pk_add_f32 v[4:5], v[8:9], s[2:3] ; encoding: [0x04,0x00,0xb2,0xd3,0x08,0x05,0x00,0x18]
+v_pk_add_f32 v[4:5], v[8:9], s[2:3]
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: v_pk_add_f32 v[4:5], v[8:9], s[100:101] ; encoding: [0x04,0x00,0xb2,0xd3,0x08,0xc9,0x00,0x18]
+v_pk_add_f32 v[4:5], v[8:9], s[100:101]
+
+// GFX90A: v_pk_add_f32 v[4:5], v[8:9], flat_scratch ; encoding: [0x04,0x00,0xb2,0xd3,0x08,0xcd,0x00,0x18]
+// NOT-GFX1010: error: instruction not supported on this GPU
+// NOT-GFX908: error: instruction not supported on this GPU
+v_pk_add_f32 v[4:5], v[8:9], flat_scratch
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: v_pk_add_f32 v[4:5], v[8:9], vcc ; encoding: [0x04,0x00,0xb2,0xd3,0x08,0xd5,0x00,0x18]
+v_pk_add_f32 v[4:5], v[8:9], vcc
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: v_pk_add_f32 v[4:5], v[8:9], exec ; encoding: [0x04,0x00,0xb2,0xd3,0x08,0xfd,0x00,0x18]
+v_pk_add_f32 v[4:5], v[8:9], exec
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: v_pk_add_f32 v[4:5], v[8:9], v[16:17] ; encoding: [0x04,0x00,0xb2,0xd3,0x08,0x21,0x02,0x18]
+v_pk_add_f32 v[4:5], v[8:9], v[16:17]
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: v_pk_add_f32 v[4:5], v[8:9], v[16:17] op_sel:[1,0] ; encoding: [0x04,0x08,0xb2,0xd3,0x08,0x21,0x02,0x18]
+v_pk_add_f32 v[4:5], v[8:9], v[16:17] op_sel:[1,0]
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: v_pk_add_f32 v[4:5], v[8:9], v[16:17] op_sel:[0,1] ; encoding: [0x04,0x10,0xb2,0xd3,0x08,0x21,0x02,0x18]
+v_pk_add_f32 v[4:5], v[8:9], v[16:17] op_sel:[0,1]
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: v_pk_add_f32 v[4:5], v[8:9], v[16:17] op_sel:[1,1] ; encoding: [0x04,0x18,0xb2,0xd3,0x08,0x21,0x02,0x18]
+v_pk_add_f32 v[4:5], v[8:9], v[16:17] op_sel:[1,1]
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: v_pk_add_f32 v[4:5], v[8:9], v[16:17] ; encoding: [0x04,0x00,0xb2,0xd3,0x08,0x21,0x02,0x18]
+v_pk_add_f32 v[4:5], v[8:9], v[16:17]
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: v_pk_add_f32 v[4:5], v[8:9], v[16:17] op_sel_hi:[0,0] ; encoding: [0x04,0x00,0xb2,0xd3,0x08,0x21,0x02,0x00]
+v_pk_add_f32 v[4:5], v[8:9], v[16:17] op_sel_hi:[0,0]
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: v_pk_add_f32 v[4:5], v[8:9], v[16:17] op_sel_hi:[1,0] ; encoding: [0x04,0x00,0xb2,0xd3,0x08,0x21,0x02,0x08]
+v_pk_add_f32 v[4:5], v[8:9], v[16:17] op_sel_hi:[1,0]
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: v_pk_add_f32 v[4:5], v[8:9], v[16:17] op_sel_hi:[0,1] ; encoding: [0x04,0x00,0xb2,0xd3,0x08,0x21,0x02,0x10]
+v_pk_add_f32 v[4:5], v[8:9], v[16:17] op_sel_hi:[0,1]
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: v_pk_add_f32 v[4:5], v[8:9], v[16:17] neg_lo:[1,0] ; encoding: [0x04,0x00,0xb2,0xd3,0x08,0x21,0x02,0x38]
+v_pk_add_f32 v[4:5], v[8:9], v[16:17] neg_lo:[1,0]
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: v_pk_add_f32 v[4:5], v[8:9], v[16:17] neg_lo:[0,1] ; encoding: [0x04,0x00,0xb2,0xd3,0x08,0x21,0x02,0x58]
+v_pk_add_f32 v[4:5], v[8:9], v[16:17] neg_lo:[0,1]
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: v_pk_add_f32 v[4:5], v[8:9], v[16:17] neg_lo:[1,1] ; encoding: [0x04,0x00,0xb2,0xd3,0x08,0x21,0x02,0x78]
+v_pk_add_f32 v[4:5], v[8:9], v[16:17] neg_lo:[1,1]
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: v_pk_add_f32 v[4:5], v[8:9], v[16:17] neg_hi:[1,0] ; encoding: [0x04,0x01,0xb2,0xd3,0x08,0x21,0x02,0x18]
+v_pk_add_f32 v[4:5], v[8:9], v[16:17] neg_hi:[1,0]
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: v_pk_add_f32 v[4:5], v[8:9], v[16:17] neg_hi:[0,1] ; encoding: [0x04,0x02,0xb2,0xd3,0x08,0x21,0x02,0x18]
+v_pk_add_f32 v[4:5], v[8:9], v[16:17] neg_hi:[0,1]
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: v_pk_add_f32 v[4:5], v[8:9], v[16:17] neg_hi:[1,1] ; encoding: [0x04,0x03,0xb2,0xd3,0x08,0x21,0x02,0x18]
+v_pk_add_f32 v[4:5], v[8:9], v[16:17] neg_hi:[1,1]
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: v_pk_add_f32 v[4:5], v[8:9], v[16:17] clamp ; encoding: [0x04,0x80,0xb2,0xd3,0x08,0x21,0x02,0x18]
+v_pk_add_f32 v[4:5], v[8:9], v[16:17] clamp
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: v_pk_mov_b32 v[0:1], v[2:3], v[4:5] ; encoding: [0x00,0x00,0xb3,0xd3,0x02,0x09,0x02,0x18]
+v_pk_mov_b32 v[0:1], v[2:3], v[4:5]
+
+// GFX90A: v_pk_mov_b32 v[0:1], flat_scratch, v[4:5] ; encoding: [0x00,0x00,0xb3,0xd3,0x66,0x08,0x02,0x18]
+// NOT-GFX1010: error: instruction not supported on this GPU
+// NOT-GFX908: error: instruction not supported on this GPU
+v_pk_mov_b32 v[0:1], flat_scratch, v[4:5]
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: v_pk_mov_b32 v[0:1], v[2:3], vcc ; encoding: [0x00,0x00,0xb3,0xd3,0x02,0xd5,0x00,0x18]
+v_pk_mov_b32 v[0:1], v[2:3], vcc
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: v_pk_mov_b32 v[0:1], v[2:3], s[0:1] ; encoding: [0x00,0x00,0xb3,0xd3,0x02,0x01,0x00,0x18]
+v_pk_mov_b32 v[0:1], v[2:3], s[0:1]
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel_hi:[0,1] ; encoding: [0x00,0x00,0xb3,0xd3,0x02,0x05,0x02,0x10]
+v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel_hi:[0,1]
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: v_pk_mov_b32 v[0:1], v[2:3], v[4:5] op_sel:[1,0] ; encoding: [0x00,0x08,0xb3,0xd3,0x02,0x09,0x02,0x18]
+v_pk_mov_b32 v[0:1], v[2:3], v[4:5] op_sel:[1,0]
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: v_pk_mov_b32 v[0:1], v[2:3], v[4:5] op_sel:[1,1] ; encoding: [0x00,0x18,0xb3,0xd3,0x02,0x09,0x02,0x18]
+v_pk_mov_b32 v[0:1], v[2:3], v[4:5] op_sel:[1,1]
+
+// GFX90A: tbuffer_load_format_xyzw v[4:7], off, s[0:3], 0 scc ; encoding: [0x00,0x80,0x09,0xe8,0x00,0x04,0x20,0x80]
+// NOT-GFX1010: error: not a valid operand.
+// NOT-GFX908: error: failed parsing operand.
+tbuffer_load_format_xyzw v[4:7], off, s[0:3], dfmt:1, nfmt:0, 0 scc
+
+// GFX90A: tbuffer_load_format_xyzw v[4:7], off, s[0:3], 0 glc scc ; encoding: [0x00,0xc0,0x09,0xe8,0x00,0x04,0x20,0x80]
+// NOT-GFX1010: error: not a valid operand.
+// NOT-GFX908: error: failed parsing operand.
+tbuffer_load_format_xyzw v[4:7], off, s[0:3], dfmt:1, nfmt:0, 0 glc scc
+
+// NOT-GFX90A: error: failed parsing operand
+// GFX90A: buffer_load_dword v5, off, s[8:11], s3 offset:4095 scc ; encoding: [0xff,0x8f,0x50,0xe0,0x00,0x05,0x02,0x03]
+buffer_load_dword v5, off, s[8:11], s3 offset:4095 scc
+
+// NOT-GFX90A: error: failed parsing operand
+// GFX90A: buffer_load_dword v5, off, s[8:11], s3 offset:4095 glc scc ; encoding: [0xff,0xcf,0x50,0xe0,0x00,0x05,0x02,0x03]
+buffer_load_dword v5, off, s[8:11], s3 offset:4095 glc scc
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: buffer_wbl2 ; encoding: [0x00,0x00,0xa0,0xe0,0x00,0x00,0x00,0x00]
+buffer_wbl2
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: buffer_invl2 ; encoding: [0x00,0x00,0xa4,0xe0,0x00,0x00,0x00,0x00]
+buffer_invl2
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: buffer_atomic_add_f64 v[4:5], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x3c,0xe1,0x00,0x04,0x02,0x03]
+buffer_atomic_add_f64 v[4:5], off, s[8:11], s3 offset:4095
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: buffer_atomic_add_f64 v[4:5], off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x3c,0xe1,0x00,0x04,0x03,0x03]
+buffer_atomic_add_f64 v[4:5], off, s[12:15], s3 offset:4095
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: buffer_atomic_add_f64 v[4:5], off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x3c,0xe1,0x00,0x04,0x18,0x03]
+buffer_atomic_add_f64 v[4:5], off, s[96:99], s3 offset:4095
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: buffer_atomic_add_f64 v[4:5], off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x3c,0xe1,0x00,0x04,0x02,0x65]
+buffer_atomic_add_f64 v[4:5], off, s[8:11], s101 offset:4095
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: buffer_atomic_add_f64 v[4:5], off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x3c,0xe1,0x00,0x04,0x02,0x7c]
+buffer_atomic_add_f64 v[4:5], off, s[8:11], m0 offset:4095
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: buffer_atomic_add_f64 v[4:5], off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x3c,0xe1,0x00,0x04,0x02,0x80]
+buffer_atomic_add_f64 v[4:5], off, s[8:11], 0 offset:4095
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: buffer_atomic_add_f64 v[4:5], off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x3c,0xe1,0x00,0x04,0x02,0xc1]
+buffer_atomic_add_f64 v[4:5], off, s[8:11], -1 offset:4095
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: buffer_atomic_add_f64 v[4:5], v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x3c,0xe1,0x00,0x04,0x02,0x03]
+buffer_atomic_add_f64 v[4:5], v0, s[8:11], s3 idxen offset:4095
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: buffer_atomic_add_f64 v[4:5], v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x3c,0xe1,0x00,0x04,0x02,0x03]
+buffer_atomic_add_f64 v[4:5], v0, s[8:11], s3 offen offset:4095
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: buffer_atomic_add_f64 v[4:5], off, s[8:11], s3 ; encoding: [0x00,0x00,0x3c,0xe1,0x00,0x04,0x02,0x03]
+buffer_atomic_add_f64 v[4:5], off, s[8:11], s3
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: buffer_atomic_add_f64 v[4:5], off, s[8:11], s3 ; encoding: [0x00,0x00,0x3c,0xe1,0x00,0x04,0x02,0x03]
+buffer_atomic_add_f64 v[4:5], off, s[8:11], s3
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: buffer_atomic_add_f64 v[4:5], off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x3c,0xe1,0x00,0x04,0x02,0x03]
+buffer_atomic_add_f64 v[4:5], off, s[8:11], s3 offset:7
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: buffer_atomic_add_f64 v[4:5], off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x3e,0xe1,0x00,0x04,0x02,0x03]
+buffer_atomic_add_f64 v[4:5], off, s[8:11], s3 offset:4095 slc
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: buffer_atomic_min_f64 v[4:5], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x40,0xe1,0x00,0x04,0x02,0x03]
+buffer_atomic_min_f64 v[4:5], off, s[8:11], s3 offset:4095
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: buffer_atomic_min_f64 v[4:5], off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x40,0xe1,0x00,0x04,0x03,0x03]
+buffer_atomic_min_f64 v[4:5], off, s[12:15], s3 offset:4095
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: buffer_atomic_min_f64 v[4:5], off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x40,0xe1,0x00,0x04,0x18,0x03]
+buffer_atomic_min_f64 v[4:5], off, s[96:99], s3 offset:4095
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: buffer_atomic_min_f64 v[4:5], off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x40,0xe1,0x00,0x04,0x02,0x65]
+buffer_atomic_min_f64 v[4:5], off, s[8:11], s101 offset:4095
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: buffer_atomic_min_f64 v[4:5], off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x40,0xe1,0x00,0x04,0x02,0x7c]
+buffer_atomic_min_f64 v[4:5], off, s[8:11], m0 offset:4095
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: buffer_atomic_min_f64 v[4:5], off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x40,0xe1,0x00,0x04,0x02,0x80]
+buffer_atomic_min_f64 v[4:5], off, s[8:11], 0 offset:4095
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: buffer_atomic_min_f64 v[4:5], off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x40,0xe1,0x00,0x04,0x02,0xc1]
+buffer_atomic_min_f64 v[4:5], off, s[8:11], -1 offset:4095
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: buffer_atomic_min_f64 v[4:5], v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x40,0xe1,0x00,0x04,0x02,0x03]
+buffer_atomic_min_f64 v[4:5], v0, s[8:11], s3 idxen offset:4095
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: buffer_atomic_min_f64 v[4:5], v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x40,0xe1,0x00,0x04,0x02,0x03]
+buffer_atomic_min_f64 v[4:5], v0, s[8:11], s3 offen offset:4095
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: buffer_atomic_min_f64 v[4:5], off, s[8:11], s3 ; encoding: [0x00,0x00,0x40,0xe1,0x00,0x04,0x02,0x03]
+buffer_atomic_min_f64 v[4:5], off, s[8:11], s3
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: buffer_atomic_min_f64 v[4:5], off, s[8:11], s3 ; encoding: [0x00,0x00,0x40,0xe1,0x00,0x04,0x02,0x03]
+buffer_atomic_min_f64 v[4:5], off, s[8:11], s3
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: buffer_atomic_min_f64 v[4:5], off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x40,0xe1,0x00,0x04,0x02,0x03]
+buffer_atomic_min_f64 v[4:5], off, s[8:11], s3 offset:7
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: buffer_atomic_min_f64 v[4:5], off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x42,0xe1,0x00,0x04,0x02,0x03]
+buffer_atomic_min_f64 v[4:5], off, s[8:11], s3 offset:4095 slc
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: buffer_atomic_max_f64 v[4:5], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x44,0xe1,0x00,0x04,0x02,0x03]
+buffer_atomic_max_f64 v[4:5], off, s[8:11], s3 offset:4095
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: buffer_atomic_max_f64 v[4:5], off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x44,0xe1,0x00,0x04,0x03,0x03]
+buffer_atomic_max_f64 v[4:5], off, s[12:15], s3 offset:4095
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: buffer_atomic_max_f64 v[4:5], off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x44,0xe1,0x00,0x04,0x18,0x03]
+buffer_atomic_max_f64 v[4:5], off, s[96:99], s3 offset:4095
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: buffer_atomic_max_f64 v[4:5], off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x44,0xe1,0x00,0x04,0x02,0x65]
+buffer_atomic_max_f64 v[4:5], off, s[8:11], s101 offset:4095
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: buffer_atomic_max_f64 v[4:5], off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x44,0xe1,0x00,0x04,0x02,0x7c]
+buffer_atomic_max_f64 v[4:5], off, s[8:11], m0 offset:4095
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: buffer_atomic_max_f64 v[4:5], off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x44,0xe1,0x00,0x04,0x02,0x80]
+buffer_atomic_max_f64 v[4:5], off, s[8:11], 0 offset:4095
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: buffer_atomic_max_f64 v[4:5], off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x44,0xe1,0x00,0x04,0x02,0xc1]
+buffer_atomic_max_f64 v[4:5], off, s[8:11], -1 offset:4095
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: buffer_atomic_max_f64 v[4:5], v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x44,0xe1,0x00,0x04,0x02,0x03]
+buffer_atomic_max_f64 v[4:5], v0, s[8:11], s3 idxen offset:4095
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: buffer_atomic_max_f64 v[4:5], v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x44,0xe1,0x00,0x04,0x02,0x03]
+buffer_atomic_max_f64 v[4:5], v0, s[8:11], s3 offen offset:4095
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: buffer_atomic_max_f64 v[4:5], off, s[8:11], s3 ; encoding: [0x00,0x00,0x44,0xe1,0x00,0x04,0x02,0x03]
+buffer_atomic_max_f64 v[4:5], off, s[8:11], s3
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: buffer_atomic_max_f64 v[4:5], off, s[8:11], s3 ; encoding: [0x00,0x00,0x44,0xe1,0x00,0x04,0x02,0x03]
+buffer_atomic_max_f64 v[4:5], off, s[8:11], s3
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: buffer_atomic_max_f64 v[4:5], off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x44,0xe1,0x00,0x04,0x02,0x03]
+buffer_atomic_max_f64 v[4:5], off, s[8:11], s3 offset:7
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: buffer_atomic_max_f64 v[4:5], off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x46,0xe1,0x00,0x04,0x02,0x03]
+buffer_atomic_max_f64 v[4:5], off, s[8:11], s3 offset:4095 slc
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: ds_add_f64 v1, v[2:3] offset:65535 ; encoding: [0xff,0xff,0xb8,0xd8,0x01,0x02,0x00,0x00]
+ds_add_f64 v1, v[2:3] offset:65535
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: ds_add_f64 v255, v[2:3] offset:65535 ; encoding: [0xff,0xff,0xb8,0xd8,0xff,0x02,0x00,0x00]
+ds_add_f64 v255, v[2:3] offset:65535
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: ds_add_f64 v1, v[254:255] offset:65535 ; encoding: [0xff,0xff,0xb8,0xd8,0x01,0xfe,0x00,0x00]
+ds_add_f64 v1, v[254:255] offset:65535
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: ds_add_f64 v1, v[2:3] ; encoding: [0x00,0x00,0xb8,0xd8,0x01,0x02,0x00,0x00]
+ds_add_f64 v1, v[2:3]
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: ds_add_f64 v1, v[2:3] ; encoding: [0x00,0x00,0xb8,0xd8,0x01,0x02,0x00,0x00]
+ds_add_f64 v1, v[2:3]
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: ds_add_f64 v1, v[2:3] offset:4 ; encoding: [0x04,0x00,0xb8,0xd8,0x01,0x02,0x00,0x00]
+ds_add_f64 v1, v[2:3] offset:4
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: ds_add_f64 v1, v[2:3] offset:65535 gds ; encoding: [0xff,0xff,0xb9,0xd8,0x01,0x02,0x00,0x00]
+ds_add_f64 v1, v[2:3] offset:65535 gds
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: ds_add_rtn_f64 v[4:5], v1, v[2:3] offset:65535 ; encoding: [0xff,0xff,0xf8,0xd8,0x01,0x02,0x00,0x04]
+ds_add_rtn_f64 v[4:5], v1, v[2:3] offset:65535
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: ds_add_rtn_f64 v[254:255], v1, v[2:3] offset:65535 ; encoding: [0xff,0xff,0xf8,0xd8,0x01,0x02,0x00,0xfe]
+ds_add_rtn_f64 v[254:255], v1, v[2:3] offset:65535
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: ds_add_rtn_f64 v[4:5], v255, v[2:3] offset:65535 ; encoding: [0xff,0xff,0xf8,0xd8,0xff,0x02,0x00,0x04]
+ds_add_rtn_f64 v[4:5], v255, v[2:3] offset:65535
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: ds_add_rtn_f64 v[4:5], v1, v[254:255] offset:65535 ; encoding: [0xff,0xff,0xf8,0xd8,0x01,0xfe,0x00,0x04]
+ds_add_rtn_f64 v[4:5], v1, v[254:255] offset:65535
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: ds_add_rtn_f64 v[4:5], v1, v[2:3] ; encoding: [0x00,0x00,0xf8,0xd8,0x01,0x02,0x00,0x04]
+ds_add_rtn_f64 v[4:5], v1, v[2:3]
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: ds_add_rtn_f64 v[4:5], v1, v[2:3] ; encoding: [0x00,0x00,0xf8,0xd8,0x01,0x02,0x00,0x04]
+ds_add_rtn_f64 v[4:5], v1, v[2:3]
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: ds_add_rtn_f64 v[4:5], v1, v[2:3] offset:4 ; encoding: [0x04,0x00,0xf8,0xd8,0x01,0x02,0x00,0x04]
+ds_add_rtn_f64 v[4:5], v1, v[2:3] offset:4
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: ds_add_rtn_f64 v[4:5], v1, v[2:3] offset:65535 gds ; encoding: [0xff,0xff,0xf9,0xd8,0x01,0x02,0x00,0x04]
+ds_add_rtn_f64 v[4:5], v1, v[2:3] offset:65535 gds
+
+// NOT-GFX90A: error: failed parsing operand
+// GFX90A: flat_load_dword v0, v[0:1] scc ; encoding: [0x00,0x00,0x50,0xde,0x00,0x00,0x00,0x00]
+flat_load_dword v0, v[0:1] scc
+
+// NOT-GFX90A: error: failed parsing operand
+// GFX90A: flat_load_dword v0, v[0:1] glc scc ; encoding: [0x00,0x00,0x51,0xde,0x00,0x00,0x00,0x00]
+flat_load_dword v0, v[0:1] glc scc
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: flat_atomic_add_f64 v[0:1], v[2:3] offset:4095 ; encoding: [0xff,0x0f,0x3c,0xdd,0x00,0x02,0x00,0x00]
+flat_atomic_add_f64 v[0:1], v[2:3] offset:4095
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: flat_atomic_add_f64 v[254:255], v[2:3] offset:4095 ; encoding: [0xff,0x0f,0x3c,0xdd,0xfe,0x02,0x00,0x00]
+flat_atomic_add_f64 v[254:255], v[2:3] offset:4095
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: flat_atomic_add_f64 v[0:1], v[254:255] offset:4095 ; encoding: [0xff,0x0f,0x3c,0xdd,0x00,0xfe,0x00,0x00]
+flat_atomic_add_f64 v[0:1], v[254:255] offset:4095
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: flat_atomic_add_f64 v[0:1], v[2:3] ; encoding: [0x00,0x00,0x3c,0xdd,0x00,0x02,0x00,0x00]
+flat_atomic_add_f64 v[0:1], v[2:3]
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: flat_atomic_add_f64 v[0:1], v[2:3] ; encoding: [0x00,0x00,0x3c,0xdd,0x00,0x02,0x00,0x00]
+flat_atomic_add_f64 v[0:1], v[2:3]
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: flat_atomic_add_f64 v[0:1], v[2:3] offset:7 ; encoding: [0x07,0x00,0x3c,0xdd,0x00,0x02,0x00,0x00]
+flat_atomic_add_f64 v[0:1], v[2:3] offset:7
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0x3d,0xdd,0x00,0x02,0x00,0x00]
+flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] offset:4095 glc
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] offset:4095 glc scc ; encoding: [0xff,0x0f,0x3d,0xdf,0x00,0x02,0x00,0x00]
+flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] offset:4095 glc scc
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: flat_atomic_add_f64 v[0:1], v[2:3] offset:4095 slc ; encoding: [0xff,0x0f,0x3e,0xdd,0x00,0x02,0x00,0x00]
+flat_atomic_add_f64 v[0:1], v[2:3] offset:4095 slc
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: flat_atomic_min_f64 v[0:1], v[2:3] offset:4095 ; encoding: [0xff,0x0f,0x40,0xdd,0x00,0x02,0x00,0x00]
+flat_atomic_min_f64 v[0:1], v[2:3] offset:4095
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: flat_atomic_min_f64 v[254:255], v[2:3] offset:4095 ; encoding: [0xff,0x0f,0x40,0xdd,0xfe,0x02,0x00,0x00]
+flat_atomic_min_f64 v[254:255], v[2:3] offset:4095
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: flat_atomic_min_f64 v[0:1], v[254:255] offset:4095 ; encoding: [0xff,0x0f,0x40,0xdd,0x00,0xfe,0x00,0x00]
+flat_atomic_min_f64 v[0:1], v[254:255] offset:4095
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: flat_atomic_min_f64 v[0:1], v[2:3] ; encoding: [0x00,0x00,0x40,0xdd,0x00,0x02,0x00,0x00]
+flat_atomic_min_f64 v[0:1], v[2:3]
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: flat_atomic_min_f64 v[0:1], v[2:3] ; encoding: [0x00,0x00,0x40,0xdd,0x00,0x02,0x00,0x00]
+flat_atomic_min_f64 v[0:1], v[2:3]
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: flat_atomic_min_f64 v[0:1], v[2:3] offset:7 ; encoding: [0x07,0x00,0x40,0xdd,0x00,0x02,0x00,0x00]
+flat_atomic_min_f64 v[0:1], v[2:3] offset:7
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: flat_atomic_min_f64 v[0:1], v[0:1], v[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0x41,0xdd,0x00,0x02,0x00,0x00]
+flat_atomic_min_f64 v[0:1], v[0:1], v[2:3] offset:4095 glc
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: flat_atomic_min_f64 v[0:1], v[2:3] offset:4095 slc ; encoding: [0xff,0x0f,0x42,0xdd,0x00,0x02,0x00,0x00]
+flat_atomic_min_f64 v[0:1], v[2:3] offset:4095 slc
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: flat_atomic_max_f64 v[0:1], v[2:3] offset:4095 ; encoding: [0xff,0x0f,0x44,0xdd,0x00,0x02,0x00,0x00]
+flat_atomic_max_f64 v[0:1], v[2:3] offset:4095
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: flat_atomic_max_f64 v[254:255], v[2:3] offset:4095 ; encoding: [0xff,0x0f,0x44,0xdd,0xfe,0x02,0x00,0x00]
+flat_atomic_max_f64 v[254:255], v[2:3] offset:4095
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: flat_atomic_max_f64 v[0:1], v[254:255] offset:4095 ; encoding: [0xff,0x0f,0x44,0xdd,0x00,0xfe,0x00,0x00]
+flat_atomic_max_f64 v[0:1], v[254:255] offset:4095
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: flat_atomic_max_f64 v[0:1], v[2:3] ; encoding: [0x00,0x00,0x44,0xdd,0x00,0x02,0x00,0x00]
+flat_atomic_max_f64 v[0:1], v[2:3]
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: flat_atomic_max_f64 v[0:1], v[2:3] ; encoding: [0x00,0x00,0x44,0xdd,0x00,0x02,0x00,0x00]
+flat_atomic_max_f64 v[0:1], v[2:3]
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: flat_atomic_max_f64 v[0:1], v[2:3] offset:7 ; encoding: [0x07,0x00,0x44,0xdd,0x00,0x02,0x00,0x00]
+flat_atomic_max_f64 v[0:1], v[2:3] offset:7
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: flat_atomic_max_f64 v[0:1], v[0:1], v[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0x45,0xdd,0x00,0x02,0x00,0x00]
+flat_atomic_max_f64 v[0:1], v[0:1], v[2:3] offset:4095 glc
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: flat_atomic_max_f64 v[0:1], v[2:3] offset:4095 slc ; encoding: [0xff,0x0f,0x46,0xdd,0x00,0x02,0x00,0x00]
+flat_atomic_max_f64 v[0:1], v[2:3] offset:4095 slc
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: global_atomic_add_f64 v[0:1], v[2:3], off ; encoding: [0x00,0x80,0x3c,0xdd,0x00,0x02,0x7f,0x00]
+global_atomic_add_f64 v[0:1], v[2:3], off
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: global_atomic_min_f64 v[0:1], v[2:3], off ; encoding: [0x00,0x80,0x40,0xdd,0x00,0x02,0x7f,0x00]
+global_atomic_min_f64 v[0:1], v[2:3], off
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: global_atomic_max_f64 v[0:1], v[2:3], off ; encoding: [0x00,0x80,0x44,0xdd,0x00,0x02,0x7f,0x00]
+global_atomic_max_f64 v[0:1], v[2:3], off
+
+// NOT-GFX90A: error: failed parsing operand
+// GFX90A: image_load v[0:4], v2, s[0:7] dmask:0xf unorm scc ; encoding: [0x80,0x1f,0x00,0xf0,0x02,0x00,0x00,0x00]
+image_load v[0:4], v2, s[0:7] dmask:0xf unorm scc
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: v_fmac_f64_e32 v[4:5], v[2:3], v[4:5] ; encoding: [0x02,0x09,0x08,0x08]
+v_fmac_f64_e32 v[4:5], v[2:3], v[4:5]
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: v_fmac_f64_e32 v[254:255], v[2:3], v[4:5] ; encoding: [0x02,0x09,0xfc,0x09]
+v_fmac_f64_e32 v[254:255], v[2:3], v[4:5]
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: v_fmac_f64_e32 v[4:5], v[254:255], v[4:5] ; encoding: [0xfe,0x09,0x08,0x08]
+v_fmac_f64_e32 v[4:5], v[254:255], v[4:5]
+
+// GFX90A: v_fmac_f64_e32 v[4:5], flat_scratch, v[4:5] ; encoding: [0x66,0x08,0x08,0x08]
+// NOT-GFX1010: error: instruction not supported on this GPU
+// NOT-GFX908: error: instruction not supported on this GPU
+v_fmac_f64_e32 v[4:5], flat_scratch, v[4:5]
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: v_fmac_f64_e32 v[4:5], vcc, v[4:5] ; encoding: [0x6a,0x08,0x08,0x08]
+v_fmac_f64_e32 v[4:5], vcc, v[4:5]
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: v_fmac_f64_e32 v[4:5], exec, v[4:5] ; encoding: [0x7e,0x08,0x08,0x08]
+v_fmac_f64_e32 v[4:5], exec, v[4:5]
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: v_fmac_f64_e32 v[4:5], 0, v[4:5] ; encoding: [0x80,0x08,0x08,0x08]
+v_fmac_f64_e32 v[4:5], 0, v[4:5]
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: v_fmac_f64_e32 v[4:5], -1, v[4:5] ; encoding: [0xc1,0x08,0x08,0x08]
+v_fmac_f64_e32 v[4:5], -1, v[4:5]
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: v_fmac_f64_e32 v[4:5], 0.5, v[4:5] ; encoding: [0xf0,0x08,0x08,0x08]
+v_fmac_f64_e32 v[4:5], 0.5, v[4:5]
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: v_fmac_f64_e32 v[4:5], -4.0, v[4:5] ; encoding: [0xf7,0x08,0x08,0x08]
+v_fmac_f64_e32 v[4:5], -4.0, v[4:5]
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: v_fmac_f64_e32 v[4:5], 0xaf123456, v[4:5] ; encoding: [0xff,0x08,0x08,0x08,0x56,0x34,0x12,0xaf]
+v_fmac_f64_e32 v[4:5], 0xaf123456, v[4:5]
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: v_fmac_f64_e32 v[4:5], 0x3f717273, v[4:5] ; encoding: [0xff,0x08,0x08,0x08,0x73,0x72,0x71,0x3f]
+v_fmac_f64_e32 v[4:5], 0x3f717273, v[4:5]
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: v_fmac_f64_e32 v[4:5], v[2:3], v[254:255] ; encoding: [0x02,0xfd,0x09,0x08]
+v_fmac_f64_e32 v[4:5], v[2:3], v[254:255]
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: v_fmac_f64_e64 v[4:5], v[2:3], v[8:9] ; encoding: [0x04,0x00,0x04,0xd1,0x02,0x11,0x02,0x00]
+v_fmac_f64_e64 v[4:5], v[2:3], v[8:9]
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: v_fmac_f64_e64 v[254:255], v[2:3], v[8:9] ; encoding: [0xfe,0x00,0x04,0xd1,0x02,0x11,0x02,0x00]
+v_fmac_f64_e64 v[254:255], v[2:3], v[8:9]
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: v_fmac_f64_e64 v[4:5], v[254:255], v[8:9] ; encoding: [0x04,0x00,0x04,0xd1,0xfe,0x11,0x02,0x00]
+v_fmac_f64_e64 v[4:5], v[254:255], v[8:9]
+
+// GFX90A: v_fmac_f64_e64 v[4:5], flat_scratch, v[8:9] ; encoding: [0x04,0x00,0x04,0xd1,0x66,0x10,0x02,0x00]
+// NOT-GFX1010: error: instruction not supported on this GPU
+// NOT-GFX908: error: instruction not supported on this GPU
+v_fmac_f64_e64 v[4:5], flat_scratch, v[8:9]
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: v_fmac_f64_e64 v[4:5], vcc, v[8:9] ; encoding: [0x04,0x00,0x04,0xd1,0x6a,0x10,0x02,0x00]
+v_fmac_f64_e64 v[4:5], vcc, v[8:9]
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: v_fmac_f64_e64 v[4:5], exec, v[8:9] ; encoding: [0x04,0x00,0x04,0xd1,0x7e,0x10,0x02,0x00]
+v_fmac_f64_e64 v[4:5], exec, v[8:9]
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: v_fmac_f64_e64 v[4:5], 0, v[8:9] ; encoding: [0x04,0x00,0x04,0xd1,0x80,0x10,0x02,0x00]
+v_fmac_f64_e64 v[4:5], 0, v[8:9]
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: v_fmac_f64_e64 v[4:5], -1, v[8:9] ; encoding: [0x04,0x00,0x04,0xd1,0xc1,0x10,0x02,0x00]
+v_fmac_f64_e64 v[4:5], -1, v[8:9]
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: v_fmac_f64_e64 v[4:5], 0.5, v[8:9] ; encoding: [0x04,0x00,0x04,0xd1,0xf0,0x10,0x02,0x00]
+v_fmac_f64_e64 v[4:5], 0.5, v[8:9]
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: v_fmac_f64_e64 v[4:5], -4.0, v[8:9] ; encoding: [0x04,0x00,0x04,0xd1,0xf7,0x10,0x02,0x00]
+v_fmac_f64_e64 v[4:5], -4.0, v[8:9]
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: v_fmac_f64_e64 v[4:5], v[2:3], v[254:255] ; encoding: [0x04,0x00,0x04,0xd1,0x02,0xfd,0x03,0x00]
+v_fmac_f64_e64 v[4:5], v[2:3], v[254:255]
+
+// GFX90A: v_fmac_f64_e64 v[4:5], v[2:3], flat_scratch ; encoding: [0x04,0x00,0x04,0xd1,0x02,0xcd,0x00,0x00]
+// NOT-GFX1010: error: instruction not supported on this GPU
+// NOT-GFX908: error: instruction not supported on this GPU
+v_fmac_f64_e64 v[4:5], v[2:3], flat_scratch
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: v_fmac_f64_e64 v[4:5], v[2:3], vcc ; encoding: [0x04,0x00,0x04,0xd1,0x02,0xd5,0x00,0x00]
+v_fmac_f64_e64 v[4:5], v[2:3], vcc
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: v_fmac_f64_e64 v[4:5], v[2:3], exec ; encoding: [0x04,0x00,0x04,0xd1,0x02,0xfd,0x00,0x00]
+v_fmac_f64_e64 v[4:5], v[2:3], exec
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: v_fmac_f64_e64 v[4:5], v[2:3], 0 ; encoding: [0x04,0x00,0x04,0xd1,0x02,0x01,0x01,0x00]
+v_fmac_f64_e64 v[4:5], v[2:3], 0
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: v_fmac_f64_e64 v[4:5], v[2:3], -1 ; encoding: [0x04,0x00,0x04,0xd1,0x02,0x83,0x01,0x00]
+v_fmac_f64_e64 v[4:5], v[2:3], -1
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: v_fmac_f64_e64 v[4:5], v[2:3], 0.5 ; encoding: [0x04,0x00,0x04,0xd1,0x02,0xe1,0x01,0x00]
+v_fmac_f64_e64 v[4:5], v[2:3], 0.5
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: v_fmac_f64_e64 v[4:5], v[2:3], -4.0 ; encoding: [0x04,0x00,0x04,0xd1,0x02,0xef,0x01,0x00]
+v_fmac_f64_e64 v[4:5], v[2:3], -4.0
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: v_fmac_f64_e64 v[4:5], -v[2:3], v[8:9] ; encoding: [0x04,0x00,0x04,0xd1,0x02,0x11,0x02,0x20]
+v_fmac_f64_e64 v[4:5], -v[2:3], v[8:9]
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: v_fmac_f64_e64 v[4:5], v[2:3], -v[8:9] ; encoding: [0x04,0x00,0x04,0xd1,0x02,0x11,0x02,0x40]
+v_fmac_f64_e64 v[4:5], v[2:3], -v[8:9]
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: v_fmac_f64_e64 v[4:5], -v[2:3], -v[8:9] ; encoding: [0x04,0x00,0x04,0xd1,0x02,0x11,0x02,0x60]
+v_fmac_f64_e64 v[4:5], -v[2:3], -v[8:9]
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: v_fmac_f64_e64 v[4:5], |v[2:3]|, v[8:9] ; encoding: [0x04,0x01,0x04,0xd1,0x02,0x11,0x02,0x00]
+v_fmac_f64_e64 v[4:5], |v[2:3]|, v[8:9]
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: v_fmac_f64_e64 v[4:5], v[2:3], |v[8:9]| ; encoding: [0x04,0x02,0x04,0xd1,0x02,0x11,0x02,0x00]
+v_fmac_f64_e64 v[4:5], v[2:3], |v[8:9]|
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: v_fmac_f64_e64 v[4:5], |v[2:3]|, |v[8:9]| ; encoding: [0x04,0x03,0x04,0xd1,0x02,0x11,0x02,0x00]
+v_fmac_f64_e64 v[4:5], |v[2:3]|, |v[8:9]|
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: v_fmac_f64_e64 v[4:5], v[2:3], v[8:9] clamp ; encoding: [0x04,0x80,0x04,0xd1,0x02,0x11,0x02,0x00]
+v_fmac_f64_e64 v[4:5], v[2:3], v[8:9] clamp
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: v_fmac_f64_e64 v[4:5], v[2:3], v[8:9] mul:2 ; encoding: [0x04,0x00,0x04,0xd1,0x02,0x11,0x02,0x08]
+v_fmac_f64_e64 v[4:5], v[2:3], v[8:9] mul:2
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: v_fmac_f64_e64 v[4:5], v[2:3], v[8:9] mul:4 ; encoding: [0x04,0x00,0x04,0xd1,0x02,0x11,0x02,0x10]
+v_fmac_f64_e64 v[4:5], v[2:3], v[8:9] mul:4
+
+// NOT-GFX90A: error: instruction not supported on this GPU
+// GFX90A: v_fmac_f64_e64 v[4:5], v[2:3], v[8:9] div:2 ; encoding: [0x04,0x00,0x04,0xd1,0x02,0x11,0x02,0x18]
+v_fmac_f64_e64 v[4:5], v[2:3], v[8:9] div:2
+
+// GFX90A: v_mul_legacy_f32_e64 v5, v1, v2 ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0x05,0x02,0x00]
+v_mul_legacy_f32_e64 v5, v1, v2
+
+// GFX90A: v_mul_legacy_f32_e64 v255, v1, v2 ; encoding: [0xff,0x00,0xa1,0xd2,0x01,0x05,0x02,0x00]
+v_mul_legacy_f32_e64 v255, v1, v2
+
+// GFX90A: v_mul_legacy_f32_e64 v5, v255, v2 ; encoding: [0x05,0x00,0xa1,0xd2,0xff,0x05,0x02,0x00]
+v_mul_legacy_f32_e64 v5, v255, v2
+
+// GFX90A: v_mul_legacy_f32_e64 v5, s1, v2 ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0x04,0x02,0x00]
+v_mul_legacy_f32_e64 v5, s1, v2
+
+// GFX90A: v_mul_legacy_f32_e64 v5, s101, v2 ; encoding: [0x05,0x00,0xa1,0xd2,0x65,0x04,0x02,0x00]
+v_mul_legacy_f32_e64 v5, s101, v2
+
+// GFX90A: v_mul_legacy_f32_e64 v5, vcc_lo, v2 ; encoding: [0x05,0x00,0xa1,0xd2,0x6a,0x04,0x02,0x00]
+v_mul_legacy_f32_e64 v5, vcc_lo, v2
+
+// GFX90A: v_mul_legacy_f32_e64 v5, vcc_hi, v2 ; encoding: [0x05,0x00,0xa1,0xd2,0x6b,0x04,0x02,0x00]
+v_mul_legacy_f32_e64 v5, vcc_hi, v2
+
+// GFX90A: v_mul_legacy_f32_e64 v5, m0, v2 ; encoding: [0x05,0x00,0xa1,0xd2,0x7c,0x04,0x02,0x00]
+v_mul_legacy_f32_e64 v5, m0, v2
+
+// GFX90A: v_mul_legacy_f32_e64 v5, exec_lo, v2 ; encoding: [0x05,0x00,0xa1,0xd2,0x7e,0x04,0x02,0x00]
+v_mul_legacy_f32_e64 v5, exec_lo, v2
+
+// GFX90A: v_mul_legacy_f32_e64 v5, exec_hi, v2 ; encoding: [0x05,0x00,0xa1,0xd2,0x7f,0x04,0x02,0x00]
+v_mul_legacy_f32_e64 v5, exec_hi, v2
+
+// GFX90A: v_mul_legacy_f32_e64 v5, 0, v2  ; encoding: [0x05,0x00,0xa1,0xd2,0x80,0x04,0x02,0x00]
+v_mul_legacy_f32_e64 v5, 0, v2
+
+// GFX90A: v_mul_legacy_f32_e64 v5, -1, v2 ; encoding: [0x05,0x00,0xa1,0xd2,0xc1,0x04,0x02,0x00]
+v_mul_legacy_f32_e64 v5, -1, v2
+
+// GFX90A: v_mul_legacy_f32_e64 v5, 0.5, v2 ; encoding: [0x05,0x00,0xa1,0xd2,0xf0,0x04,0x02,0x00]
+v_mul_legacy_f32_e64 v5, 0.5, v2
+
+// GFX90A: v_mul_legacy_f32_e64 v5, -4.0, v2 ; encoding: [0x05,0x00,0xa1,0xd2,0xf7,0x04,0x02,0x00]
+v_mul_legacy_f32_e64 v5, -4.0, v2
+
+// GFX90A: v_mul_legacy_f32_e64 v5, v1, v255 ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0xff,0x03,0x00]
+v_mul_legacy_f32_e64 v5, v1, v255
+
+// GFX90A: v_mul_legacy_f32_e64 v5, v1, s2 ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0x05,0x00,0x00]
+v_mul_legacy_f32_e64 v5, v1, s2
+
+// GFX90A: v_mul_legacy_f32_e64 v5, v1, s101 ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0xcb,0x00,0x00]
+v_mul_legacy_f32_e64 v5, v1, s101
+
+// GFX90A: v_mul_legacy_f32_e64 v5, v1, vcc_lo ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0xd5,0x00,0x00]
+v_mul_legacy_f32_e64 v5, v1, vcc_lo
+
+// GFX90A: v_mul_legacy_f32_e64 v5, v1, vcc_hi ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0xd7,0x00,0x00]
+v_mul_legacy_f32_e64 v5, v1, vcc_hi
+
+// GFX90A: v_mul_legacy_f32_e64 v5, v1, m0 ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0xf9,0x00,0x00]
+v_mul_legacy_f32_e64 v5, v1, m0
+
+// GFX90A: v_mul_legacy_f32_e64 v5, v1, exec_lo ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0xfd,0x00,0x00]
+v_mul_legacy_f32_e64 v5, v1, exec_lo
+
+// GFX90A: v_mul_legacy_f32_e64 v5, v1, exec_hi ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0xff,0x00,0x00]
+v_mul_legacy_f32_e64 v5, v1, exec_hi
+
+// GFX90A: v_mul_legacy_f32_e64 v5, v1, 0  ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0x01,0x01,0x00]
+v_mul_legacy_f32_e64 v5, v1, 0
+
+// GFX90A: v_mul_legacy_f32_e64 v5, v1, -1 ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0x83,0x01,0x00]
+v_mul_legacy_f32_e64 v5, v1, -1
+
+// GFX90A: v_mul_legacy_f32_e64 v5, v1, 0.5 ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0xe1,0x01,0x00]
+v_mul_legacy_f32_e64 v5, v1, 0.5
+
+// GFX90A: v_mul_legacy_f32_e64 v5, v1, -4.0 ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0xef,0x01,0x00]
+v_mul_legacy_f32_e64 v5, v1, -4.0
+
+// GFX90A: v_mul_legacy_f32_e64 v5, -v1, v2 ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0x05,0x02,0x20]
+v_mul_legacy_f32_e64 v5, -v1, v2
+
+// GFX90A: v_mul_legacy_f32_e64 v5, v1, -v2 ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0x05,0x02,0x40]
+v_mul_legacy_f32_e64 v5, v1, -v2
+
+// GFX90A: v_mul_legacy_f32_e64 v5, -v1, -v2 ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0x05,0x02,0x60]
+v_mul_legacy_f32_e64 v5, -v1, -v2
+
+// GFX90A: v_mul_legacy_f32_e64 v5, |v1|, v2 ; encoding: [0x05,0x01,0xa1,0xd2,0x01,0x05,0x02,0x00]
+v_mul_legacy_f32_e64 v5, |v1|, v2
+
+// GFX90A: v_mul_legacy_f32_e64 v5, v1, |v2| ; encoding: [0x05,0x02,0xa1,0xd2,0x01,0x05,0x02,0x00]
+v_mul_legacy_f32_e64 v5, v1, |v2|
+
+// GFX90A: v_mul_legacy_f32_e64 v5, |v1|, |v2| ; encoding: [0x05,0x03,0xa1,0xd2,0x01,0x05,0x02,0x00]
+v_mul_legacy_f32_e64 v5, |v1|, |v2|
+
+// GFX90A: v_mul_legacy_f32_e64 v5, v1, v2 clamp ; encoding: [0x05,0x80,0xa1,0xd2,0x01,0x05,0x02,0x00]
+v_mul_legacy_f32_e64 v5, v1, v2 clamp
+
+// GFX90A: v_mul_legacy_f32_e64 v5, v1, v2 mul:2 ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0x05,0x02,0x08]
+v_mul_legacy_f32_e64 v5, v1, v2 mul:2
+
+// GFX90A: v_mul_legacy_f32_e64 v5, v1, v2 mul:4 ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0x05,0x02,0x10]
+v_mul_legacy_f32_e64 v5, v1, v2 mul:4
+
+// GFX90A: v_mul_legacy_f32_e64 v5, v1, v2 div:2 ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0x05,0x02,0x18]
+v_mul_legacy_f32_e64 v5, v1, v2 div:2
+
+// GFX90A: v_xor_b32_dpp v6, v29, v27  row_newbcast:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x36,0x0c,0x2a,0x1d,0x50,0x01,0xff]
+// NOT-GFX90A: error: not a valid operand.
+v_xor_b32 v6, v29, v27 row_newbcast:0
+
+// GFX90A: v_xor_b32_dpp v6, v29, v27  row_newbcast:7 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x36,0x0c,0x2a,0x1d,0x57,0x01,0xff]
+// NOT-GFX90A: error: not a valid operand.
+v_xor_b32 v6, v29, v27 row_newbcast:7
+
+// GFX90A: v_xor_b32_dpp v6, v29, v27  row_newbcast:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x36,0x0c,0x2a,0x1d,0x5f,0x01,0xff]
+// NOT-GFX90A: error: not a valid operand.
+v_xor_b32 v6, v29, v27 row_newbcast:15
+
+// GFX90A: buffer_atomic_add_f32 v0, v2, s[4:7], 0 idxen glc ; encoding: [0x00,0x60,0x34,0xe1,0x02,0x00,0x01,0x80]
+// NOT-GFX1010: error: instruction not supported on this GPU
+// NOT-GFX908: error: operands are not valid for this GPU or mode
+buffer_atomic_add_f32 v0, v2, s[4:7], 0 idxen glc
+
+// GFX90A: buffer_atomic_add_f32 v0, v2, s[4:7], 0 idxen glc ; encoding: [0x00,0x60,0x34,0xe1,0x02,0x00,0x01,0x80]
+// NOT-GFX1010: error: instruction not supported on this GPU
+// NOT-GFX908: error: operands are not valid for this GPU or mode
+buffer_atomic_add_f32 v0, v2, s[4:7], 0 idxen glc
+
+// GFX90A: buffer_atomic_pk_add_f16 v0, v2, s[4:7], 0 idxen glc ; encoding: [0x00,0x60,0x38,0xe1,0x02,0x00,0x01,0x80]
+// NOT-GFX1010: error: instruction not supported on this GPU
+// NOT-GFX908: error: operands are not valid for this GPU or mode
+buffer_atomic_pk_add_f16 v0, v2, s[4:7], 0 idxen glc
+
+// GFX90A: global_atomic_add_f32 v0, v[0:1], v2, off glc ; encoding: [0x00,0x80,0x35,0xdd,0x00,0x02,0x7f,0x00]
+// NOT-GFX1010: error: instruction not supported on this GPU
+// NOT-GFX908: error: operands are not valid for this GPU or mode
+global_atomic_add_f32 v0, v[0:1], v2, off glc
+
+// GFX90A: global_atomic_pk_add_f16 v0, v[0:1], v2, off glc ; encoding: [0x00,0x80,0x39,0xdd,0x00,0x02,0x7f,0x00]
+// NOT-GFX1010: error: instruction not supported on this GPU
+// NOT-GFX908: error: operands are not valid for this GPU or mode
+global_atomic_pk_add_f16 v0, v[0:1], v2, off glc
+
+// GFX90A: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off glc ; encoding: [0x00,0x80,0x3d,0xdd,0x00,0x02,0x7f,0x00]
+// NOT-GFX90A: error: instruction not supported on this GPU
+global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off glc
+
+// GFX90A: global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off glc ; encoding: [0x00,0x80,0x45,0xdd,0x00,0x02,0x7f,0x00]
+// NOT-GFX90A: error: instruction not supported on this GPU
+global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off glc
+
+// GFX90A: global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off glc ; encoding: [0x00,0x80,0x41,0xdd,0x00,0x02,0x7f,0x00]
+// NOT-GFX90A: error: instruction not supported on this GPU
+global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off glc
+
+// GFX90A: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] glc ; encoding: [0x00,0x00,0x3d,0xdd,0x00,0x02,0x00,0x00]
+// NOT-GFX90A: error: instruction not supported on this GPU
+flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] glc
+
+// GFX90A: flat_atomic_max_f64 v[0:1], v[0:1], v[2:3] glc ; encoding: [0x00,0x00,0x45,0xdd,0x00,0x02,0x00,0x00]
+// NOT-GFX90A: error: instruction not supported on this GPU
+flat_atomic_max_f64 v[0:1], v[0:1], v[2:3] glc
+
+// GFX90A: flat_atomic_min_f64 v[0:1], v[0:1], v[2:3] glc ; encoding: [0x00,0x00,0x41,0xdd,0x00,0x02,0x00,0x00]
+// NOT-GFX90A: error: instruction not supported on this GPU
+flat_atomic_min_f64 v[0:1], v[0:1], v[2:3] glc

diff  --git a/llvm/test/MC/AMDGPU/gfx90a_err.s b/llvm/test/MC/AMDGPU/gfx90a_err.s
new file mode 100644
index 000000000000..39df2999404f
--- /dev/null
+++ b/llvm/test/MC/AMDGPU/gfx90a_err.s
@@ -0,0 +1,196 @@
+// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx90a %s 2>&1 | FileCheck --check-prefix=GFX90A --implicit-check-not=error: %s
+
+ds_add_src2_u32 v1
+// GFX90A: error: instruction not supported on this GPU
+
+ds_add_src2_f32 v1
+// GFX90A: error: instruction not supported on this GPU
+
+ds_sub_src2_u32 v1
+// GFX90A: error: instruction not supported on this GPU
+
+ds_rsub_src2_u32 v1
+// GFX90A: error: instruction not supported on this GPU
+
+ds_inc_src2_u32 v1
+// GFX90A: error: instruction not supported on this GPU
+
+ds_dec_src2_u32 v1
+// GFX90A: error: instruction not supported on this GPU
+
+ds_min_src2_i32 v1
+// GFX90A: error: instruction not supported on this GPU
+
+ds_max_src2_i32 v1
+// GFX90A: error: instruction not supported on this GPU
+
+ds_min_src2_u32 v1
+// GFX90A: error: instruction not supported on this GPU
+
+ds_max_src2_u32 v1
+// GFX90A: error: instruction not supported on this GPU
+
+ds_and_src2_b32 v1
+// GFX90A: error: instruction not supported on this GPU
+
+ds_or_src2_b32 v1
+// GFX90A: error: instruction not supported on this GPU
+
+ds_xor_src2_b32 v1
+// GFX90A: error: instruction not supported on this GPU
+
+ds_min_src2_f32 v1
+// GFX90A: error: instruction not supported on this GPU
+
+ds_max_src2_f32 v1
+// GFX90A: error: instruction not supported on this GPU
+
+ds_add_src2_u64 v1
+// GFX90A: error: instruction not supported on this GPU
+
+ds_sub_src2_u64 v1
+// GFX90A: error: instruction not supported on this GPU
+
+ds_rsub_src2_u64 v1
+// GFX90A: error: instruction not supported on this GPU
+
+ds_inc_src2_u64 v1
+// GFX90A: error: instruction not supported on this GPU
+
+ds_dec_src2_u64 v1
+// GFX90A: error: instruction not supported on this GPU
+
+ds_min_src2_i64 v1
+// GFX90A: error: instruction not supported on this GPU
+
+ds_max_src2_i64 v1
+// GFX90A: error: instruction not supported on this GPU
+
+ds_min_src2_u64 v1
+// GFX90A: error: instruction not supported on this GPU
+
+ds_max_src2_u64 v1
+// GFX90A: error: instruction not supported on this GPU
+
+ds_and_src2_b64 v1
+// GFX90A: error: instruction not supported on this GPU
+
+ds_or_src2_b64 v1
+// GFX90A: error: instruction not supported on this GPU
+
+ds_xor_src2_b64 v1
+// GFX90A: error: instruction not supported on this GPU
+
+ds_min_src2_f64 v1
+// GFX90A: error: instruction not supported on this GPU
+
+ds_max_src2_f64 v1
+// GFX90A: error: instruction not supported on this GPU
+
+ds_write_src2_b32 v1
+// GFX90A: error: instruction not supported on this GPU
+
+ds_write_src2_b64 v1
+// GFX90A: error: instruction not supported on this GPU
+
+image_gather4 v[5:8], v1, s[8:15], s[12:15]
+// GFX90A: error: instruction not supported on this GPU
+
+image_get_lod v5, v1, s[8:15], s[12:15]
+// GFX90A: error: instruction not supported on this GPU
+
+v_mul_legacy_f32_e32 v5, v1, v2
+// GFX90A: error: e32 variant of this instruction is not supported
+
+v_mul_legacy_f32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// GFX90A: error: sdwa variant of this instruction is not supported
+
+v_mul_legacy_f32_dpp v5, v1, v2  quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// GFX90A: error: dpp variant of this instruction is not supported
+
+v_interp_p1_f32 v5, v1, attr0.x
+// GFX90A: error: instruction not supported on this GPU
+
+v_interp_p1_f32_e64 v5, v2, attr0.x
+// GFX90A: error: instruction not supported on this GPU
+
+v_interp_p2_f32 v5, v1, attr0.x
+// GFX90A: error: instruction not supported on this GPU
+
+v_interp_mov_f32 v5, p10, attr0.x
+// GFX90A: error: instruction not supported on this GPU
+
+v_interp_p1ll_f16 v5, v2, attr0.x
+// GFX90A: error: instruction not supported on this GPU
+
+v_interp_p1lv_f16 v5, v2, attr0.x, v3
+// GFX90A: error: instruction not supported on this GPU
+
+v_interp_p2_legacy_f16 v5, v2, attr0.x, v3
+// GFX90A: error: instruction not supported on this GPU
+
+v_interp_p2_f16 v5, v2, attr0.x, v3
+// GFX90A: error: instruction not supported on this GPU
+
+v_mov_b32_dpp v5, v1 row_share:1 row_mask:0x0 bank_mask:0x0
+// GFX90A: error: not a valid operand
+
+v_ceil_f64_dpp v[0:1], v[2:3] quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf
+// GFX90A: error: not a valid operand.
+
+v_ceil_f64_dpp v[0:1], v[2:3] row_shl:1 row_mask:0xf bank_mask:0xf
+// GFX90A: error: not a valid operand.
+
+v_ceil_f64_dpp v[0:1], v[2:3] wave_ror:1 row_mask:0xf bank_mask:0xf
+// GFX90A: error: not a valid operand.
+
+v_ceil_f64_dpp v[0:1], v[2:3] row_share:1 row_mask:0xf bank_mask:0xf
+// GFX90A: error: not a valid operand.
+
+flat_atomic_add v2, v[2:3], a2 glc
+// GFX90A: error: invalid register class: data and dst should be all VGPR or AGPR
+
+flat_atomic_add a2, v[2:3], v2 glc
+// GFX90A: error: invalid register class: data and dst should be all VGPR or AGPR
+
+tbuffer_store_format_xyzw v[0:3], off, s[4:7],  dfmt:15,  nfmt:2, s1 tfe
+// GFX90A: error: operands are not valid for this GPU or mode
+
+buffer_store_dwordx4 v[0:3], off, s[12:15], s4 offset:4095 glc tfe
+// GFX90A: error: operands are not valid for this GPU or mode
+
+ds_write2_b64 v1, a[4:5], v[2:3] offset1:255
+// GFX90A: error: invalid register class: data and dst should be all VGPR or AGPR
+
+ds_write2_b64 v1, v[4:5], a[2:3] offset1:255
+// GFX90A: error: invalid register class: data and dst should be all VGPR or AGPR
+
+ds_write2_b64 v1, a[4:5], v[2:3] offset1:255 gds
+// GFX90A: error: invalid register class: data and dst should be all VGPR or AGPR
+
+ds_write2_b64 v1, v[4:5], a[2:3] offset1:255 gds
+// GFX90A: error: invalid register class: data and dst should be all VGPR or AGPR
+
+ds_wrxchg2st64_rtn_b32 v[6:7], v1, a2, a3 offset0:127
+// GFX90A: error: invalid register class: data and dst should be all VGPR or AGPR
+
+image_load v[0:4], v2, s[0:7] dmask:0xf unorm tfe
+// GFX90A: error: operands are not valid for this GPU or mode
+
+image_sample_lz v[0:3], v[0:1], s[4:11], s[16:19] dmask:0xf
+// GFX90A: error: instruction not supported on this GPU
+
+image_sample_d v[0:3], v[0:1], s[4:11], s[16:19] dmask:0xf
+// GFX90A: error: instruction not supported on this GPU
+
+image_sample_o v[0:3], v[0:1], s[4:11], s[16:19] dmask:0xf
+// GFX90A: error: instruction not supported on this GPU
+
+image_sample_cl v[0:3], v[0:1], s[4:11], s[16:19] dmask:0xf
+// GFX90A: error: instruction not supported on this GPU
+
+image_sample_cd v[0:3], v[0:1], s[4:11], s[16:19] dmask:0xf
+// GFX90A: error: instruction not supported on this GPU
+
+image_sample_b v[0:3], v[0:1], s[4:11], s[16:19] dmask:0xf
+// GFX90A: error: instruction not supported on this GPU

diff  --git a/llvm/test/MC/AMDGPU/gfx90a_ldst_acc.s b/llvm/test/MC/AMDGPU/gfx90a_ldst_acc.s
new file mode 100644
index 000000000000..dbc632b968a9
--- /dev/null
+++ b/llvm/test/MC/AMDGPU/gfx90a_ldst_acc.s
@@ -0,0 +1,11194 @@
+// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx908 %s 2>&1 | FileCheck --check-prefix=NOT-GFX90A --implicit-check-not=error: %s
+// RUN: llvm-mc -arch=amdgcn -mcpu=gfx90a -show-encoding %s | FileCheck --check-prefix=GFX90A %s
+
+// GFX90A: flat_load_ubyte a5, v[2:3] offset:4095 ; encoding: [0xff,0x0f,0x40,0xdc,0x02,0x00,0x80,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_load_ubyte a5, v[2:3] offset:4095
+
+// GFX90A: flat_load_ubyte a255, v[2:3] offset:4095 ; encoding: [0xff,0x0f,0x40,0xdc,0x02,0x00,0x80,0xff]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_load_ubyte a255, v[2:3] offset:4095
+
+// GFX90A: flat_load_ubyte a5, v[254:255] offset:4095 ; encoding: [0xff,0x0f,0x40,0xdc,0xfe,0x00,0x80,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_load_ubyte a5, v[254:255] offset:4095
+
+// GFX90A: flat_load_ubyte a5, v[2:3]      ; encoding: [0x00,0x00,0x40,0xdc,0x02,0x00,0x80,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_load_ubyte a5, v[2:3]
+
+// GFX90A: flat_load_ubyte a5, v[2:3]      ; encoding: [0x00,0x00,0x40,0xdc,0x02,0x00,0x80,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_load_ubyte a5, v[2:3]
+
+// GFX90A: flat_load_ubyte a5, v[2:3] offset:7 ; encoding: [0x07,0x00,0x40,0xdc,0x02,0x00,0x80,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_load_ubyte a5, v[2:3] offset:7
+
+// GFX90A: flat_load_ubyte a5, v[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0x41,0xdc,0x02,0x00,0x80,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_load_ubyte a5, v[2:3] offset:4095 glc
+
+// GFX90A: flat_load_ubyte a5, v[2:3] offset:4095 slc ; encoding: [0xff,0x0f,0x42,0xdc,0x02,0x00,0x80,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_load_ubyte a5, v[2:3] offset:4095 slc
+
+// GFX90A: flat_load_sbyte a5, v[2:3] offset:4095 ; encoding: [0xff,0x0f,0x44,0xdc,0x02,0x00,0x80,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_load_sbyte a5, v[2:3] offset:4095
+
+// GFX90A: flat_load_sbyte a255, v[2:3] offset:4095 ; encoding: [0xff,0x0f,0x44,0xdc,0x02,0x00,0x80,0xff]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_load_sbyte a255, v[2:3] offset:4095
+
+// GFX90A: flat_load_sbyte a5, v[254:255] offset:4095 ; encoding: [0xff,0x0f,0x44,0xdc,0xfe,0x00,0x80,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_load_sbyte a5, v[254:255] offset:4095
+
+// GFX90A: flat_load_sbyte a5, v[2:3]      ; encoding: [0x00,0x00,0x44,0xdc,0x02,0x00,0x80,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_load_sbyte a5, v[2:3]
+
+// GFX90A: flat_load_sbyte a5, v[2:3]      ; encoding: [0x00,0x00,0x44,0xdc,0x02,0x00,0x80,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_load_sbyte a5, v[2:3]
+
+// GFX90A: flat_load_sbyte a5, v[2:3] offset:7 ; encoding: [0x07,0x00,0x44,0xdc,0x02,0x00,0x80,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_load_sbyte a5, v[2:3] offset:7
+
+// GFX90A: flat_load_sbyte a5, v[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0x45,0xdc,0x02,0x00,0x80,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_load_sbyte a5, v[2:3] offset:4095 glc
+
+// GFX90A: flat_load_sbyte a5, v[2:3] offset:4095 slc ; encoding: [0xff,0x0f,0x46,0xdc,0x02,0x00,0x80,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_load_sbyte a5, v[2:3] offset:4095 slc
+
+// GFX90A: flat_load_ushort a5, v[2:3] offset:4095 ; encoding: [0xff,0x0f,0x48,0xdc,0x02,0x00,0x80,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_load_ushort a5, v[2:3] offset:4095
+
+// GFX90A: flat_load_ushort a255, v[2:3] offset:4095 ; encoding: [0xff,0x0f,0x48,0xdc,0x02,0x00,0x80,0xff]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_load_ushort a255, v[2:3] offset:4095
+
+// GFX90A: flat_load_ushort a5, v[254:255] offset:4095 ; encoding: [0xff,0x0f,0x48,0xdc,0xfe,0x00,0x80,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_load_ushort a5, v[254:255] offset:4095
+
+// GFX90A: flat_load_ushort a5, v[2:3]     ; encoding: [0x00,0x00,0x48,0xdc,0x02,0x00,0x80,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_load_ushort a5, v[2:3]
+
+// GFX90A: flat_load_ushort a5, v[2:3]     ; encoding: [0x00,0x00,0x48,0xdc,0x02,0x00,0x80,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_load_ushort a5, v[2:3]
+
+// GFX90A: flat_load_ushort a5, v[2:3] offset:7 ; encoding: [0x07,0x00,0x48,0xdc,0x02,0x00,0x80,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_load_ushort a5, v[2:3] offset:7
+
+// GFX90A: flat_load_ushort a5, v[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0x49,0xdc,0x02,0x00,0x80,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_load_ushort a5, v[2:3] offset:4095 glc
+
+// GFX90A: flat_load_ushort a5, v[2:3] offset:4095 slc ; encoding: [0xff,0x0f,0x4a,0xdc,0x02,0x00,0x80,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_load_ushort a5, v[2:3] offset:4095 slc
+
+// GFX90A: flat_load_sshort a5, v[2:3] offset:4095 ; encoding: [0xff,0x0f,0x4c,0xdc,0x02,0x00,0x80,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_load_sshort a5, v[2:3] offset:4095
+
+// GFX90A: flat_load_sshort a255, v[2:3] offset:4095 ; encoding: [0xff,0x0f,0x4c,0xdc,0x02,0x00,0x80,0xff]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_load_sshort a255, v[2:3] offset:4095
+
+// GFX90A: flat_load_sshort a5, v[254:255] offset:4095 ; encoding: [0xff,0x0f,0x4c,0xdc,0xfe,0x00,0x80,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_load_sshort a5, v[254:255] offset:4095
+
+// GFX90A: flat_load_sshort a5, v[2:3]     ; encoding: [0x00,0x00,0x4c,0xdc,0x02,0x00,0x80,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_load_sshort a5, v[2:3]
+
+// GFX90A: flat_load_sshort a5, v[2:3]     ; encoding: [0x00,0x00,0x4c,0xdc,0x02,0x00,0x80,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_load_sshort a5, v[2:3]
+
+// GFX90A: flat_load_sshort a5, v[2:3] offset:7 ; encoding: [0x07,0x00,0x4c,0xdc,0x02,0x00,0x80,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_load_sshort a5, v[2:3] offset:7
+
+// GFX90A: flat_load_sshort a5, v[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0x4d,0xdc,0x02,0x00,0x80,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_load_sshort a5, v[2:3] offset:4095 glc
+
+// GFX90A: flat_load_sshort a5, v[2:3] offset:4095 slc ; encoding: [0xff,0x0f,0x4e,0xdc,0x02,0x00,0x80,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_load_sshort a5, v[2:3] offset:4095 slc
+
+// GFX90A: flat_load_dword a5, v[2:3] offset:4095 ; encoding: [0xff,0x0f,0x50,0xdc,0x02,0x00,0x80,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_load_dword a5, v[2:3] offset:4095
+
+// GFX90A: flat_load_dword a255, v[2:3] offset:4095 ; encoding: [0xff,0x0f,0x50,0xdc,0x02,0x00,0x80,0xff]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_load_dword a255, v[2:3] offset:4095
+
+// GFX90A: flat_load_dword a5, v[254:255] offset:4095 ; encoding: [0xff,0x0f,0x50,0xdc,0xfe,0x00,0x80,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_load_dword a5, v[254:255] offset:4095
+
+// GFX90A: flat_load_dword a5, v[2:3]      ; encoding: [0x00,0x00,0x50,0xdc,0x02,0x00,0x80,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_load_dword a5, v[2:3]
+
+// GFX90A: flat_load_dword a5, v[2:3]      ; encoding: [0x00,0x00,0x50,0xdc,0x02,0x00,0x80,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_load_dword a5, v[2:3]
+
+// GFX90A: flat_load_dword a5, v[2:3] offset:7 ; encoding: [0x07,0x00,0x50,0xdc,0x02,0x00,0x80,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_load_dword a5, v[2:3] offset:7
+
+// GFX90A: flat_load_dword a5, v[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0x51,0xdc,0x02,0x00,0x80,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_load_dword a5, v[2:3] offset:4095 glc
+
+// GFX90A: flat_load_dword a5, v[2:3] offset:4095 slc ; encoding: [0xff,0x0f,0x52,0xdc,0x02,0x00,0x80,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_load_dword a5, v[2:3] offset:4095 slc
+
+// GFX90A: flat_load_dwordx2 a[6:7], v[2:3] offset:4095 ; encoding: [0xff,0x0f,0x54,0xdc,0x02,0x00,0x80,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_load_dwordx2 a[6:7], v[2:3] offset:4095
+
+// GFX90A: flat_load_dwordx2 a[254:255], v[2:3] offset:4095 ; encoding: [0xff,0x0f,0x54,0xdc,0x02,0x00,0x80,0xfe]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_load_dwordx2 a[254:255], v[2:3] offset:4095
+
+// GFX90A: flat_load_dwordx2 a[6:7], v[254:255] offset:4095 ; encoding: [0xff,0x0f,0x54,0xdc,0xfe,0x00,0x80,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_load_dwordx2 a[6:7], v[254:255] offset:4095
+
+// GFX90A: flat_load_dwordx2 a[6:7], v[2:3] ; encoding: [0x00,0x00,0x54,0xdc,0x02,0x00,0x80,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_load_dwordx2 a[6:7], v[2:3]
+
+// GFX90A: flat_load_dwordx2 a[6:7], v[2:3] ; encoding: [0x00,0x00,0x54,0xdc,0x02,0x00,0x80,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_load_dwordx2 a[6:7], v[2:3]
+
+// GFX90A: flat_load_dwordx2 a[6:7], v[2:3] offset:7 ; encoding: [0x07,0x00,0x54,0xdc,0x02,0x00,0x80,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_load_dwordx2 a[6:7], v[2:3] offset:7
+
+// GFX90A: flat_load_dwordx2 a[6:7], v[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0x55,0xdc,0x02,0x00,0x80,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_load_dwordx2 a[6:7], v[2:3] offset:4095 glc
+
+// GFX90A: flat_load_dwordx2 a[6:7], v[2:3] offset:4095 slc ; encoding: [0xff,0x0f,0x56,0xdc,0x02,0x00,0x80,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_load_dwordx2 a[6:7], v[2:3] offset:4095 slc
+
+// GFX90A: flat_load_dwordx3 a[6:8], v[2:3] offset:4095 ; encoding: [0xff,0x0f,0x58,0xdc,0x02,0x00,0x80,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_load_dwordx3 a[6:8], v[2:3] offset:4095
+
+// GFX90A: flat_load_dwordx3 a[252:254], v[2:3] offset:4095 ; encoding: [0xff,0x0f,0x58,0xdc,0x02,0x00,0x80,0xfc]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_load_dwordx3 a[252:254], v[2:3] offset:4095
+
+// GFX90A: flat_load_dwordx3 a[6:8], v[254:255] offset:4095 ; encoding: [0xff,0x0f,0x58,0xdc,0xfe,0x00,0x80,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_load_dwordx3 a[6:8], v[254:255] offset:4095
+
+// GFX90A: flat_load_dwordx3 a[6:8], v[2:3] ; encoding: [0x00,0x00,0x58,0xdc,0x02,0x00,0x80,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_load_dwordx3 a[6:8], v[2:3]
+
+// GFX90A: flat_load_dwordx3 a[6:8], v[2:3] ; encoding: [0x00,0x00,0x58,0xdc,0x02,0x00,0x80,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_load_dwordx3 a[6:8], v[2:3]
+
+// GFX90A: flat_load_dwordx3 a[6:8], v[2:3] offset:7 ; encoding: [0x07,0x00,0x58,0xdc,0x02,0x00,0x80,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_load_dwordx3 a[6:8], v[2:3] offset:7
+
+// GFX90A: flat_load_dwordx3 a[6:8], v[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0x59,0xdc,0x02,0x00,0x80,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_load_dwordx3 a[6:8], v[2:3] offset:4095 glc
+
+// GFX90A: flat_load_dwordx3 a[6:8], v[2:3] offset:4095 slc ; encoding: [0xff,0x0f,0x5a,0xdc,0x02,0x00,0x80,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_load_dwordx3 a[6:8], v[2:3] offset:4095 slc
+
+// GFX90A: flat_load_dwordx4 a[6:9], v[2:3] offset:4095 ; encoding: [0xff,0x0f,0x5c,0xdc,0x02,0x00,0x80,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_load_dwordx4 a[6:9], v[2:3] offset:4095
+
+// GFX90A: flat_load_dwordx4 a[252:255], v[2:3] offset:4095 ; encoding: [0xff,0x0f,0x5c,0xdc,0x02,0x00,0x80,0xfc]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_load_dwordx4 a[252:255], v[2:3] offset:4095
+
+// GFX90A: flat_load_dwordx4 a[6:9], v[254:255] offset:4095 ; encoding: [0xff,0x0f,0x5c,0xdc,0xfe,0x00,0x80,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_load_dwordx4 a[6:9], v[254:255] offset:4095
+
+// GFX90A: flat_load_dwordx4 a[6:9], v[2:3] ; encoding: [0x00,0x00,0x5c,0xdc,0x02,0x00,0x80,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_load_dwordx4 a[6:9], v[2:3]
+
+// GFX90A: flat_load_dwordx4 a[6:9], v[2:3] ; encoding: [0x00,0x00,0x5c,0xdc,0x02,0x00,0x80,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_load_dwordx4 a[6:9], v[2:3]
+
+// GFX90A: flat_load_dwordx4 a[6:9], v[2:3] offset:7 ; encoding: [0x07,0x00,0x5c,0xdc,0x02,0x00,0x80,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_load_dwordx4 a[6:9], v[2:3] offset:7
+
+// GFX90A: flat_load_dwordx4 a[6:9], v[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0x5d,0xdc,0x02,0x00,0x80,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_load_dwordx4 a[6:9], v[2:3] offset:4095 glc
+
+// GFX90A: flat_load_dwordx4 a[6:9], v[2:3] offset:4095 slc ; encoding: [0xff,0x0f,0x5e,0xdc,0x02,0x00,0x80,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_load_dwordx4 a[6:9], v[2:3] offset:4095 slc
+
+// GFX90A: flat_store_byte v[2:3], a2 offset:4095 ; encoding: [0xff,0x0f,0x60,0xdc,0x02,0x02,0x80,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_store_byte v[2:3], a2 offset:4095
+
+// GFX90A: flat_store_byte v[254:255], a2 offset:4095 ; encoding: [0xff,0x0f,0x60,0xdc,0xfe,0x02,0x80,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_store_byte v[254:255], a2 offset:4095
+
+// GFX90A: flat_store_byte v[2:3], a255 offset:4095 ; encoding: [0xff,0x0f,0x60,0xdc,0x02,0xff,0x80,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_store_byte v[2:3], a255 offset:4095
+
+// GFX90A: flat_store_byte v[2:3], a2      ; encoding: [0x00,0x00,0x60,0xdc,0x02,0x02,0x80,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_store_byte v[2:3], a2
+
+// GFX90A: flat_store_byte v[2:3], a2      ; encoding: [0x00,0x00,0x60,0xdc,0x02,0x02,0x80,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_store_byte v[2:3], a2
+
+// GFX90A: flat_store_byte v[2:3], a2 offset:7 ; encoding: [0x07,0x00,0x60,0xdc,0x02,0x02,0x80,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_store_byte v[2:3], a2 offset:7
+
+// GFX90A: flat_store_byte v[2:3], a2 offset:4095 glc ; encoding: [0xff,0x0f,0x61,0xdc,0x02,0x02,0x80,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_store_byte v[2:3], a2 offset:4095 glc
+
+// GFX90A: flat_store_byte v[2:3], a2 offset:4095 slc ; encoding: [0xff,0x0f,0x62,0xdc,0x02,0x02,0x80,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_store_byte v[2:3], a2 offset:4095 slc
+
+// GFX90A: flat_store_byte_d16_hi v[2:3], a2 offset:4095 ; encoding: [0xff,0x0f,0x64,0xdc,0x02,0x02,0x80,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_store_byte_d16_hi v[2:3], a2 offset:4095
+
+// GFX90A: flat_store_byte_d16_hi v[254:255], a2 offset:4095 ; encoding: [0xff,0x0f,0x64,0xdc,0xfe,0x02,0x80,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_store_byte_d16_hi v[254:255], a2 offset:4095
+
+// GFX90A: flat_store_byte_d16_hi v[2:3], a255 offset:4095 ; encoding: [0xff,0x0f,0x64,0xdc,0x02,0xff,0x80,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_store_byte_d16_hi v[2:3], a255 offset:4095
+
+// GFX90A: flat_store_byte_d16_hi v[2:3], a2 ; encoding: [0x00,0x00,0x64,0xdc,0x02,0x02,0x80,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_store_byte_d16_hi v[2:3], a2
+
+// GFX90A: flat_store_byte_d16_hi v[2:3], a2 ; encoding: [0x00,0x00,0x64,0xdc,0x02,0x02,0x80,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_store_byte_d16_hi v[2:3], a2
+
+// GFX90A: flat_store_byte_d16_hi v[2:3], a2 offset:7 ; encoding: [0x07,0x00,0x64,0xdc,0x02,0x02,0x80,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_store_byte_d16_hi v[2:3], a2 offset:7
+
+// GFX90A: flat_store_byte_d16_hi v[2:3], a2 offset:4095 glc ; encoding: [0xff,0x0f,0x65,0xdc,0x02,0x02,0x80,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_store_byte_d16_hi v[2:3], a2 offset:4095 glc
+
+// GFX90A: flat_store_byte_d16_hi v[2:3], a2 offset:4095 slc ; encoding: [0xff,0x0f,0x66,0xdc,0x02,0x02,0x80,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_store_byte_d16_hi v[2:3], a2 offset:4095 slc
+
+// GFX90A: flat_store_short v[2:3], a2 offset:4095 ; encoding: [0xff,0x0f,0x68,0xdc,0x02,0x02,0x80,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_store_short v[2:3], a2 offset:4095
+
+// GFX90A: flat_store_short v[254:255], a2 offset:4095 ; encoding: [0xff,0x0f,0x68,0xdc,0xfe,0x02,0x80,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_store_short v[254:255], a2 offset:4095
+
+// GFX90A: flat_store_short v[2:3], a255 offset:4095 ; encoding: [0xff,0x0f,0x68,0xdc,0x02,0xff,0x80,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_store_short v[2:3], a255 offset:4095
+
+// GFX90A: flat_store_short v[2:3], a2     ; encoding: [0x00,0x00,0x68,0xdc,0x02,0x02,0x80,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_store_short v[2:3], a2
+
+// GFX90A: flat_store_short v[2:3], a2     ; encoding: [0x00,0x00,0x68,0xdc,0x02,0x02,0x80,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_store_short v[2:3], a2
+
+// GFX90A: flat_store_short v[2:3], a2 offset:7 ; encoding: [0x07,0x00,0x68,0xdc,0x02,0x02,0x80,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_store_short v[2:3], a2 offset:7
+
+// GFX90A: flat_store_short v[2:3], a2 offset:4095 glc ; encoding: [0xff,0x0f,0x69,0xdc,0x02,0x02,0x80,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_store_short v[2:3], a2 offset:4095 glc
+
+// GFX90A: flat_store_short v[2:3], a2 offset:4095 slc ; encoding: [0xff,0x0f,0x6a,0xdc,0x02,0x02,0x80,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_store_short v[2:3], a2 offset:4095 slc
+
+// GFX90A: flat_store_short_d16_hi v[2:3], a2 offset:4095 ; encoding: [0xff,0x0f,0x6c,0xdc,0x02,0x02,0x80,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_store_short_d16_hi v[2:3], a2 offset:4095
+
+// GFX90A: flat_store_short_d16_hi v[254:255], a2 offset:4095 ; encoding: [0xff,0x0f,0x6c,0xdc,0xfe,0x02,0x80,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_store_short_d16_hi v[254:255], a2 offset:4095
+
+// GFX90A: flat_store_short_d16_hi v[2:3], a255 offset:4095 ; encoding: [0xff,0x0f,0x6c,0xdc,0x02,0xff,0x80,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_store_short_d16_hi v[2:3], a255 offset:4095
+
+// GFX90A: flat_store_short_d16_hi v[2:3], a2 ; encoding: [0x00,0x00,0x6c,0xdc,0x02,0x02,0x80,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_store_short_d16_hi v[2:3], a2
+
+// GFX90A: flat_store_short_d16_hi v[2:3], a2 ; encoding: [0x00,0x00,0x6c,0xdc,0x02,0x02,0x80,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_store_short_d16_hi v[2:3], a2
+
+// GFX90A: flat_store_short_d16_hi v[2:3], a2 offset:7 ; encoding: [0x07,0x00,0x6c,0xdc,0x02,0x02,0x80,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_store_short_d16_hi v[2:3], a2 offset:7
+
+// GFX90A: flat_store_short_d16_hi v[2:3], a2 offset:4095 glc ; encoding: [0xff,0x0f,0x6d,0xdc,0x02,0x02,0x80,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_store_short_d16_hi v[2:3], a2 offset:4095 glc
+
+// GFX90A: flat_store_short_d16_hi v[2:3], a2 offset:4095 slc ; encoding: [0xff,0x0f,0x6e,0xdc,0x02,0x02,0x80,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_store_short_d16_hi v[2:3], a2 offset:4095 slc
+
+// GFX90A: flat_store_dword v[2:3], a2 offset:4095 ; encoding: [0xff,0x0f,0x70,0xdc,0x02,0x02,0x80,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_store_dword v[2:3], a2 offset:4095
+
+// GFX90A: flat_store_dword v[254:255], a2 offset:4095 ; encoding: [0xff,0x0f,0x70,0xdc,0xfe,0x02,0x80,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_store_dword v[254:255], a2 offset:4095
+
+// GFX90A: flat_store_dword v[2:3], a255 offset:4095 ; encoding: [0xff,0x0f,0x70,0xdc,0x02,0xff,0x80,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_store_dword v[2:3], a255 offset:4095
+
+// GFX90A: flat_store_dword v[2:3], a2     ; encoding: [0x00,0x00,0x70,0xdc,0x02,0x02,0x80,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_store_dword v[2:3], a2
+
+// GFX90A: flat_store_dword v[2:3], a2     ; encoding: [0x00,0x00,0x70,0xdc,0x02,0x02,0x80,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_store_dword v[2:3], a2
+
+// GFX90A: flat_store_dword v[2:3], a2 offset:7 ; encoding: [0x07,0x00,0x70,0xdc,0x02,0x02,0x80,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_store_dword v[2:3], a2 offset:7
+
+// GFX90A: flat_store_dword v[2:3], a2 offset:4095 glc ; encoding: [0xff,0x0f,0x71,0xdc,0x02,0x02,0x80,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_store_dword v[2:3], a2 offset:4095 glc
+
+// GFX90A: flat_store_dword v[2:3], a2 offset:4095 slc ; encoding: [0xff,0x0f,0x72,0xdc,0x02,0x02,0x80,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_store_dword v[2:3], a2 offset:4095 slc
+
+// GFX90A: flat_store_dwordx2 v[2:3], a[2:3] offset:4095 ; encoding: [0xff,0x0f,0x74,0xdc,0x02,0x02,0x80,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_store_dwordx2 v[2:3], a[2:3] offset:4095
+
+// GFX90A: flat_store_dwordx2 v[254:255], a[2:3] offset:4095 ; encoding: [0xff,0x0f,0x74,0xdc,0xfe,0x02,0x80,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_store_dwordx2 v[254:255], a[2:3] offset:4095
+
+// GFX90A: flat_store_dwordx2 v[2:3], a[254:255] offset:4095 ; encoding: [0xff,0x0f,0x74,0xdc,0x02,0xfe,0x80,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_store_dwordx2 v[2:3], a[254:255] offset:4095
+
+// GFX90A: flat_store_dwordx2 v[2:3], a[2:3] ; encoding: [0x00,0x00,0x74,0xdc,0x02,0x02,0x80,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_store_dwordx2 v[2:3], a[2:3]
+
+// GFX90A: flat_store_dwordx2 v[2:3], a[2:3] ; encoding: [0x00,0x00,0x74,0xdc,0x02,0x02,0x80,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_store_dwordx2 v[2:3], a[2:3]
+
+// GFX90A: flat_store_dwordx2 v[2:3], a[2:3] offset:7 ; encoding: [0x07,0x00,0x74,0xdc,0x02,0x02,0x80,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_store_dwordx2 v[2:3], a[2:3] offset:7
+
+// GFX90A: flat_store_dwordx2 v[2:3], a[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0x75,0xdc,0x02,0x02,0x80,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_store_dwordx2 v[2:3], a[2:3] offset:4095 glc
+
+// GFX90A: flat_store_dwordx2 v[2:3], a[2:3] offset:4095 slc ; encoding: [0xff,0x0f,0x76,0xdc,0x02,0x02,0x80,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_store_dwordx2 v[2:3], a[2:3] offset:4095 slc
+
+// GFX90A: flat_store_dwordx3 v[2:3], a[2:4] offset:4095 ; encoding: [0xff,0x0f,0x78,0xdc,0x02,0x02,0x80,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_store_dwordx3 v[2:3], a[2:4] offset:4095
+
+// GFX90A: flat_store_dwordx3 v[254:255], a[2:4] offset:4095 ; encoding: [0xff,0x0f,0x78,0xdc,0xfe,0x02,0x80,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_store_dwordx3 v[254:255], a[2:4] offset:4095
+
+// GFX90A: flat_store_dwordx3 v[2:3], a[252:254] offset:4095 ; encoding: [0xff,0x0f,0x78,0xdc,0x02,0xfc,0x80,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_store_dwordx3 v[2:3], a[252:254] offset:4095
+
+// GFX90A: flat_store_dwordx3 v[2:3], a[2:4] ; encoding: [0x00,0x00,0x78,0xdc,0x02,0x02,0x80,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_store_dwordx3 v[2:3], a[2:4]
+
+// GFX90A: flat_store_dwordx3 v[2:3], a[2:4] ; encoding: [0x00,0x00,0x78,0xdc,0x02,0x02,0x80,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_store_dwordx3 v[2:3], a[2:4]
+
+// GFX90A: flat_store_dwordx3 v[2:3], a[2:4] offset:7 ; encoding: [0x07,0x00,0x78,0xdc,0x02,0x02,0x80,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_store_dwordx3 v[2:3], a[2:4] offset:7
+
+// GFX90A: flat_store_dwordx3 v[2:3], a[2:4] offset:4095 glc ; encoding: [0xff,0x0f,0x79,0xdc,0x02,0x02,0x80,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_store_dwordx3 v[2:3], a[2:4] offset:4095 glc
+
+// GFX90A: flat_store_dwordx3 v[2:3], a[2:4] offset:4095 slc ; encoding: [0xff,0x0f,0x7a,0xdc,0x02,0x02,0x80,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_store_dwordx3 v[2:3], a[2:4] offset:4095 slc
+
+// GFX90A: flat_store_dwordx4 v[2:3], a[2:5] offset:4095 ; encoding: [0xff,0x0f,0x7c,0xdc,0x02,0x02,0x80,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_store_dwordx4 v[2:3], a[2:5] offset:4095
+
+// GFX90A: flat_store_dwordx4 v[254:255], a[2:5] offset:4095 ; encoding: [0xff,0x0f,0x7c,0xdc,0xfe,0x02,0x80,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_store_dwordx4 v[254:255], a[2:5] offset:4095
+
+// GFX90A: flat_store_dwordx4 v[2:3], a[252:255] offset:4095 ; encoding: [0xff,0x0f,0x7c,0xdc,0x02,0xfc,0x80,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_store_dwordx4 v[2:3], a[252:255] offset:4095
+
+// GFX90A: flat_store_dwordx4 v[2:3], a[2:5] ; encoding: [0x00,0x00,0x7c,0xdc,0x02,0x02,0x80,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_store_dwordx4 v[2:3], a[2:5]
+
+// GFX90A: flat_store_dwordx4 v[2:3], a[2:5] ; encoding: [0x00,0x00,0x7c,0xdc,0x02,0x02,0x80,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_store_dwordx4 v[2:3], a[2:5]
+
+// GFX90A: flat_store_dwordx4 v[2:3], a[2:5] offset:7 ; encoding: [0x07,0x00,0x7c,0xdc,0x02,0x02,0x80,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_store_dwordx4 v[2:3], a[2:5] offset:7
+
+// GFX90A: flat_store_dwordx4 v[2:3], a[2:5] offset:4095 glc ; encoding: [0xff,0x0f,0x7d,0xdc,0x02,0x02,0x80,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_store_dwordx4 v[2:3], a[2:5] offset:4095 glc
+
+// GFX90A: flat_store_dwordx4 v[2:3], a[2:5] offset:4095 slc ; encoding: [0xff,0x0f,0x7e,0xdc,0x02,0x02,0x80,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_store_dwordx4 v[2:3], a[2:5] offset:4095 slc
+
+// GFX90A: flat_load_ubyte_d16 a5, v[2:3] offset:4095 ; encoding: [0xff,0x0f,0x80,0xdc,0x02,0x00,0x80,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_load_ubyte_d16 a5, v[2:3] offset:4095
+
+// GFX90A: flat_load_ubyte_d16 a255, v[2:3] offset:4095 ; encoding: [0xff,0x0f,0x80,0xdc,0x02,0x00,0x80,0xff]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_load_ubyte_d16 a255, v[2:3] offset:4095
+
+// GFX90A: flat_load_ubyte_d16 a5, v[254:255] offset:4095 ; encoding: [0xff,0x0f,0x80,0xdc,0xfe,0x00,0x80,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_load_ubyte_d16 a5, v[254:255] offset:4095
+
+// GFX90A: flat_load_ubyte_d16 a5, v[2:3]  ; encoding: [0x00,0x00,0x80,0xdc,0x02,0x00,0x80,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_load_ubyte_d16 a5, v[2:3]
+
+// GFX90A: flat_load_ubyte_d16 a5, v[2:3]  ; encoding: [0x00,0x00,0x80,0xdc,0x02,0x00,0x80,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_load_ubyte_d16 a5, v[2:3]
+
+// GFX90A: flat_load_ubyte_d16 a5, v[2:3] offset:7 ; encoding: [0x07,0x00,0x80,0xdc,0x02,0x00,0x80,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_load_ubyte_d16 a5, v[2:3] offset:7
+
+// GFX90A: flat_load_ubyte_d16 a5, v[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0x81,0xdc,0x02,0x00,0x80,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_load_ubyte_d16 a5, v[2:3] offset:4095 glc
+
+// GFX90A: flat_load_ubyte_d16 a5, v[2:3] offset:4095 slc ; encoding: [0xff,0x0f,0x82,0xdc,0x02,0x00,0x80,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_load_ubyte_d16 a5, v[2:3] offset:4095 slc
+
+// GFX90A: flat_load_ubyte_d16_hi a5, v[2:3] offset:4095 ; encoding: [0xff,0x0f,0x84,0xdc,0x02,0x00,0x80,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_load_ubyte_d16_hi a5, v[2:3] offset:4095
+
+// GFX90A: flat_load_ubyte_d16_hi a255, v[2:3] offset:4095 ; encoding: [0xff,0x0f,0x84,0xdc,0x02,0x00,0x80,0xff]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_load_ubyte_d16_hi a255, v[2:3] offset:4095
+
+// GFX90A: flat_load_ubyte_d16_hi a5, v[254:255] offset:4095 ; encoding: [0xff,0x0f,0x84,0xdc,0xfe,0x00,0x80,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_load_ubyte_d16_hi a5, v[254:255] offset:4095
+
+// GFX90A: flat_load_ubyte_d16_hi a5, v[2:3] ; encoding: [0x00,0x00,0x84,0xdc,0x02,0x00,0x80,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_load_ubyte_d16_hi a5, v[2:3]
+
+// GFX90A: flat_load_ubyte_d16_hi a5, v[2:3] ; encoding: [0x00,0x00,0x84,0xdc,0x02,0x00,0x80,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_load_ubyte_d16_hi a5, v[2:3]
+
+// GFX90A: flat_load_ubyte_d16_hi a5, v[2:3] offset:7 ; encoding: [0x07,0x00,0x84,0xdc,0x02,0x00,0x80,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_load_ubyte_d16_hi a5, v[2:3] offset:7
+
+// GFX90A: flat_load_ubyte_d16_hi a5, v[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0x85,0xdc,0x02,0x00,0x80,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_load_ubyte_d16_hi a5, v[2:3] offset:4095 glc
+
+// GFX90A: flat_load_ubyte_d16_hi a5, v[2:3] offset:4095 slc ; encoding: [0xff,0x0f,0x86,0xdc,0x02,0x00,0x80,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_load_ubyte_d16_hi a5, v[2:3] offset:4095 slc
+
+// GFX90A: flat_load_sbyte_d16 a5, v[2:3] offset:4095 ; encoding: [0xff,0x0f,0x88,0xdc,0x02,0x00,0x80,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_load_sbyte_d16 a5, v[2:3] offset:4095
+
+// GFX90A: flat_load_sbyte_d16 a255, v[2:3] offset:4095 ; encoding: [0xff,0x0f,0x88,0xdc,0x02,0x00,0x80,0xff]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_load_sbyte_d16 a255, v[2:3] offset:4095
+
+// GFX90A: flat_load_sbyte_d16 a5, v[254:255] offset:4095 ; encoding: [0xff,0x0f,0x88,0xdc,0xfe,0x00,0x80,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_load_sbyte_d16 a5, v[254:255] offset:4095
+
+// GFX90A: flat_load_sbyte_d16 a5, v[2:3]  ; encoding: [0x00,0x00,0x88,0xdc,0x02,0x00,0x80,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_load_sbyte_d16 a5, v[2:3]
+
+// GFX90A: flat_load_sbyte_d16 a5, v[2:3]  ; encoding: [0x00,0x00,0x88,0xdc,0x02,0x00,0x80,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_load_sbyte_d16 a5, v[2:3]
+
+// GFX90A: flat_load_sbyte_d16 a5, v[2:3] offset:7 ; encoding: [0x07,0x00,0x88,0xdc,0x02,0x00,0x80,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_load_sbyte_d16 a5, v[2:3] offset:7
+
+// GFX90A: flat_load_sbyte_d16 a5, v[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0x89,0xdc,0x02,0x00,0x80,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_load_sbyte_d16 a5, v[2:3] offset:4095 glc
+
+// GFX90A: flat_load_sbyte_d16 a5, v[2:3] offset:4095 slc ; encoding: [0xff,0x0f,0x8a,0xdc,0x02,0x00,0x80,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_load_sbyte_d16 a5, v[2:3] offset:4095 slc
+
+// GFX90A: flat_load_sbyte_d16_hi a5, v[2:3] offset:4095 ; encoding: [0xff,0x0f,0x8c,0xdc,0x02,0x00,0x80,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_load_sbyte_d16_hi a5, v[2:3] offset:4095
+
+// GFX90A: flat_load_sbyte_d16_hi a255, v[2:3] offset:4095 ; encoding: [0xff,0x0f,0x8c,0xdc,0x02,0x00,0x80,0xff]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_load_sbyte_d16_hi a255, v[2:3] offset:4095
+
+// GFX90A: flat_load_sbyte_d16_hi a5, v[254:255] offset:4095 ; encoding: [0xff,0x0f,0x8c,0xdc,0xfe,0x00,0x80,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_load_sbyte_d16_hi a5, v[254:255] offset:4095
+
+// GFX90A: flat_load_sbyte_d16_hi a5, v[2:3] ; encoding: [0x00,0x00,0x8c,0xdc,0x02,0x00,0x80,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_load_sbyte_d16_hi a5, v[2:3]
+
+// GFX90A: flat_load_sbyte_d16_hi a5, v[2:3] ; encoding: [0x00,0x00,0x8c,0xdc,0x02,0x00,0x80,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_load_sbyte_d16_hi a5, v[2:3]
+
+// GFX90A: flat_load_sbyte_d16_hi a5, v[2:3] offset:7 ; encoding: [0x07,0x00,0x8c,0xdc,0x02,0x00,0x80,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_load_sbyte_d16_hi a5, v[2:3] offset:7
+
+// GFX90A: flat_load_sbyte_d16_hi a5, v[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0x8d,0xdc,0x02,0x00,0x80,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_load_sbyte_d16_hi a5, v[2:3] offset:4095 glc
+
+// GFX90A: flat_load_sbyte_d16_hi a5, v[2:3] offset:4095 slc ; encoding: [0xff,0x0f,0x8e,0xdc,0x02,0x00,0x80,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_load_sbyte_d16_hi a5, v[2:3] offset:4095 slc
+
+// GFX90A: flat_load_short_d16 a5, v[2:3] offset:4095 ; encoding: [0xff,0x0f,0x90,0xdc,0x02,0x00,0x80,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_load_short_d16 a5, v[2:3] offset:4095
+
+// GFX90A: flat_load_short_d16 a255, v[2:3] offset:4095 ; encoding: [0xff,0x0f,0x90,0xdc,0x02,0x00,0x80,0xff]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_load_short_d16 a255, v[2:3] offset:4095
+
+// GFX90A: flat_load_short_d16 a5, v[254:255] offset:4095 ; encoding: [0xff,0x0f,0x90,0xdc,0xfe,0x00,0x80,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_load_short_d16 a5, v[254:255] offset:4095
+
+// GFX90A: flat_load_short_d16 a5, v[2:3]  ; encoding: [0x00,0x00,0x90,0xdc,0x02,0x00,0x80,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_load_short_d16 a5, v[2:3]
+
+// GFX90A: flat_load_short_d16 a5, v[2:3]  ; encoding: [0x00,0x00,0x90,0xdc,0x02,0x00,0x80,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_load_short_d16 a5, v[2:3]
+
+// GFX90A: flat_load_short_d16 a5, v[2:3] offset:7 ; encoding: [0x07,0x00,0x90,0xdc,0x02,0x00,0x80,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_load_short_d16 a5, v[2:3] offset:7
+
+// GFX90A: flat_load_short_d16 a5, v[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0x91,0xdc,0x02,0x00,0x80,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_load_short_d16 a5, v[2:3] offset:4095 glc
+
+// GFX90A: flat_load_short_d16 a5, v[2:3] offset:4095 slc ; encoding: [0xff,0x0f,0x92,0xdc,0x02,0x00,0x80,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_load_short_d16 a5, v[2:3] offset:4095 slc
+
+// GFX90A: flat_load_short_d16_hi a5, v[2:3] offset:4095 ; encoding: [0xff,0x0f,0x94,0xdc,0x02,0x00,0x80,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_load_short_d16_hi a5, v[2:3] offset:4095
+
+// GFX90A: flat_load_short_d16_hi a255, v[2:3] offset:4095 ; encoding: [0xff,0x0f,0x94,0xdc,0x02,0x00,0x80,0xff]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_load_short_d16_hi a255, v[2:3] offset:4095
+
+// GFX90A: flat_load_short_d16_hi a5, v[254:255] offset:4095 ; encoding: [0xff,0x0f,0x94,0xdc,0xfe,0x00,0x80,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_load_short_d16_hi a5, v[254:255] offset:4095
+
+// GFX90A: flat_load_short_d16_hi a5, v[2:3] ; encoding: [0x00,0x00,0x94,0xdc,0x02,0x00,0x80,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_load_short_d16_hi a5, v[2:3]
+
+// GFX90A: flat_load_short_d16_hi a5, v[2:3] ; encoding: [0x00,0x00,0x94,0xdc,0x02,0x00,0x80,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_load_short_d16_hi a5, v[2:3]
+
+// GFX90A: flat_load_short_d16_hi a5, v[2:3] offset:7 ; encoding: [0x07,0x00,0x94,0xdc,0x02,0x00,0x80,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_load_short_d16_hi a5, v[2:3] offset:7
+
+// GFX90A: flat_load_short_d16_hi a5, v[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0x95,0xdc,0x02,0x00,0x80,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_load_short_d16_hi a5, v[2:3] offset:4095 glc
+
+// GFX90A: flat_load_short_d16_hi a5, v[2:3] offset:4095 slc ; encoding: [0xff,0x0f,0x96,0xdc,0x02,0x00,0x80,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_load_short_d16_hi a5, v[2:3] offset:4095 slc
+
+// GFX90A: flat_atomic_swap a0, v[2:3], a2 offset:4095 glc ; encoding: [0xff,0x0f,0x01,0xdd,0x02,0x02,0x80,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_atomic_swap a0, v[2:3], a2 offset:4095 glc
+
+// GFX90A: flat_atomic_cmpswap a0, v[2:3], a[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0x05,0xdd,0x02,0x02,0x80,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_atomic_cmpswap a0, v[2:3], a[2:3] offset:4095 glc
+
+// GFX90A: flat_atomic_add a0, v[2:3], a2 offset:4095 glc ; encoding: [0xff,0x0f,0x09,0xdd,0x02,0x02,0x80,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_atomic_add a0, v[2:3], a2 offset:4095 glc
+
+// GFX90A: flat_atomic_sub a0, v[2:3], a2 offset:4095 glc ; encoding: [0xff,0x0f,0x0d,0xdd,0x02,0x02,0x80,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_atomic_sub a0, v[2:3], a2 offset:4095 glc
+
+// GFX90A: flat_atomic_smin a0, v[2:3], a2 offset:4095 glc ; encoding: [0xff,0x0f,0x11,0xdd,0x02,0x02,0x80,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_atomic_smin a0, v[2:3], a2 offset:4095 glc
+
+// GFX90A: flat_atomic_umin a0, v[2:3], a2 offset:4095 glc ; encoding: [0xff,0x0f,0x15,0xdd,0x02,0x02,0x80,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_atomic_umin a0, v[2:3], a2 offset:4095 glc
+
+// GFX90A: flat_atomic_smax a0, v[2:3], a2 offset:4095 glc ; encoding: [0xff,0x0f,0x19,0xdd,0x02,0x02,0x80,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_atomic_smax a0, v[2:3], a2 offset:4095 glc
+
+// GFX90A: flat_atomic_umax a0, v[2:3], a2 offset:4095 glc ; encoding: [0xff,0x0f,0x1d,0xdd,0x02,0x02,0x80,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_atomic_umax a0, v[2:3], a2 offset:4095 glc
+
+// GFX90A: flat_atomic_and a0, v[2:3], a2 offset:4095 glc ; encoding: [0xff,0x0f,0x21,0xdd,0x02,0x02,0x80,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_atomic_and a0, v[2:3], a2 offset:4095 glc
+
+// GFX90A: flat_atomic_or a0, v[2:3], a2 offset:4095 glc ; encoding: [0xff,0x0f,0x25,0xdd,0x02,0x02,0x80,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_atomic_or a0, v[2:3], a2 offset:4095 glc
+
+// GFX90A: flat_atomic_xor a0, v[2:3], a2 offset:4095 glc ; encoding: [0xff,0x0f,0x29,0xdd,0x02,0x02,0x80,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_atomic_xor a0, v[2:3], a2 offset:4095 glc
+
+// GFX90A: flat_atomic_inc a0, v[2:3], a2 offset:4095 glc ; encoding: [0xff,0x0f,0x2d,0xdd,0x02,0x02,0x80,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_atomic_inc a0, v[2:3], a2 offset:4095 glc
+
+// GFX90A: flat_atomic_dec a0, v[2:3], a2 offset:4095 glc ; encoding: [0xff,0x0f,0x31,0xdd,0x02,0x02,0x80,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_atomic_dec a0, v[2:3], a2 offset:4095 glc
+
+// GFX90A: flat_atomic_swap_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0x81,0xdd,0x02,0x02,0x80,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_atomic_swap_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc
+
+// GFX90A: flat_atomic_cmpswap_x2 a[0:1], v[2:3], a[2:5] offset:4095 glc ; encoding: [0xff,0x0f,0x85,0xdd,0x02,0x02,0x80,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_atomic_cmpswap_x2 a[0:1], v[2:3], a[2:5] offset:4095 glc
+
+// GFX90A: flat_atomic_add_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0x89,0xdd,0x02,0x02,0x80,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_atomic_add_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc
+
+// GFX90A: flat_atomic_sub_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0x8d,0xdd,0x02,0x02,0x80,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_atomic_sub_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc
+
+// GFX90A: flat_atomic_smin_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0x91,0xdd,0x02,0x02,0x80,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_atomic_smin_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc
+
+// GFX90A: flat_atomic_umin_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0x95,0xdd,0x02,0x02,0x80,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_atomic_umin_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc
+
+// GFX90A: flat_atomic_smax_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0x99,0xdd,0x02,0x02,0x80,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_atomic_smax_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc
+
+// GFX90A: flat_atomic_umax_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0x9d,0xdd,0x02,0x02,0x80,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_atomic_umax_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc
+
+// GFX90A: flat_atomic_and_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0xa1,0xdd,0x02,0x02,0x80,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_atomic_and_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc
+
+// GFX90A: flat_atomic_or_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0xa5,0xdd,0x02,0x02,0x80,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_atomic_or_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc
+
+// GFX90A: flat_atomic_xor_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0xa9,0xdd,0x02,0x02,0x80,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_atomic_xor_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc
+
+// GFX90A: flat_atomic_inc_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0xad,0xdd,0x02,0x02,0x80,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_atomic_inc_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc
+
+// GFX90A: flat_atomic_dec_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0xb1,0xdd,0x02,0x02,0x80,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_atomic_dec_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc
+
+// GFX90A: flat_atomic_swap v[2:3], a2 offset:4095 ; encoding: [0xff,0x0f,0x00,0xdd,0x02,0x02,0x80,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_atomic_swap v[2:3], a2 offset:4095
+
+// GFX90A: flat_atomic_cmpswap v[2:3], a[2:3] offset:4095 ; encoding: [0xff,0x0f,0x04,0xdd,0x02,0x02,0x80,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_atomic_cmpswap v[2:3], a[2:3] offset:4095
+
+// GFX90A: flat_atomic_add v[2:3], a2 offset:4095 ; encoding: [0xff,0x0f,0x08,0xdd,0x02,0x02,0x80,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_atomic_add v[2:3], a2 offset:4095
+
+// GFX90A: flat_atomic_sub v[2:3], a2 offset:4095 ; encoding: [0xff,0x0f,0x0c,0xdd,0x02,0x02,0x80,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_atomic_sub v[2:3], a2 offset:4095
+
+// GFX90A: flat_atomic_smin v[2:3], a2 offset:4095 ; encoding: [0xff,0x0f,0x10,0xdd,0x02,0x02,0x80,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_atomic_smin v[2:3], a2 offset:4095
+
+// GFX90A: flat_atomic_umin v[2:3], a2 offset:4095 ; encoding: [0xff,0x0f,0x14,0xdd,0x02,0x02,0x80,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_atomic_umin v[2:3], a2 offset:4095
+
+// GFX90A: flat_atomic_smax v[2:3], a2 offset:4095 ; encoding: [0xff,0x0f,0x18,0xdd,0x02,0x02,0x80,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_atomic_smax v[2:3], a2 offset:4095
+
+// GFX90A: flat_atomic_umax v[2:3], a2 offset:4095 ; encoding: [0xff,0x0f,0x1c,0xdd,0x02,0x02,0x80,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_atomic_umax v[2:3], a2 offset:4095
+
+// GFX90A: flat_atomic_and v[2:3], a2 offset:4095 ; encoding: [0xff,0x0f,0x20,0xdd,0x02,0x02,0x80,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_atomic_and v[2:3], a2 offset:4095
+
+// GFX90A: flat_atomic_or v[2:3], a2 offset:4095 ; encoding: [0xff,0x0f,0x24,0xdd,0x02,0x02,0x80,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_atomic_or v[2:3], a2 offset:4095
+
+// GFX90A: flat_atomic_xor v[2:3], a2 offset:4095 ; encoding: [0xff,0x0f,0x28,0xdd,0x02,0x02,0x80,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_atomic_xor v[2:3], a2 offset:4095
+
+// GFX90A: flat_atomic_inc v[2:3], a2 offset:4095 ; encoding: [0xff,0x0f,0x2c,0xdd,0x02,0x02,0x80,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_atomic_inc v[2:3], a2 offset:4095
+
+// GFX90A: flat_atomic_dec v[2:3], a2 offset:4095 ; encoding: [0xff,0x0f,0x30,0xdd,0x02,0x02,0x80,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_atomic_dec v[2:3], a2 offset:4095
+
+// GFX90A: flat_atomic_swap_x2 v[2:3], a[2:3] offset:4095 ; encoding: [0xff,0x0f,0x80,0xdd,0x02,0x02,0x80,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_atomic_swap_x2 v[2:3], a[2:3] offset:4095
+
+// GFX90A: flat_atomic_cmpswap_x2 v[2:3], a[2:5] offset:4095 ; encoding: [0xff,0x0f,0x84,0xdd,0x02,0x02,0x80,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_atomic_cmpswap_x2 v[2:3], a[2:5] offset:4095
+
+// GFX90A: flat_atomic_add_x2 v[2:3], a[2:3] offset:4095 ; encoding: [0xff,0x0f,0x88,0xdd,0x02,0x02,0x80,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_atomic_add_x2 v[2:3], a[2:3] offset:4095
+
+// GFX90A: flat_atomic_sub_x2 v[2:3], a[2:3] offset:4095 ; encoding: [0xff,0x0f,0x8c,0xdd,0x02,0x02,0x80,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_atomic_sub_x2 v[2:3], a[2:3] offset:4095
+
+// GFX90A: flat_atomic_smin_x2 v[2:3], a[2:3] offset:4095 ; encoding: [0xff,0x0f,0x90,0xdd,0x02,0x02,0x80,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_atomic_smin_x2 v[2:3], a[2:3] offset:4095
+
+// GFX90A: flat_atomic_umin_x2 v[2:3], a[2:3] offset:4095 ; encoding: [0xff,0x0f,0x94,0xdd,0x02,0x02,0x80,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_atomic_umin_x2 v[2:3], a[2:3] offset:4095
+
+// GFX90A: flat_atomic_smax_x2 v[2:3], a[2:3] offset:4095 ; encoding: [0xff,0x0f,0x98,0xdd,0x02,0x02,0x80,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_atomic_smax_x2 v[2:3], a[2:3] offset:4095
+
+// GFX90A: flat_atomic_umax_x2 v[2:3], a[2:3] offset:4095 ; encoding: [0xff,0x0f,0x9c,0xdd,0x02,0x02,0x80,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_atomic_umax_x2 v[2:3], a[2:3] offset:4095
+
+// GFX90A: flat_atomic_and_x2 v[2:3], a[2:3] offset:4095 ; encoding: [0xff,0x0f,0xa0,0xdd,0x02,0x02,0x80,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_atomic_and_x2 v[2:3], a[2:3] offset:4095
+
+// GFX90A: flat_atomic_or_x2 v[2:3], a[2:3] offset:4095 ; encoding: [0xff,0x0f,0xa4,0xdd,0x02,0x02,0x80,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_atomic_or_x2 v[2:3], a[2:3] offset:4095
+
+// GFX90A: flat_atomic_xor_x2 v[2:3], a[2:3] offset:4095 ; encoding: [0xff,0x0f,0xa8,0xdd,0x02,0x02,0x80,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_atomic_xor_x2 v[2:3], a[2:3] offset:4095
+
+// GFX90A: flat_atomic_inc_x2 v[2:3], a[2:3] offset:4095 ; encoding: [0xff,0x0f,0xac,0xdd,0x02,0x02,0x80,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_atomic_inc_x2 v[2:3], a[2:3] offset:4095
+
+// GFX90A: flat_atomic_dec_x2 v[2:3], a[2:3] offset:4095 ; encoding: [0xff,0x0f,0xb0,0xdd,0x02,0x02,0x80,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+flat_atomic_dec_x2 v[2:3], a[2:3] offset:4095
+
+// GFX90A: global_load_ubyte a5, v[2:3], off offset:-1 ; encoding: [0xff,0x9f,0x40,0xdc,0x02,0x00,0xff,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+global_load_ubyte a5, v[2:3], off offset:-1
+
+// GFX90A: global_load_ubyte a255, v[2:3], off offset:-1 ; encoding: [0xff,0x9f,0x40,0xdc,0x02,0x00,0xff,0xff]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+global_load_ubyte a255, v[2:3], off offset:-1
+
+// GFX90A: global_load_ubyte a5, v[2:3], off ; encoding: [0x00,0x80,0x40,0xdc,0x02,0x00,0xff,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+global_load_ubyte a5, v[2:3], off
+
+// GFX90A: global_load_sbyte a5, v[2:3], off offset:-1 ; encoding: [0xff,0x9f,0x44,0xdc,0x02,0x00,0xff,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+global_load_sbyte a5, v[2:3], off offset:-1
+
+// GFX90A: global_load_sbyte a255, v[2:3], off offset:-1 ; encoding: [0xff,0x9f,0x44,0xdc,0x02,0x00,0xff,0xff]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+global_load_sbyte a255, v[2:3], off offset:-1
+
+// GFX90A: global_load_sbyte a5, v[2:3], off ; encoding: [0x00,0x80,0x44,0xdc,0x02,0x00,0xff,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+global_load_sbyte a5, v[2:3], off
+
+// GFX90A: global_load_ushort a5, v[2:3], off offset:-1 ; encoding: [0xff,0x9f,0x48,0xdc,0x02,0x00,0xff,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+global_load_ushort a5, v[2:3], off offset:-1
+
+// GFX90A: global_load_ushort a255, v[2:3], off offset:-1 ; encoding: [0xff,0x9f,0x48,0xdc,0x02,0x00,0xff,0xff]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+global_load_ushort a255, v[2:3], off offset:-1
+
+// GFX90A: global_load_ushort a5, v[2:3], off ; encoding: [0x00,0x80,0x48,0xdc,0x02,0x00,0xff,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+global_load_ushort a5, v[2:3], off
+
+// GFX90A: global_load_sshort a5, v[2:3], off offset:-1 ; encoding: [0xff,0x9f,0x4c,0xdc,0x02,0x00,0xff,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+global_load_sshort a5, v[2:3], off offset:-1
+
+// GFX90A: global_load_sshort a255, v[2:3], off offset:-1 ; encoding: [0xff,0x9f,0x4c,0xdc,0x02,0x00,0xff,0xff]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+global_load_sshort a255, v[2:3], off offset:-1
+
+// GFX90A: global_load_sshort a5, v[2:3], off ; encoding: [0x00,0x80,0x4c,0xdc,0x02,0x00,0xff,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+global_load_sshort a5, v[2:3], off
+
+// GFX90A: global_load_dword a5, v[2:3], off offset:-1 ; encoding: [0xff,0x9f,0x50,0xdc,0x02,0x00,0xff,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+global_load_dword a5, v[2:3], off offset:-1
+
+// GFX90A: global_load_dword a255, v[2:3], off offset:-1 ; encoding: [0xff,0x9f,0x50,0xdc,0x02,0x00,0xff,0xff]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+global_load_dword a255, v[2:3], off offset:-1
+
+// GFX90A: global_load_dword a5, v[2:3], off ; encoding: [0x00,0x80,0x50,0xdc,0x02,0x00,0xff,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+global_load_dword a5, v[2:3], off
+
+// GFX90A: global_load_dwordx2 a[6:7], v[2:3], off offset:-1 ; encoding: [0xff,0x9f,0x54,0xdc,0x02,0x00,0xff,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+global_load_dwordx2 a[6:7], v[2:3], off offset:-1
+
+// GFX90A: global_load_dwordx2 a[254:255], v[2:3], off offset:-1 ; encoding: [0xff,0x9f,0x54,0xdc,0x02,0x00,0xff,0xfe]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+global_load_dwordx2 a[254:255], v[2:3], off offset:-1
+
+// GFX90A: global_load_dwordx2 a[6:7], v[2:3], off ; encoding: [0x00,0x80,0x54,0xdc,0x02,0x00,0xff,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+global_load_dwordx2 a[6:7], v[2:3], off
+
+// GFX90A: global_load_dwordx3 a[6:8], v[2:3], off offset:-1 ; encoding: [0xff,0x9f,0x58,0xdc,0x02,0x00,0xff,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+global_load_dwordx3 a[6:8], v[2:3], off offset:-1
+
+// GFX90A: global_load_dwordx3 a[252:254], v[2:3], off offset:-1 ; encoding: [0xff,0x9f,0x58,0xdc,0x02,0x00,0xff,0xfc]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+global_load_dwordx3 a[252:254], v[2:3], off offset:-1
+
+// GFX90A: global_load_dwordx3 a[6:8], v[2:3], off ; encoding: [0x00,0x80,0x58,0xdc,0x02,0x00,0xff,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+global_load_dwordx3 a[6:8], v[2:3], off
+
+// GFX90A: global_load_dwordx4 a[6:9], v[2:3], off offset:-1 ; encoding: [0xff,0x9f,0x5c,0xdc,0x02,0x00,0xff,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+global_load_dwordx4 a[6:9], v[2:3], off offset:-1
+
+// GFX90A: global_load_dwordx4 a[252:255], v[2:3], off offset:-1 ; encoding: [0xff,0x9f,0x5c,0xdc,0x02,0x00,0xff,0xfc]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+global_load_dwordx4 a[252:255], v[2:3], off offset:-1
+
+// GFX90A: global_load_dwordx4 a[6:9], v[2:3], off ; encoding: [0x00,0x80,0x5c,0xdc,0x02,0x00,0xff,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+global_load_dwordx4 a[6:9], v[2:3], off
+
+// GFX90A: global_store_byte v[2:3], a2, off offset:-1 ; encoding: [0xff,0x9f,0x60,0xdc,0x02,0x02,0xff,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+global_store_byte v[2:3], a2, off offset:-1
+
+// GFX90A: global_store_byte v[2:3], a255, off offset:-1 ; encoding: [0xff,0x9f,0x60,0xdc,0x02,0xff,0xff,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+global_store_byte v[2:3], a255, off offset:-1
+
+// GFX90A: global_store_byte v[2:3], a2, off ; encoding: [0x00,0x80,0x60,0xdc,0x02,0x02,0xff,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+global_store_byte v[2:3], a2, off
+
+// GFX90A: global_store_byte_d16_hi v[2:3], a2, off offset:-1 ; encoding: [0xff,0x9f,0x64,0xdc,0x02,0x02,0xff,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+global_store_byte_d16_hi v[2:3], a2, off offset:-1
+
+// GFX90A: global_store_byte_d16_hi v[2:3], a255, off offset:-1 ; encoding: [0xff,0x9f,0x64,0xdc,0x02,0xff,0xff,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+global_store_byte_d16_hi v[2:3], a255, off offset:-1
+
+// GFX90A: global_store_byte_d16_hi v[2:3], a2, off ; encoding: [0x00,0x80,0x64,0xdc,0x02,0x02,0xff,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+global_store_byte_d16_hi v[2:3], a2, off
+
+// GFX90A: global_store_short v[2:3], a2, off offset:-1 ; encoding: [0xff,0x9f,0x68,0xdc,0x02,0x02,0xff,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+global_store_short v[2:3], a2, off offset:-1
+
+// GFX90A: global_store_short v[2:3], a255, off offset:-1 ; encoding: [0xff,0x9f,0x68,0xdc,0x02,0xff,0xff,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+global_store_short v[2:3], a255, off offset:-1
+
+// GFX90A: global_store_short v[2:3], a2, off ; encoding: [0x00,0x80,0x68,0xdc,0x02,0x02,0xff,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+global_store_short v[2:3], a2, off
+
+// GFX90A: global_store_short_d16_hi v[2:3], a2, off offset:-1 ; encoding: [0xff,0x9f,0x6c,0xdc,0x02,0x02,0xff,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+global_store_short_d16_hi v[2:3], a2, off offset:-1
+
+// GFX90A: global_store_short_d16_hi v[2:3], a255, off offset:-1 ; encoding: [0xff,0x9f,0x6c,0xdc,0x02,0xff,0xff,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+global_store_short_d16_hi v[2:3], a255, off offset:-1
+
+// GFX90A: global_store_short_d16_hi v[2:3], a2, off ; encoding: [0x00,0x80,0x6c,0xdc,0x02,0x02,0xff,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+global_store_short_d16_hi v[2:3], a2, off
+
+// GFX90A: global_store_dword v[2:3], a2, off offset:-1 ; encoding: [0xff,0x9f,0x70,0xdc,0x02,0x02,0xff,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+global_store_dword v[2:3], a2, off offset:-1
+
+// GFX90A: global_store_dword v[2:3], a255, off offset:-1 ; encoding: [0xff,0x9f,0x70,0xdc,0x02,0xff,0xff,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+global_store_dword v[2:3], a255, off offset:-1
+
+// GFX90A: global_store_dword v[2:3], a2, off ; encoding: [0x00,0x80,0x70,0xdc,0x02,0x02,0xff,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+global_store_dword v[2:3], a2, off
+
+// GFX90A: global_store_dwordx2 v[2:3], a[2:3], off offset:-1 ; encoding: [0xff,0x9f,0x74,0xdc,0x02,0x02,0xff,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+global_store_dwordx2 v[2:3], a[2:3], off offset:-1
+
+// GFX90A: global_store_dwordx2 v[2:3], a[254:255], off offset:-1 ; encoding: [0xff,0x9f,0x74,0xdc,0x02,0xfe,0xff,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+global_store_dwordx2 v[2:3], a[254:255], off offset:-1
+
+// GFX90A: global_store_dwordx2 v[2:3], a[2:3], off ; encoding: [0x00,0x80,0x74,0xdc,0x02,0x02,0xff,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+global_store_dwordx2 v[2:3], a[2:3], off
+
+// GFX90A: global_store_dwordx3 v[2:3], a[2:4], off offset:-1 ; encoding: [0xff,0x9f,0x78,0xdc,0x02,0x02,0xff,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+global_store_dwordx3 v[2:3], a[2:4], off offset:-1
+
+// GFX90A: global_store_dwordx3 v[2:3], a[252:254], off offset:-1 ; encoding: [0xff,0x9f,0x78,0xdc,0x02,0xfc,0xff,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+global_store_dwordx3 v[2:3], a[252:254], off offset:-1
+
+// GFX90A: global_store_dwordx3 v[2:3], a[2:4], off ; encoding: [0x00,0x80,0x78,0xdc,0x02,0x02,0xff,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+global_store_dwordx3 v[2:3], a[2:4], off
+
+// GFX90A: global_store_dwordx4 v[2:3], a[2:5], off offset:-1 ; encoding: [0xff,0x9f,0x7c,0xdc,0x02,0x02,0xff,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+global_store_dwordx4 v[2:3], a[2:5], off offset:-1
+
+// GFX90A: global_store_dwordx4 v[2:3], a[252:255], off offset:-1 ; encoding: [0xff,0x9f,0x7c,0xdc,0x02,0xfc,0xff,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+global_store_dwordx4 v[2:3], a[252:255], off offset:-1
+
+// GFX90A: global_store_dwordx4 v[2:3], a[2:5], off ; encoding: [0x00,0x80,0x7c,0xdc,0x02,0x02,0xff,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+global_store_dwordx4 v[2:3], a[2:5], off
+
+// GFX90A: global_load_ubyte_d16 a5, v[2:3], off offset:-1 ; encoding: [0xff,0x9f,0x80,0xdc,0x02,0x00,0xff,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+global_load_ubyte_d16 a5, v[2:3], off offset:-1
+
+// GFX90A: global_load_ubyte_d16 a255, v[2:3], off offset:-1 ; encoding: [0xff,0x9f,0x80,0xdc,0x02,0x00,0xff,0xff]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+global_load_ubyte_d16 a255, v[2:3], off offset:-1
+
+// GFX90A: global_load_ubyte_d16 a5, v[2:3], off ; encoding: [0x00,0x80,0x80,0xdc,0x02,0x00,0xff,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+global_load_ubyte_d16 a5, v[2:3], off
+
+// GFX90A: global_load_ubyte_d16_hi a5, v[2:3], off offset:-1 ; encoding: [0xff,0x9f,0x84,0xdc,0x02,0x00,0xff,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+global_load_ubyte_d16_hi a5, v[2:3], off offset:-1
+
+// GFX90A: global_load_ubyte_d16_hi a255, v[2:3], off offset:-1 ; encoding: [0xff,0x9f,0x84,0xdc,0x02,0x00,0xff,0xff]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+global_load_ubyte_d16_hi a255, v[2:3], off offset:-1
+
+// GFX90A: global_load_ubyte_d16_hi a5, v[2:3], off ; encoding: [0x00,0x80,0x84,0xdc,0x02,0x00,0xff,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+global_load_ubyte_d16_hi a5, v[2:3], off
+
+// GFX90A: global_load_sbyte_d16 a5, v[2:3], off offset:-1 ; encoding: [0xff,0x9f,0x88,0xdc,0x02,0x00,0xff,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+global_load_sbyte_d16 a5, v[2:3], off offset:-1
+
+// GFX90A: global_load_sbyte_d16 a255, v[2:3], off offset:-1 ; encoding: [0xff,0x9f,0x88,0xdc,0x02,0x00,0xff,0xff]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+global_load_sbyte_d16 a255, v[2:3], off offset:-1
+
+// GFX90A: global_load_sbyte_d16 a5, v[2:3], off ; encoding: [0x00,0x80,0x88,0xdc,0x02,0x00,0xff,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+global_load_sbyte_d16 a5, v[2:3], off
+
+// GFX90A: global_load_sbyte_d16_hi a5, v[2:3], off offset:-1 ; encoding: [0xff,0x9f,0x8c,0xdc,0x02,0x00,0xff,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+global_load_sbyte_d16_hi a5, v[2:3], off offset:-1
+
+// GFX90A: global_load_sbyte_d16_hi a255, v[2:3], off offset:-1 ; encoding: [0xff,0x9f,0x8c,0xdc,0x02,0x00,0xff,0xff]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+global_load_sbyte_d16_hi a255, v[2:3], off offset:-1
+
+// GFX90A: global_load_sbyte_d16_hi a5, v[2:3], off ; encoding: [0x00,0x80,0x8c,0xdc,0x02,0x00,0xff,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+global_load_sbyte_d16_hi a5, v[2:3], off
+
+// GFX90A: global_load_short_d16 a5, v[2:3], off offset:-1 ; encoding: [0xff,0x9f,0x90,0xdc,0x02,0x00,0xff,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+global_load_short_d16 a5, v[2:3], off offset:-1
+
+// GFX90A: global_load_short_d16 a255, v[2:3], off offset:-1 ; encoding: [0xff,0x9f,0x90,0xdc,0x02,0x00,0xff,0xff]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+global_load_short_d16 a255, v[2:3], off offset:-1
+
+// GFX90A: global_load_short_d16 a5, v[2:3], off ; encoding: [0x00,0x80,0x90,0xdc,0x02,0x00,0xff,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+global_load_short_d16 a5, v[2:3], off
+
+// GFX90A: global_load_short_d16_hi a5, v[2:3], off offset:-1 ; encoding: [0xff,0x9f,0x94,0xdc,0x02,0x00,0xff,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+global_load_short_d16_hi a5, v[2:3], off offset:-1
+
+// GFX90A: global_load_short_d16_hi a255, v[2:3], off offset:-1 ; encoding: [0xff,0x9f,0x94,0xdc,0x02,0x00,0xff,0xff]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+global_load_short_d16_hi a255, v[2:3], off offset:-1
+
+// GFX90A: global_load_short_d16_hi a5, v[2:3], off ; encoding: [0x00,0x80,0x94,0xdc,0x02,0x00,0xff,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+global_load_short_d16_hi a5, v[2:3], off
+
+// GFX90A: global_atomic_swap a1, v[2:3], a2, off glc ; encoding: [0x00,0x80,0x01,0xdd,0x02,0x02,0xff,0x01]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+global_atomic_swap a1, v[2:3], a2, off glc
+
+// GFX90A: global_atomic_cmpswap a1, v[2:3], a[2:3], off glc ; encoding: [0x00,0x80,0x05,0xdd,0x02,0x02,0xff,0x01]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+global_atomic_cmpswap a1, v[2:3], a[2:3], off glc
+
+// GFX90A: global_atomic_add a1, v[2:3], a2, off glc ; encoding: [0x00,0x80,0x09,0xdd,0x02,0x02,0xff,0x01]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+global_atomic_add a1, v[2:3], a2, off glc
+
+// GFX90A: global_atomic_sub a1, v[2:3], a2, off glc ; encoding: [0x00,0x80,0x0d,0xdd,0x02,0x02,0xff,0x01]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+global_atomic_sub a1, v[2:3], a2, off glc
+
+// GFX90A: global_atomic_smin a1, v[2:3], a2, off glc ; encoding: [0x00,0x80,0x11,0xdd,0x02,0x02,0xff,0x01]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+global_atomic_smin a1, v[2:3], a2, off glc
+
+// GFX90A: global_atomic_umin a1, v[2:3], a2, off glc ; encoding: [0x00,0x80,0x15,0xdd,0x02,0x02,0xff,0x01]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+global_atomic_umin a1, v[2:3], a2, off glc
+
+// GFX90A: global_atomic_smax a1, v[2:3], a2, off glc ; encoding: [0x00,0x80,0x19,0xdd,0x02,0x02,0xff,0x01]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+global_atomic_smax a1, v[2:3], a2, off glc
+
+// GFX90A: global_atomic_umax a1, v[2:3], a2, off glc ; encoding: [0x00,0x80,0x1d,0xdd,0x02,0x02,0xff,0x01]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+global_atomic_umax a1, v[2:3], a2, off glc
+
+// GFX90A: global_atomic_and a1, v[2:3], a2, off glc ; encoding: [0x00,0x80,0x21,0xdd,0x02,0x02,0xff,0x01]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+global_atomic_and a1, v[2:3], a2, off glc
+
+// GFX90A: global_atomic_or a1, v[2:3], a2, off glc ; encoding: [0x00,0x80,0x25,0xdd,0x02,0x02,0xff,0x01]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+global_atomic_or a1, v[2:3], a2, off glc
+
+// GFX90A: global_atomic_xor a1, v[2:3], a2, off glc ; encoding: [0x00,0x80,0x29,0xdd,0x02,0x02,0xff,0x01]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+global_atomic_xor a1, v[2:3], a2, off glc
+
+// GFX90A: global_atomic_inc a1, v[2:3], a2, off glc ; encoding: [0x00,0x80,0x2d,0xdd,0x02,0x02,0xff,0x01]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+global_atomic_inc a1, v[2:3], a2, off glc
+
+// GFX90A: global_atomic_dec a1, v[2:3], a2, off glc ; encoding: [0x00,0x80,0x31,0xdd,0x02,0x02,0xff,0x01]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+global_atomic_dec a1, v[2:3], a2, off glc
+
+// GFX90A: global_atomic_swap_x2 a[2:3], v[2:3], a[2:3], off glc ; encoding: [0x00,0x80,0x81,0xdd,0x02,0x02,0xff,0x02]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+global_atomic_swap_x2 a[2:3], v[2:3], a[2:3], off glc
+
+// GFX90A: global_atomic_cmpswap_x2 a[2:3], v[2:3], a[2:5], off glc ; encoding: [0x00,0x80,0x85,0xdd,0x02,0x02,0xff,0x02]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+global_atomic_cmpswap_x2 a[2:3], v[2:3], a[2:5], off glc
+
+// GFX90A: global_atomic_add_x2 a[2:3], v[2:3], a[2:3], off glc ; encoding: [0x00,0x80,0x89,0xdd,0x02,0x02,0xff,0x02]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+global_atomic_add_x2 a[2:3], v[2:3], a[2:3], off glc
+
+// GFX90A: global_atomic_sub_x2 a[2:3], v[2:3], a[2:3], off glc ; encoding: [0x00,0x80,0x8d,0xdd,0x02,0x02,0xff,0x02]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+global_atomic_sub_x2 a[2:3], v[2:3], a[2:3], off glc
+
+// GFX90A: global_atomic_smin_x2 a[2:3], v[2:3], a[2:3], off glc ; encoding: [0x00,0x80,0x91,0xdd,0x02,0x02,0xff,0x02]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+global_atomic_smin_x2 a[2:3], v[2:3], a[2:3], off glc
+
+// GFX90A: global_atomic_umin_x2 a[2:3], v[2:3], a[2:3], off glc ; encoding: [0x00,0x80,0x95,0xdd,0x02,0x02,0xff,0x02]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+global_atomic_umin_x2 a[2:3], v[2:3], a[2:3], off glc
+
+// GFX90A: global_atomic_smax_x2 a[2:3], v[2:3], a[2:3], off glc ; encoding: [0x00,0x80,0x99,0xdd,0x02,0x02,0xff,0x02]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+global_atomic_smax_x2 a[2:3], v[2:3], a[2:3], off glc
+
+// GFX90A: global_atomic_umax_x2 a[2:3], v[2:3], a[2:3], off glc ; encoding: [0x00,0x80,0x9d,0xdd,0x02,0x02,0xff,0x02]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+global_atomic_umax_x2 a[2:3], v[2:3], a[2:3], off glc
+
+// GFX90A: global_atomic_and_x2 a[2:3], v[2:3], a[2:3], off glc ; encoding: [0x00,0x80,0xa1,0xdd,0x02,0x02,0xff,0x02]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+global_atomic_and_x2 a[2:3], v[2:3], a[2:3], off glc
+
+// GFX90A: global_atomic_or_x2 a[2:3], v[2:3], a[2:3], off glc ; encoding: [0x00,0x80,0xa5,0xdd,0x02,0x02,0xff,0x02]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+global_atomic_or_x2 a[2:3], v[2:3], a[2:3], off glc
+
+// GFX90A: global_atomic_xor_x2 a[2:3], v[2:3], a[2:3], off glc ; encoding: [0x00,0x80,0xa9,0xdd,0x02,0x02,0xff,0x02]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+global_atomic_xor_x2 a[2:3], v[2:3], a[2:3], off glc
+
+// GFX90A: global_atomic_inc_x2 a[2:3], v[2:3], a[2:3], off glc ; encoding: [0x00,0x80,0xad,0xdd,0x02,0x02,0xff,0x02]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+global_atomic_inc_x2 a[2:3], v[2:3], a[2:3], off glc
+
+// GFX90A: global_atomic_dec_x2 a[2:3], v[2:3], a[2:3], off glc ; encoding: [0x00,0x80,0xb1,0xdd,0x02,0x02,0xff,0x02]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+global_atomic_dec_x2 a[2:3], v[2:3], a[2:3], off glc
+
+// GFX90A: global_atomic_swap v[2:3], a2, off ; encoding: [0x00,0x80,0x00,0xdd,0x02,0x02,0xff,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+global_atomic_swap v[2:3], a2, off
+
+// GFX90A: global_atomic_cmpswap v[2:3], a[2:3], off ; encoding: [0x00,0x80,0x04,0xdd,0x02,0x02,0xff,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+global_atomic_cmpswap v[2:3], a[2:3], off
+
+// GFX90A: global_atomic_add v[2:3], a2, off ; encoding: [0x00,0x80,0x08,0xdd,0x02,0x02,0xff,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+global_atomic_add v[2:3], a2, off
+
+// GFX90A: global_atomic_sub v[2:3], a2, off ; encoding: [0x00,0x80,0x0c,0xdd,0x02,0x02,0xff,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+global_atomic_sub v[2:3], a2, off
+
+// GFX90A: global_atomic_smin v[2:3], a2, off ; encoding: [0x00,0x80,0x10,0xdd,0x02,0x02,0xff,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+global_atomic_smin v[2:3], a2, off
+
+// GFX90A: global_atomic_umin v[2:3], a2, off ; encoding: [0x00,0x80,0x14,0xdd,0x02,0x02,0xff,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+global_atomic_umin v[2:3], a2, off
+
+// GFX90A: global_atomic_smax v[2:3], a2, off ; encoding: [0x00,0x80,0x18,0xdd,0x02,0x02,0xff,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+global_atomic_smax v[2:3], a2, off
+
+// GFX90A: global_atomic_umax v[2:3], a2, off ; encoding: [0x00,0x80,0x1c,0xdd,0x02,0x02,0xff,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+global_atomic_umax v[2:3], a2, off
+
+// GFX90A: global_atomic_and v[2:3], a2, off ; encoding: [0x00,0x80,0x20,0xdd,0x02,0x02,0xff,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+global_atomic_and v[2:3], a2, off
+
+// GFX90A: global_atomic_or v[2:3], a2, off ; encoding: [0x00,0x80,0x24,0xdd,0x02,0x02,0xff,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+global_atomic_or v[2:3], a2, off
+
+// GFX90A: global_atomic_xor v[2:3], a2, off ; encoding: [0x00,0x80,0x28,0xdd,0x02,0x02,0xff,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+global_atomic_xor v[2:3], a2, off
+
+// GFX90A: global_atomic_inc v[2:3], a2, off ; encoding: [0x00,0x80,0x2c,0xdd,0x02,0x02,0xff,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+global_atomic_inc v[2:3], a2, off
+
+// GFX90A: global_atomic_dec v[2:3], a2, off ; encoding: [0x00,0x80,0x30,0xdd,0x02,0x02,0xff,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+global_atomic_dec v[2:3], a2, off
+
+// GFX90A: global_atomic_swap_x2 v[2:3], a[2:3], off ; encoding: [0x00,0x80,0x80,0xdd,0x02,0x02,0xff,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+global_atomic_swap_x2 v[2:3], a[2:3], off
+
+// GFX90A: global_atomic_cmpswap_x2 v[2:3], a[2:5], off ; encoding: [0x00,0x80,0x84,0xdd,0x02,0x02,0xff,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+global_atomic_cmpswap_x2 v[2:3], a[2:5], off
+
+// GFX90A: global_atomic_add_x2 v[2:3], a[2:3], off ; encoding: [0x00,0x80,0x88,0xdd,0x02,0x02,0xff,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+global_atomic_add_x2 v[2:3], a[2:3], off
+
+// GFX90A: global_atomic_sub_x2 v[2:3], a[2:3], off ; encoding: [0x00,0x80,0x8c,0xdd,0x02,0x02,0xff,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+global_atomic_sub_x2 v[2:3], a[2:3], off
+
+// GFX90A: global_atomic_smin_x2 v[2:3], a[2:3], off ; encoding: [0x00,0x80,0x90,0xdd,0x02,0x02,0xff,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+global_atomic_smin_x2 v[2:3], a[2:3], off
+
+// GFX90A: global_atomic_umin_x2 v[2:3], a[2:3], off ; encoding: [0x00,0x80,0x94,0xdd,0x02,0x02,0xff,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+global_atomic_umin_x2 v[2:3], a[2:3], off
+
+// GFX90A: global_atomic_smax_x2 v[2:3], a[2:3], off ; encoding: [0x00,0x80,0x98,0xdd,0x02,0x02,0xff,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+global_atomic_smax_x2 v[2:3], a[2:3], off
+
+// GFX90A: global_atomic_umax_x2 v[2:3], a[2:3], off ; encoding: [0x00,0x80,0x9c,0xdd,0x02,0x02,0xff,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+global_atomic_umax_x2 v[2:3], a[2:3], off
+
+// GFX90A: global_atomic_and_x2 v[2:3], a[2:3], off ; encoding: [0x00,0x80,0xa0,0xdd,0x02,0x02,0xff,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+global_atomic_and_x2 v[2:3], a[2:3], off
+
+// GFX90A: global_atomic_or_x2 v[2:3], a[2:3], off ; encoding: [0x00,0x80,0xa4,0xdd,0x02,0x02,0xff,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+global_atomic_or_x2 v[2:3], a[2:3], off
+
+// GFX90A: global_atomic_xor_x2 v[2:3], a[2:3], off ; encoding: [0x00,0x80,0xa8,0xdd,0x02,0x02,0xff,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+global_atomic_xor_x2 v[2:3], a[2:3], off
+
+// GFX90A: global_atomic_inc_x2 v[2:3], a[2:3], off ; encoding: [0x00,0x80,0xac,0xdd,0x02,0x02,0xff,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+global_atomic_inc_x2 v[2:3], a[2:3], off
+
+// GFX90A: global_atomic_dec_x2 v[2:3], a[2:3], off ; encoding: [0x00,0x80,0xb0,0xdd,0x02,0x02,0xff,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+global_atomic_dec_x2 v[2:3], a[2:3], off
+
+// GFX90A: scratch_load_ubyte a5, off, s2 offset:-1 ; encoding: [0xff,0x5f,0x40,0xdc,0x00,0x00,0x82,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_ubyte a5, off, s2 offset:-1
+
+// GFX90A: scratch_load_ubyte a255, off, s2 offset:-1 ; encoding: [0xff,0x5f,0x40,0xdc,0x00,0x00,0x82,0xff]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_ubyte a255, off, s2 offset:-1
+
+// GFX90A: scratch_load_ubyte a5, off, s101 offset:-1 ; encoding: [0xff,0x5f,0x40,0xdc,0x00,0x00,0xe5,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_ubyte a5, off, s101 offset:-1
+
+// GFX90A: scratch_load_ubyte a5, off, flat_scratch_lo offset:-1 ; encoding: [0xff,0x5f,0x40,0xdc,0x00,0x00,0xe6,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_ubyte a5, off, flat_scratch_lo offset:-1
+
+// GFX90A: scratch_load_ubyte a5, off, flat_scratch_hi offset:-1 ; encoding: [0xff,0x5f,0x40,0xdc,0x00,0x00,0xe7,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_ubyte a5, off, flat_scratch_hi offset:-1
+
+// GFX90A: scratch_load_ubyte a5, off, vcc_lo offset:-1 ; encoding: [0xff,0x5f,0x40,0xdc,0x00,0x00,0xea,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_ubyte a5, off, vcc_lo offset:-1
+
+// GFX90A: scratch_load_ubyte a5, off, vcc_hi offset:-1 ; encoding: [0xff,0x5f,0x40,0xdc,0x00,0x00,0xeb,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_ubyte a5, off, vcc_hi offset:-1
+
+// GFX90A: scratch_load_ubyte a5, v0, off offset:-1 ; encoding: [0xff,0x5f,0x40,0xdc,0x00,0x00,0xff,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_ubyte a5, v0, off offset:-1
+
+// GFX90A: scratch_load_ubyte a5, off, s2  ; encoding: [0x00,0x40,0x40,0xdc,0x00,0x00,0x82,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_ubyte a5, off, s2
+
+// GFX90A: scratch_load_ubyte a5, off, s2  ; encoding: [0x00,0x40,0x40,0xdc,0x00,0x00,0x82,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_ubyte a5, off, s2
+
+// GFX90A: scratch_load_ubyte a5, off, s2 offset:4095 ; encoding: [0xff,0x4f,0x40,0xdc,0x00,0x00,0x82,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_ubyte a5, off, s2 offset:4095
+
+// GFX90A: scratch_load_ubyte a5, off, s2 offset:-4096 ; encoding: [0x00,0x50,0x40,0xdc,0x00,0x00,0x82,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_ubyte a5, off, s2 offset:-4096
+
+// GFX90A: scratch_load_ubyte a5, off, s2 offset:-1 glc ; encoding: [0xff,0x5f,0x41,0xdc,0x00,0x00,0x82,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_ubyte a5, off, s2 offset:-1 glc
+
+// GFX90A: scratch_load_ubyte a5, off, s2 offset:-1 slc ; encoding: [0xff,0x5f,0x42,0xdc,0x00,0x00,0x82,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_ubyte a5, off, s2 offset:-1 slc
+
+// GFX90A: scratch_load_sbyte a5, off, s2 offset:-1 ; encoding: [0xff,0x5f,0x44,0xdc,0x00,0x00,0x82,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_sbyte a5, off, s2 offset:-1
+
+// GFX90A: scratch_load_sbyte a255, off, s2 offset:-1 ; encoding: [0xff,0x5f,0x44,0xdc,0x00,0x00,0x82,0xff]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_sbyte a255, off, s2 offset:-1
+
+// GFX90A: scratch_load_sbyte a5, off, s101 offset:-1 ; encoding: [0xff,0x5f,0x44,0xdc,0x00,0x00,0xe5,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_sbyte a5, off, s101 offset:-1
+
+// GFX90A: scratch_load_sbyte a5, off, flat_scratch_lo offset:-1 ; encoding: [0xff,0x5f,0x44,0xdc,0x00,0x00,0xe6,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_sbyte a5, off, flat_scratch_lo offset:-1
+
+// GFX90A: scratch_load_sbyte a5, off, flat_scratch_hi offset:-1 ; encoding: [0xff,0x5f,0x44,0xdc,0x00,0x00,0xe7,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_sbyte a5, off, flat_scratch_hi offset:-1
+
+// GFX90A: scratch_load_sbyte a5, off, vcc_lo offset:-1 ; encoding: [0xff,0x5f,0x44,0xdc,0x00,0x00,0xea,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_sbyte a5, off, vcc_lo offset:-1
+
+// GFX90A: scratch_load_sbyte a5, off, vcc_hi offset:-1 ; encoding: [0xff,0x5f,0x44,0xdc,0x00,0x00,0xeb,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_sbyte a5, off, vcc_hi offset:-1
+
+// GFX90A: scratch_load_sbyte a5, v0, off offset:-1 ; encoding: [0xff,0x5f,0x44,0xdc,0x00,0x00,0xff,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_sbyte a5, v0, off offset:-1
+
+// GFX90A: scratch_load_sbyte a5, off, s2  ; encoding: [0x00,0x40,0x44,0xdc,0x00,0x00,0x82,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_sbyte a5, off, s2
+
+// GFX90A: scratch_load_sbyte a5, off, s2  ; encoding: [0x00,0x40,0x44,0xdc,0x00,0x00,0x82,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_sbyte a5, off, s2
+
+// GFX90A: scratch_load_sbyte a5, off, s2 offset:4095 ; encoding: [0xff,0x4f,0x44,0xdc,0x00,0x00,0x82,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_sbyte a5, off, s2 offset:4095
+
+// GFX90A: scratch_load_sbyte a5, off, s2 offset:-4096 ; encoding: [0x00,0x50,0x44,0xdc,0x00,0x00,0x82,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_sbyte a5, off, s2 offset:-4096
+
+// GFX90A: scratch_load_sbyte a5, off, s2 offset:-1 glc ; encoding: [0xff,0x5f,0x45,0xdc,0x00,0x00,0x82,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_sbyte a5, off, s2 offset:-1 glc
+
+// GFX90A: scratch_load_sbyte a5, off, s2 offset:-1 slc ; encoding: [0xff,0x5f,0x46,0xdc,0x00,0x00,0x82,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_sbyte a5, off, s2 offset:-1 slc
+
+// GFX90A: scratch_load_ushort a5, off, s2 offset:-1 ; encoding: [0xff,0x5f,0x48,0xdc,0x00,0x00,0x82,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_ushort a5, off, s2 offset:-1
+
+// GFX90A: scratch_load_ushort a255, off, s2 offset:-1 ; encoding: [0xff,0x5f,0x48,0xdc,0x00,0x00,0x82,0xff]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_ushort a255, off, s2 offset:-1
+
+// GFX90A: scratch_load_ushort a5, off, s101 offset:-1 ; encoding: [0xff,0x5f,0x48,0xdc,0x00,0x00,0xe5,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_ushort a5, off, s101 offset:-1
+
+// GFX90A: scratch_load_ushort a5, off, flat_scratch_lo offset:-1 ; encoding: [0xff,0x5f,0x48,0xdc,0x00,0x00,0xe6,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_ushort a5, off, flat_scratch_lo offset:-1
+
+// GFX90A: scratch_load_ushort a5, off, flat_scratch_hi offset:-1 ; encoding: [0xff,0x5f,0x48,0xdc,0x00,0x00,0xe7,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_ushort a5, off, flat_scratch_hi offset:-1
+
+// GFX90A: scratch_load_ushort a5, off, vcc_lo offset:-1 ; encoding: [0xff,0x5f,0x48,0xdc,0x00,0x00,0xea,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_ushort a5, off, vcc_lo offset:-1
+
+// GFX90A: scratch_load_ushort a5, off, vcc_hi offset:-1 ; encoding: [0xff,0x5f,0x48,0xdc,0x00,0x00,0xeb,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_ushort a5, off, vcc_hi offset:-1
+
+// GFX90A: scratch_load_ushort a5, v0, off offset:-1 ; encoding: [0xff,0x5f,0x48,0xdc,0x00,0x00,0xff,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_ushort a5, v0, off offset:-1
+
+// GFX90A: scratch_load_ushort a5, off, s2 ; encoding: [0x00,0x40,0x48,0xdc,0x00,0x00,0x82,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_ushort a5, off, s2
+
+// GFX90A: scratch_load_ushort a5, off, s2 ; encoding: [0x00,0x40,0x48,0xdc,0x00,0x00,0x82,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_ushort a5, off, s2
+
+// GFX90A: scratch_load_ushort a5, off, s2 offset:4095 ; encoding: [0xff,0x4f,0x48,0xdc,0x00,0x00,0x82,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_ushort a5, off, s2 offset:4095
+
+// GFX90A: scratch_load_ushort a5, off, s2 offset:-4096 ; encoding: [0x00,0x50,0x48,0xdc,0x00,0x00,0x82,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_ushort a5, off, s2 offset:-4096
+
+// GFX90A: scratch_load_ushort a5, off, s2 offset:-1 glc ; encoding: [0xff,0x5f,0x49,0xdc,0x00,0x00,0x82,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_ushort a5, off, s2 offset:-1 glc
+
+// GFX90A: scratch_load_ushort a5, off, s2 offset:-1 slc ; encoding: [0xff,0x5f,0x4a,0xdc,0x00,0x00,0x82,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_ushort a5, off, s2 offset:-1 slc
+
+// GFX90A: scratch_load_sshort a5, off, s2 offset:-1 ; encoding: [0xff,0x5f,0x4c,0xdc,0x00,0x00,0x82,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_sshort a5, off, s2 offset:-1
+
+// GFX90A: scratch_load_sshort a255, off, s2 offset:-1 ; encoding: [0xff,0x5f,0x4c,0xdc,0x00,0x00,0x82,0xff]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_sshort a255, off, s2 offset:-1
+
+// GFX90A: scratch_load_sshort a5, off, s101 offset:-1 ; encoding: [0xff,0x5f,0x4c,0xdc,0x00,0x00,0xe5,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_sshort a5, off, s101 offset:-1
+
+// GFX90A: scratch_load_sshort a5, off, flat_scratch_lo offset:-1 ; encoding: [0xff,0x5f,0x4c,0xdc,0x00,0x00,0xe6,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_sshort a5, off, flat_scratch_lo offset:-1
+
+// GFX90A: scratch_load_sshort a5, off, flat_scratch_hi offset:-1 ; encoding: [0xff,0x5f,0x4c,0xdc,0x00,0x00,0xe7,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_sshort a5, off, flat_scratch_hi offset:-1
+
+// GFX90A: scratch_load_sshort a5, off, vcc_lo offset:-1 ; encoding: [0xff,0x5f,0x4c,0xdc,0x00,0x00,0xea,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_sshort a5, off, vcc_lo offset:-1
+
+// GFX90A: scratch_load_sshort a5, off, vcc_hi offset:-1 ; encoding: [0xff,0x5f,0x4c,0xdc,0x00,0x00,0xeb,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_sshort a5, off, vcc_hi offset:-1
+
+// GFX90A: scratch_load_sshort a5, v0, off offset:-1 ; encoding: [0xff,0x5f,0x4c,0xdc,0x00,0x00,0xff,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_sshort a5, v0, off offset:-1
+
+// GFX90A: scratch_load_sshort a5, off, s2 ; encoding: [0x00,0x40,0x4c,0xdc,0x00,0x00,0x82,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_sshort a5, off, s2
+
+// GFX90A: scratch_load_sshort a5, off, s2 ; encoding: [0x00,0x40,0x4c,0xdc,0x00,0x00,0x82,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_sshort a5, off, s2
+
+// GFX90A: scratch_load_sshort a5, off, s2 offset:4095 ; encoding: [0xff,0x4f,0x4c,0xdc,0x00,0x00,0x82,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_sshort a5, off, s2 offset:4095
+
+// GFX90A: scratch_load_sshort a5, off, s2 offset:-4096 ; encoding: [0x00,0x50,0x4c,0xdc,0x00,0x00,0x82,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_sshort a5, off, s2 offset:-4096
+
+// GFX90A: scratch_load_sshort a5, off, s2 offset:-1 glc ; encoding: [0xff,0x5f,0x4d,0xdc,0x00,0x00,0x82,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_sshort a5, off, s2 offset:-1 glc
+
+// GFX90A: scratch_load_sshort a5, off, s2 offset:-1 slc ; encoding: [0xff,0x5f,0x4e,0xdc,0x00,0x00,0x82,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_sshort a5, off, s2 offset:-1 slc
+
+// GFX90A: scratch_load_dword a5, off, s2 offset:-1 ; encoding: [0xff,0x5f,0x50,0xdc,0x00,0x00,0x82,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_dword a5, off, s2 offset:-1
+
+// GFX90A: scratch_load_dword a255, off, s2 offset:-1 ; encoding: [0xff,0x5f,0x50,0xdc,0x00,0x00,0x82,0xff]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_dword a255, off, s2 offset:-1
+
+// GFX90A: scratch_load_dword a5, off, s101 offset:-1 ; encoding: [0xff,0x5f,0x50,0xdc,0x00,0x00,0xe5,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_dword a5, off, s101 offset:-1
+
+// GFX90A: scratch_load_dword a5, off, flat_scratch_lo offset:-1 ; encoding: [0xff,0x5f,0x50,0xdc,0x00,0x00,0xe6,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_dword a5, off, flat_scratch_lo offset:-1
+
+// GFX90A: scratch_load_dword a5, off, flat_scratch_hi offset:-1 ; encoding: [0xff,0x5f,0x50,0xdc,0x00,0x00,0xe7,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_dword a5, off, flat_scratch_hi offset:-1
+
+// GFX90A: scratch_load_dword a5, off, vcc_lo offset:-1 ; encoding: [0xff,0x5f,0x50,0xdc,0x00,0x00,0xea,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_dword a5, off, vcc_lo offset:-1
+
+// GFX90A: scratch_load_dword a5, off, vcc_hi offset:-1 ; encoding: [0xff,0x5f,0x50,0xdc,0x00,0x00,0xeb,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_dword a5, off, vcc_hi offset:-1
+
+// GFX90A: scratch_load_dword a5, v0, off offset:-1 ; encoding: [0xff,0x5f,0x50,0xdc,0x00,0x00,0xff,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_dword a5, v0, off offset:-1
+
+// GFX90A: scratch_load_dword a5, off, s2  ; encoding: [0x00,0x40,0x50,0xdc,0x00,0x00,0x82,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_dword a5, off, s2
+
+// GFX90A: scratch_load_dword a5, off, s2  ; encoding: [0x00,0x40,0x50,0xdc,0x00,0x00,0x82,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_dword a5, off, s2
+
+// GFX90A: scratch_load_dword a5, off, s2 offset:4095 ; encoding: [0xff,0x4f,0x50,0xdc,0x00,0x00,0x82,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_dword a5, off, s2 offset:4095
+
+// GFX90A: scratch_load_dword a5, off, s2 offset:-4096 ; encoding: [0x00,0x50,0x50,0xdc,0x00,0x00,0x82,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_dword a5, off, s2 offset:-4096
+
+// GFX90A: scratch_load_dword a5, off, s2 offset:-1 glc ; encoding: [0xff,0x5f,0x51,0xdc,0x00,0x00,0x82,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_dword a5, off, s2 offset:-1 glc
+
+// GFX90A: scratch_load_dword a5, off, s2 offset:-1 slc ; encoding: [0xff,0x5f,0x52,0xdc,0x00,0x00,0x82,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_dword a5, off, s2 offset:-1 slc
+
+// GFX90A: scratch_load_dwordx2 a[6:7], off, s2 offset:-1 ; encoding: [0xff,0x5f,0x54,0xdc,0x00,0x00,0x82,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_dwordx2 a[6:7], off, s2 offset:-1
+
+// GFX90A: scratch_load_dwordx2 a[254:255], off, s2 offset:-1 ; encoding: [0xff,0x5f,0x54,0xdc,0x00,0x00,0x82,0xfe]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_dwordx2 a[254:255], off, s2 offset:-1
+
+// GFX90A: scratch_load_dwordx2 a[6:7], off, s101 offset:-1 ; encoding: [0xff,0x5f,0x54,0xdc,0x00,0x00,0xe5,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_dwordx2 a[6:7], off, s101 offset:-1
+
+// GFX90A: scratch_load_dwordx2 a[6:7], off, flat_scratch_lo offset:-1 ; encoding: [0xff,0x5f,0x54,0xdc,0x00,0x00,0xe6,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_dwordx2 a[6:7], off, flat_scratch_lo offset:-1
+
+// GFX90A: scratch_load_dwordx2 a[6:7], off, flat_scratch_hi offset:-1 ; encoding: [0xff,0x5f,0x54,0xdc,0x00,0x00,0xe7,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_dwordx2 a[6:7], off, flat_scratch_hi offset:-1
+
+// GFX90A: scratch_load_dwordx2 a[6:7], off, vcc_lo offset:-1 ; encoding: [0xff,0x5f,0x54,0xdc,0x00,0x00,0xea,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_dwordx2 a[6:7], off, vcc_lo offset:-1
+
+// GFX90A: scratch_load_dwordx2 a[6:7], off, vcc_hi offset:-1 ; encoding: [0xff,0x5f,0x54,0xdc,0x00,0x00,0xeb,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_dwordx2 a[6:7], off, vcc_hi offset:-1
+
+// GFX90A: scratch_load_dwordx2 a[6:7], v0, off offset:-1 ; encoding: [0xff,0x5f,0x54,0xdc,0x00,0x00,0xff,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_dwordx2 a[6:7], v0, off offset:-1
+
+// GFX90A: scratch_load_dwordx2 a[6:7], off, s2 ; encoding: [0x00,0x40,0x54,0xdc,0x00,0x00,0x82,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_dwordx2 a[6:7], off, s2
+
+// GFX90A: scratch_load_dwordx2 a[6:7], off, s2 ; encoding: [0x00,0x40,0x54,0xdc,0x00,0x00,0x82,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_dwordx2 a[6:7], off, s2
+
+// GFX90A: scratch_load_dwordx2 a[6:7], off, s2 offset:4095 ; encoding: [0xff,0x4f,0x54,0xdc,0x00,0x00,0x82,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_dwordx2 a[6:7], off, s2 offset:4095
+
+// GFX90A: scratch_load_dwordx2 a[6:7], off, s2 offset:-4096 ; encoding: [0x00,0x50,0x54,0xdc,0x00,0x00,0x82,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_dwordx2 a[6:7], off, s2 offset:-4096
+
+// GFX90A: scratch_load_dwordx2 a[6:7], off, s2 offset:-1 glc ; encoding: [0xff,0x5f,0x55,0xdc,0x00,0x00,0x82,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_dwordx2 a[6:7], off, s2 offset:-1 glc
+
+// GFX90A: scratch_load_dwordx2 a[6:7], off, s2 offset:-1 slc ; encoding: [0xff,0x5f,0x56,0xdc,0x00,0x00,0x82,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_dwordx2 a[6:7], off, s2 offset:-1 slc
+
+// GFX90A: scratch_load_dwordx3 a[6:8], off, s2 offset:-1 ; encoding: [0xff,0x5f,0x58,0xdc,0x00,0x00,0x82,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_dwordx3 a[6:8], off, s2 offset:-1
+
+// GFX90A: scratch_load_dwordx3 a[252:254], off, s2 offset:-1 ; encoding: [0xff,0x5f,0x58,0xdc,0x00,0x00,0x82,0xfc]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_dwordx3 a[252:254], off, s2 offset:-1
+
+// GFX90A: scratch_load_dwordx3 a[6:8], off, s101 offset:-1 ; encoding: [0xff,0x5f,0x58,0xdc,0x00,0x00,0xe5,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_dwordx3 a[6:8], off, s101 offset:-1
+
+// GFX90A: scratch_load_dwordx3 a[6:8], off, flat_scratch_lo offset:-1 ; encoding: [0xff,0x5f,0x58,0xdc,0x00,0x00,0xe6,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_dwordx3 a[6:8], off, flat_scratch_lo offset:-1
+
+// GFX90A: scratch_load_dwordx3 a[6:8], off, flat_scratch_hi offset:-1 ; encoding: [0xff,0x5f,0x58,0xdc,0x00,0x00,0xe7,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_dwordx3 a[6:8], off, flat_scratch_hi offset:-1
+
+// GFX90A: scratch_load_dwordx3 a[6:8], off, vcc_lo offset:-1 ; encoding: [0xff,0x5f,0x58,0xdc,0x00,0x00,0xea,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_dwordx3 a[6:8], off, vcc_lo offset:-1
+
+// GFX90A: scratch_load_dwordx3 a[6:8], off, vcc_hi offset:-1 ; encoding: [0xff,0x5f,0x58,0xdc,0x00,0x00,0xeb,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_dwordx3 a[6:8], off, vcc_hi offset:-1
+
+// GFX90A: scratch_load_dwordx3 a[6:8], v0, off offset:-1 ; encoding: [0xff,0x5f,0x58,0xdc,0x00,0x00,0xff,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_dwordx3 a[6:8], v0, off offset:-1
+
+// GFX90A: scratch_load_dwordx3 a[6:8], off, s2 ; encoding: [0x00,0x40,0x58,0xdc,0x00,0x00,0x82,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_dwordx3 a[6:8], off, s2
+
+// GFX90A: scratch_load_dwordx3 a[6:8], off, s2 ; encoding: [0x00,0x40,0x58,0xdc,0x00,0x00,0x82,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_dwordx3 a[6:8], off, s2
+
+// GFX90A: scratch_load_dwordx3 a[6:8], off, s2 offset:4095 ; encoding: [0xff,0x4f,0x58,0xdc,0x00,0x00,0x82,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_dwordx3 a[6:8], off, s2 offset:4095
+
+// GFX90A: scratch_load_dwordx3 a[6:8], off, s2 offset:-4096 ; encoding: [0x00,0x50,0x58,0xdc,0x00,0x00,0x82,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_dwordx3 a[6:8], off, s2 offset:-4096
+
+// GFX90A: scratch_load_dwordx3 a[6:8], off, s2 offset:-1 glc ; encoding: [0xff,0x5f,0x59,0xdc,0x00,0x00,0x82,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_dwordx3 a[6:8], off, s2 offset:-1 glc
+
+// GFX90A: scratch_load_dwordx3 a[6:8], off, s2 offset:-1 slc ; encoding: [0xff,0x5f,0x5a,0xdc,0x00,0x00,0x82,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_dwordx3 a[6:8], off, s2 offset:-1 slc
+
+// GFX90A: scratch_load_dwordx4 a[6:9], off, s2 offset:-1 ; encoding: [0xff,0x5f,0x5c,0xdc,0x00,0x00,0x82,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_dwordx4 a[6:9], off, s2 offset:-1
+
+// GFX90A: scratch_load_dwordx4 a[252:255], off, s2 offset:-1 ; encoding: [0xff,0x5f,0x5c,0xdc,0x00,0x00,0x82,0xfc]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_dwordx4 a[252:255], off, s2 offset:-1
+
+// GFX90A: scratch_load_dwordx4 a[6:9], off, s101 offset:-1 ; encoding: [0xff,0x5f,0x5c,0xdc,0x00,0x00,0xe5,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_dwordx4 a[6:9], off, s101 offset:-1
+
+// GFX90A: scratch_load_dwordx4 a[6:9], off, flat_scratch_lo offset:-1 ; encoding: [0xff,0x5f,0x5c,0xdc,0x00,0x00,0xe6,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_dwordx4 a[6:9], off, flat_scratch_lo offset:-1
+
+// GFX90A: scratch_load_dwordx4 a[6:9], off, flat_scratch_hi offset:-1 ; encoding: [0xff,0x5f,0x5c,0xdc,0x00,0x00,0xe7,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_dwordx4 a[6:9], off, flat_scratch_hi offset:-1
+
+// GFX90A: scratch_load_dwordx4 a[6:9], off, vcc_lo offset:-1 ; encoding: [0xff,0x5f,0x5c,0xdc,0x00,0x00,0xea,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_dwordx4 a[6:9], off, vcc_lo offset:-1
+
+// GFX90A: scratch_load_dwordx4 a[6:9], off, vcc_hi offset:-1 ; encoding: [0xff,0x5f,0x5c,0xdc,0x00,0x00,0xeb,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_dwordx4 a[6:9], off, vcc_hi offset:-1
+
+// GFX90A: scratch_load_dwordx4 a[6:9], v0, off offset:-1 ; encoding: [0xff,0x5f,0x5c,0xdc,0x00,0x00,0xff,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_dwordx4 a[6:9], v0, off offset:-1
+
+// GFX90A: scratch_load_dwordx4 a[6:9], off, s2 ; encoding: [0x00,0x40,0x5c,0xdc,0x00,0x00,0x82,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_dwordx4 a[6:9], off, s2
+
+// GFX90A: scratch_load_dwordx4 a[6:9], off, s2 ; encoding: [0x00,0x40,0x5c,0xdc,0x00,0x00,0x82,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_dwordx4 a[6:9], off, s2
+
+// GFX90A: scratch_load_dwordx4 a[6:9], off, s2 offset:4095 ; encoding: [0xff,0x4f,0x5c,0xdc,0x00,0x00,0x82,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_dwordx4 a[6:9], off, s2 offset:4095
+
+// GFX90A: scratch_load_dwordx4 a[6:9], off, s2 offset:-4096 ; encoding: [0x00,0x50,0x5c,0xdc,0x00,0x00,0x82,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_dwordx4 a[6:9], off, s2 offset:-4096
+
+// GFX90A: scratch_load_dwordx4 a[6:9], off, s2 offset:-1 glc ; encoding: [0xff,0x5f,0x5d,0xdc,0x00,0x00,0x82,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_dwordx4 a[6:9], off, s2 offset:-1 glc
+
+// GFX90A: scratch_load_dwordx4 a[6:9], off, s2 offset:-1 slc ; encoding: [0xff,0x5f,0x5e,0xdc,0x00,0x00,0x82,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_dwordx4 a[6:9], off, s2 offset:-1 slc
+
+// GFX90A: scratch_store_byte off, a2, s3 offset:-1 ; encoding: [0xff,0x5f,0x60,0xdc,0x00,0x02,0x83,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_store_byte off, a2, s3 offset:-1
+
+// GFX90A: scratch_store_byte off, a255, s3 offset:-1 ; encoding: [0xff,0x5f,0x60,0xdc,0x00,0xff,0x83,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_store_byte off, a255, s3 offset:-1
+
+// GFX90A: scratch_store_byte off, a2, s101 offset:-1 ; encoding: [0xff,0x5f,0x60,0xdc,0x00,0x02,0xe5,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_store_byte off, a2, s101 offset:-1
+
+// GFX90A: scratch_store_byte off, a2, flat_scratch_lo offset:-1 ; encoding: [0xff,0x5f,0x60,0xdc,0x00,0x02,0xe6,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_store_byte off, a2, flat_scratch_lo offset:-1
+
+// GFX90A: scratch_store_byte off, a2, flat_scratch_hi offset:-1 ; encoding: [0xff,0x5f,0x60,0xdc,0x00,0x02,0xe7,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_store_byte off, a2, flat_scratch_hi offset:-1
+
+// GFX90A: scratch_store_byte off, a2, vcc_lo offset:-1 ; encoding: [0xff,0x5f,0x60,0xdc,0x00,0x02,0xea,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_store_byte off, a2, vcc_lo offset:-1
+
+// GFX90A: scratch_store_byte off, a2, vcc_hi offset:-1 ; encoding: [0xff,0x5f,0x60,0xdc,0x00,0x02,0xeb,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_store_byte off, a2, vcc_hi offset:-1
+
+// GFX90A: scratch_store_byte v0, a2, off offset:-1 ; encoding: [0xff,0x5f,0x60,0xdc,0x00,0x02,0xff,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_store_byte v0, a2, off offset:-1
+
+// GFX90A: scratch_store_byte off, a2, s3  ; encoding: [0x00,0x40,0x60,0xdc,0x00,0x02,0x83,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_store_byte off, a2, s3
+
+// GFX90A: scratch_store_byte off, a2, s3  ; encoding: [0x00,0x40,0x60,0xdc,0x00,0x02,0x83,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_store_byte off, a2, s3
+
+// GFX90A: scratch_store_byte off, a2, s3 offset:4095 ; encoding: [0xff,0x4f,0x60,0xdc,0x00,0x02,0x83,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_store_byte off, a2, s3 offset:4095
+
+// GFX90A: scratch_store_byte off, a2, s3 offset:-4096 ; encoding: [0x00,0x50,0x60,0xdc,0x00,0x02,0x83,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_store_byte off, a2, s3 offset:-4096
+
+// GFX90A: scratch_store_byte off, a2, s3 offset:-1 glc ; encoding: [0xff,0x5f,0x61,0xdc,0x00,0x02,0x83,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_store_byte off, a2, s3 offset:-1 glc
+
+// GFX90A: scratch_store_byte off, a2, s3 offset:-1 slc ; encoding: [0xff,0x5f,0x62,0xdc,0x00,0x02,0x83,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_store_byte off, a2, s3 offset:-1 slc
+
+// GFX90A: scratch_store_byte_d16_hi off, a2, s3 offset:-1 ; encoding: [0xff,0x5f,0x64,0xdc,0x00,0x02,0x83,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_store_byte_d16_hi off, a2, s3 offset:-1
+
+// GFX90A: scratch_store_byte_d16_hi off, a255, s3 offset:-1 ; encoding: [0xff,0x5f,0x64,0xdc,0x00,0xff,0x83,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_store_byte_d16_hi off, a255, s3 offset:-1
+
+// GFX90A: scratch_store_byte_d16_hi off, a2, s101 offset:-1 ; encoding: [0xff,0x5f,0x64,0xdc,0x00,0x02,0xe5,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_store_byte_d16_hi off, a2, s101 offset:-1
+
+// GFX90A: scratch_store_byte_d16_hi off, a2, flat_scratch_lo offset:-1 ; encoding: [0xff,0x5f,0x64,0xdc,0x00,0x02,0xe6,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_store_byte_d16_hi off, a2, flat_scratch_lo offset:-1
+
+// GFX90A: scratch_store_byte_d16_hi off, a2, flat_scratch_hi offset:-1 ; encoding: [0xff,0x5f,0x64,0xdc,0x00,0x02,0xe7,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_store_byte_d16_hi off, a2, flat_scratch_hi offset:-1
+
+// GFX90A: scratch_store_byte_d16_hi off, a2, vcc_lo offset:-1 ; encoding: [0xff,0x5f,0x64,0xdc,0x00,0x02,0xea,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_store_byte_d16_hi off, a2, vcc_lo offset:-1
+
+// GFX90A: scratch_store_byte_d16_hi off, a2, vcc_hi offset:-1 ; encoding: [0xff,0x5f,0x64,0xdc,0x00,0x02,0xeb,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_store_byte_d16_hi off, a2, vcc_hi offset:-1
+
+// GFX90A: scratch_store_byte_d16_hi v0, a2, off offset:-1 ; encoding: [0xff,0x5f,0x64,0xdc,0x00,0x02,0xff,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_store_byte_d16_hi v0, a2, off offset:-1
+
+// GFX90A: scratch_store_byte_d16_hi off, a2, s3 ; encoding: [0x00,0x40,0x64,0xdc,0x00,0x02,0x83,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_store_byte_d16_hi off, a2, s3
+
+// GFX90A: scratch_store_byte_d16_hi off, a2, s3 ; encoding: [0x00,0x40,0x64,0xdc,0x00,0x02,0x83,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_store_byte_d16_hi off, a2, s3
+
+// GFX90A: scratch_store_byte_d16_hi off, a2, s3 offset:4095 ; encoding: [0xff,0x4f,0x64,0xdc,0x00,0x02,0x83,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_store_byte_d16_hi off, a2, s3 offset:4095
+
+// GFX90A: scratch_store_byte_d16_hi off, a2, s3 offset:-4096 ; encoding: [0x00,0x50,0x64,0xdc,0x00,0x02,0x83,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_store_byte_d16_hi off, a2, s3 offset:-4096
+
+// GFX90A: scratch_store_byte_d16_hi off, a2, s3 offset:-1 glc ; encoding: [0xff,0x5f,0x65,0xdc,0x00,0x02,0x83,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_store_byte_d16_hi off, a2, s3 offset:-1 glc
+
+// GFX90A: scratch_store_byte_d16_hi off, a2, s3 offset:-1 slc ; encoding: [0xff,0x5f,0x66,0xdc,0x00,0x02,0x83,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_store_byte_d16_hi off, a2, s3 offset:-1 slc
+
+// GFX90A: scratch_store_short off, a2, s3 offset:-1 ; encoding: [0xff,0x5f,0x68,0xdc,0x00,0x02,0x83,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_store_short off, a2, s3 offset:-1
+
+// GFX90A: scratch_store_short off, a255, s3 offset:-1 ; encoding: [0xff,0x5f,0x68,0xdc,0x00,0xff,0x83,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_store_short off, a255, s3 offset:-1
+
+// GFX90A: scratch_store_short off, a2, s101 offset:-1 ; encoding: [0xff,0x5f,0x68,0xdc,0x00,0x02,0xe5,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_store_short off, a2, s101 offset:-1
+
+// GFX90A: scratch_store_short off, a2, flat_scratch_lo offset:-1 ; encoding: [0xff,0x5f,0x68,0xdc,0x00,0x02,0xe6,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_store_short off, a2, flat_scratch_lo offset:-1
+
+// GFX90A: scratch_store_short off, a2, flat_scratch_hi offset:-1 ; encoding: [0xff,0x5f,0x68,0xdc,0x00,0x02,0xe7,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_store_short off, a2, flat_scratch_hi offset:-1
+
+// GFX90A: scratch_store_short off, a2, vcc_lo offset:-1 ; encoding: [0xff,0x5f,0x68,0xdc,0x00,0x02,0xea,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_store_short off, a2, vcc_lo offset:-1
+
+// GFX90A: scratch_store_short off, a2, vcc_hi offset:-1 ; encoding: [0xff,0x5f,0x68,0xdc,0x00,0x02,0xeb,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_store_short off, a2, vcc_hi offset:-1
+
+// GFX90A: scratch_store_short v0, a2, off offset:-1 ; encoding: [0xff,0x5f,0x68,0xdc,0x00,0x02,0xff,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_store_short v0, a2, off offset:-1
+
+// GFX90A: scratch_store_short off, a2, s3 ; encoding: [0x00,0x40,0x68,0xdc,0x00,0x02,0x83,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_store_short off, a2, s3
+
+// GFX90A: scratch_store_short off, a2, s3 ; encoding: [0x00,0x40,0x68,0xdc,0x00,0x02,0x83,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_store_short off, a2, s3
+
+// GFX90A: scratch_store_short off, a2, s3 offset:4095 ; encoding: [0xff,0x4f,0x68,0xdc,0x00,0x02,0x83,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_store_short off, a2, s3 offset:4095
+
+// GFX90A: scratch_store_short off, a2, s3 offset:-4096 ; encoding: [0x00,0x50,0x68,0xdc,0x00,0x02,0x83,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_store_short off, a2, s3 offset:-4096
+
+// GFX90A: scratch_store_short off, a2, s3 offset:-1 glc ; encoding: [0xff,0x5f,0x69,0xdc,0x00,0x02,0x83,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_store_short off, a2, s3 offset:-1 glc
+
+// GFX90A: scratch_store_short off, a2, s3 offset:-1 slc ; encoding: [0xff,0x5f,0x6a,0xdc,0x00,0x02,0x83,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_store_short off, a2, s3 offset:-1 slc
+
+// GFX90A: scratch_store_short_d16_hi off, a2, s3 offset:-1 ; encoding: [0xff,0x5f,0x6c,0xdc,0x00,0x02,0x83,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_store_short_d16_hi off, a2, s3 offset:-1
+
+// GFX90A: scratch_store_short_d16_hi off, a255, s3 offset:-1 ; encoding: [0xff,0x5f,0x6c,0xdc,0x00,0xff,0x83,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_store_short_d16_hi off, a255, s3 offset:-1
+
+// GFX90A: scratch_store_short_d16_hi off, a2, s101 offset:-1 ; encoding: [0xff,0x5f,0x6c,0xdc,0x00,0x02,0xe5,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_store_short_d16_hi off, a2, s101 offset:-1
+
+// GFX90A: scratch_store_short_d16_hi off, a2, flat_scratch_lo offset:-1 ; encoding: [0xff,0x5f,0x6c,0xdc,0x00,0x02,0xe6,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_store_short_d16_hi off, a2, flat_scratch_lo offset:-1
+
+// GFX90A: scratch_store_short_d16_hi off, a2, flat_scratch_hi offset:-1 ; encoding: [0xff,0x5f,0x6c,0xdc,0x00,0x02,0xe7,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_store_short_d16_hi off, a2, flat_scratch_hi offset:-1
+
+// GFX90A: scratch_store_short_d16_hi off, a2, vcc_lo offset:-1 ; encoding: [0xff,0x5f,0x6c,0xdc,0x00,0x02,0xea,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_store_short_d16_hi off, a2, vcc_lo offset:-1
+
+// GFX90A: scratch_store_short_d16_hi off, a2, vcc_hi offset:-1 ; encoding: [0xff,0x5f,0x6c,0xdc,0x00,0x02,0xeb,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_store_short_d16_hi off, a2, vcc_hi offset:-1
+
+// GFX90A: scratch_store_short_d16_hi v0, a2, off offset:-1 ; encoding: [0xff,0x5f,0x6c,0xdc,0x00,0x02,0xff,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_store_short_d16_hi v0, a2, off offset:-1
+
+// GFX90A: scratch_store_short_d16_hi off, a2, s3 ; encoding: [0x00,0x40,0x6c,0xdc,0x00,0x02,0x83,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_store_short_d16_hi off, a2, s3
+
+// GFX90A: scratch_store_short_d16_hi off, a2, s3 ; encoding: [0x00,0x40,0x6c,0xdc,0x00,0x02,0x83,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_store_short_d16_hi off, a2, s3
+
+// GFX90A: scratch_store_short_d16_hi off, a2, s3 offset:4095 ; encoding: [0xff,0x4f,0x6c,0xdc,0x00,0x02,0x83,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_store_short_d16_hi off, a2, s3 offset:4095
+
+// GFX90A: scratch_store_short_d16_hi off, a2, s3 offset:-4096 ; encoding: [0x00,0x50,0x6c,0xdc,0x00,0x02,0x83,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_store_short_d16_hi off, a2, s3 offset:-4096
+
+// GFX90A: scratch_store_short_d16_hi off, a2, s3 offset:-1 glc ; encoding: [0xff,0x5f,0x6d,0xdc,0x00,0x02,0x83,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_store_short_d16_hi off, a2, s3 offset:-1 glc
+
+// GFX90A: scratch_store_short_d16_hi off, a2, s3 offset:-1 slc ; encoding: [0xff,0x5f,0x6e,0xdc,0x00,0x02,0x83,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_store_short_d16_hi off, a2, s3 offset:-1 slc
+
+// GFX90A: scratch_store_dword off, a2, s3 offset:-1 ; encoding: [0xff,0x5f,0x70,0xdc,0x00,0x02,0x83,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_store_dword off, a2, s3 offset:-1
+
+// GFX90A: scratch_store_dword off, a255, s3 offset:-1 ; encoding: [0xff,0x5f,0x70,0xdc,0x00,0xff,0x83,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_store_dword off, a255, s3 offset:-1
+
+// GFX90A: scratch_store_dword off, a2, s101 offset:-1 ; encoding: [0xff,0x5f,0x70,0xdc,0x00,0x02,0xe5,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_store_dword off, a2, s101 offset:-1
+
+// GFX90A: scratch_store_dword off, a2, flat_scratch_lo offset:-1 ; encoding: [0xff,0x5f,0x70,0xdc,0x00,0x02,0xe6,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_store_dword off, a2, flat_scratch_lo offset:-1
+
+// GFX90A: scratch_store_dword off, a2, flat_scratch_hi offset:-1 ; encoding: [0xff,0x5f,0x70,0xdc,0x00,0x02,0xe7,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_store_dword off, a2, flat_scratch_hi offset:-1
+
+// GFX90A: scratch_store_dword off, a2, vcc_lo offset:-1 ; encoding: [0xff,0x5f,0x70,0xdc,0x00,0x02,0xea,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_store_dword off, a2, vcc_lo offset:-1
+
+// GFX90A: scratch_store_dword off, a2, vcc_hi offset:-1 ; encoding: [0xff,0x5f,0x70,0xdc,0x00,0x02,0xeb,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_store_dword off, a2, vcc_hi offset:-1
+
+// GFX90A: scratch_store_dword v0, a2, off offset:-1 ; encoding: [0xff,0x5f,0x70,0xdc,0x00,0x02,0xff,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_store_dword v0, a2, off offset:-1
+
+// GFX90A: scratch_store_dword off, a2, s3 ; encoding: [0x00,0x40,0x70,0xdc,0x00,0x02,0x83,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_store_dword off, a2, s3
+
+// GFX90A: scratch_store_dword off, a2, s3 ; encoding: [0x00,0x40,0x70,0xdc,0x00,0x02,0x83,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_store_dword off, a2, s3
+
+// GFX90A: scratch_store_dword off, a2, s3 offset:4095 ; encoding: [0xff,0x4f,0x70,0xdc,0x00,0x02,0x83,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_store_dword off, a2, s3 offset:4095
+
+// GFX90A: scratch_store_dword off, a2, s3 offset:-4096 ; encoding: [0x00,0x50,0x70,0xdc,0x00,0x02,0x83,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_store_dword off, a2, s3 offset:-4096
+
+// GFX90A: scratch_store_dword off, a2, s3 offset:-1 glc ; encoding: [0xff,0x5f,0x71,0xdc,0x00,0x02,0x83,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_store_dword off, a2, s3 offset:-1 glc
+
+// GFX90A: scratch_store_dword off, a2, s3 offset:-1 slc ; encoding: [0xff,0x5f,0x72,0xdc,0x00,0x02,0x83,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_store_dword off, a2, s3 offset:-1 slc
+
+// GFX90A: scratch_store_dwordx2 off, a[2:3], s3 offset:-1 ; encoding: [0xff,0x5f,0x74,0xdc,0x00,0x02,0x83,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_store_dwordx2 off, a[2:3], s3 offset:-1
+
+// GFX90A: scratch_store_dwordx2 off, a[254:255], s3 offset:-1 ; encoding: [0xff,0x5f,0x74,0xdc,0x00,0xfe,0x83,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_store_dwordx2 off, a[254:255], s3 offset:-1
+
+// GFX90A: scratch_store_dwordx2 off, a[2:3], s101 offset:-1 ; encoding: [0xff,0x5f,0x74,0xdc,0x00,0x02,0xe5,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_store_dwordx2 off, a[2:3], s101 offset:-1
+
+// GFX90A: scratch_store_dwordx2 off, a[2:3], flat_scratch_lo offset:-1 ; encoding: [0xff,0x5f,0x74,0xdc,0x00,0x02,0xe6,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_store_dwordx2 off, a[2:3], flat_scratch_lo offset:-1
+
+// GFX90A: scratch_store_dwordx2 off, a[2:3], flat_scratch_hi offset:-1 ; encoding: [0xff,0x5f,0x74,0xdc,0x00,0x02,0xe7,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_store_dwordx2 off, a[2:3], flat_scratch_hi offset:-1
+
+// GFX90A: scratch_store_dwordx2 off, a[2:3], vcc_lo offset:-1 ; encoding: [0xff,0x5f,0x74,0xdc,0x00,0x02,0xea,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_store_dwordx2 off, a[2:3], vcc_lo offset:-1
+
+// GFX90A: scratch_store_dwordx2 off, a[2:3], vcc_hi offset:-1 ; encoding: [0xff,0x5f,0x74,0xdc,0x00,0x02,0xeb,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_store_dwordx2 off, a[2:3], vcc_hi offset:-1
+
+// GFX90A: scratch_store_dwordx2 v0, a[2:3], off offset:-1 ; encoding: [0xff,0x5f,0x74,0xdc,0x00,0x02,0xff,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_store_dwordx2 v0, a[2:3], off offset:-1
+
+// GFX90A: scratch_store_dwordx2 off, a[2:3], s3 ; encoding: [0x00,0x40,0x74,0xdc,0x00,0x02,0x83,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_store_dwordx2 off, a[2:3], s3
+
+// GFX90A: scratch_store_dwordx2 off, a[2:3], s3 ; encoding: [0x00,0x40,0x74,0xdc,0x00,0x02,0x83,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_store_dwordx2 off, a[2:3], s3
+
+// GFX90A: scratch_store_dwordx2 off, a[2:3], s3 offset:4095 ; encoding: [0xff,0x4f,0x74,0xdc,0x00,0x02,0x83,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_store_dwordx2 off, a[2:3], s3 offset:4095
+
+// GFX90A: scratch_store_dwordx2 off, a[2:3], s3 offset:-4096 ; encoding: [0x00,0x50,0x74,0xdc,0x00,0x02,0x83,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_store_dwordx2 off, a[2:3], s3 offset:-4096
+
+// GFX90A: scratch_store_dwordx2 off, a[2:3], s3 offset:-1 glc ; encoding: [0xff,0x5f,0x75,0xdc,0x00,0x02,0x83,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_store_dwordx2 off, a[2:3], s3 offset:-1 glc
+
+// GFX90A: scratch_store_dwordx2 off, a[2:3], s3 offset:-1 slc ; encoding: [0xff,0x5f,0x76,0xdc,0x00,0x02,0x83,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_store_dwordx2 off, a[2:3], s3 offset:-1 slc
+
+// GFX90A: scratch_store_dwordx3 off, a[2:4], s3 offset:-1 ; encoding: [0xff,0x5f,0x78,0xdc,0x00,0x02,0x83,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_store_dwordx3 off, a[2:4], s3 offset:-1
+
+// GFX90A: scratch_store_dwordx3 off, a[252:254], s3 offset:-1 ; encoding: [0xff,0x5f,0x78,0xdc,0x00,0xfc,0x83,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_store_dwordx3 off, a[252:254], s3 offset:-1
+
+// GFX90A: scratch_store_dwordx3 off, a[2:4], s101 offset:-1 ; encoding: [0xff,0x5f,0x78,0xdc,0x00,0x02,0xe5,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_store_dwordx3 off, a[2:4], s101 offset:-1
+
+// GFX90A: scratch_store_dwordx3 off, a[2:4], flat_scratch_lo offset:-1 ; encoding: [0xff,0x5f,0x78,0xdc,0x00,0x02,0xe6,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_store_dwordx3 off, a[2:4], flat_scratch_lo offset:-1
+
+// GFX90A: scratch_store_dwordx3 off, a[2:4], flat_scratch_hi offset:-1 ; encoding: [0xff,0x5f,0x78,0xdc,0x00,0x02,0xe7,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_store_dwordx3 off, a[2:4], flat_scratch_hi offset:-1
+
+// GFX90A: scratch_store_dwordx3 off, a[2:4], vcc_lo offset:-1 ; encoding: [0xff,0x5f,0x78,0xdc,0x00,0x02,0xea,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_store_dwordx3 off, a[2:4], vcc_lo offset:-1
+
+// GFX90A: scratch_store_dwordx3 off, a[2:4], vcc_hi offset:-1 ; encoding: [0xff,0x5f,0x78,0xdc,0x00,0x02,0xeb,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_store_dwordx3 off, a[2:4], vcc_hi offset:-1
+
+// GFX90A: scratch_store_dwordx3 v0, a[2:4], off offset:-1 ; encoding: [0xff,0x5f,0x78,0xdc,0x00,0x02,0xff,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_store_dwordx3 v0, a[2:4], off offset:-1
+
+// GFX90A: scratch_store_dwordx3 off, a[2:4], s3 ; encoding: [0x00,0x40,0x78,0xdc,0x00,0x02,0x83,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_store_dwordx3 off, a[2:4], s3
+
+// GFX90A: scratch_store_dwordx3 off, a[2:4], s3 ; encoding: [0x00,0x40,0x78,0xdc,0x00,0x02,0x83,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_store_dwordx3 off, a[2:4], s3
+
+// GFX90A: scratch_store_dwordx3 off, a[2:4], s3 offset:4095 ; encoding: [0xff,0x4f,0x78,0xdc,0x00,0x02,0x83,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_store_dwordx3 off, a[2:4], s3 offset:4095
+
+// GFX90A: scratch_store_dwordx3 off, a[2:4], s3 offset:-4096 ; encoding: [0x00,0x50,0x78,0xdc,0x00,0x02,0x83,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_store_dwordx3 off, a[2:4], s3 offset:-4096
+
+// GFX90A: scratch_store_dwordx3 off, a[2:4], s3 offset:-1 glc ; encoding: [0xff,0x5f,0x79,0xdc,0x00,0x02,0x83,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_store_dwordx3 off, a[2:4], s3 offset:-1 glc
+
+// GFX90A: scratch_store_dwordx3 off, a[2:4], s3 offset:-1 slc ; encoding: [0xff,0x5f,0x7a,0xdc,0x00,0x02,0x83,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_store_dwordx3 off, a[2:4], s3 offset:-1 slc
+
+// GFX90A: scratch_store_dwordx4 off, a[2:5], s3 offset:-1 ; encoding: [0xff,0x5f,0x7c,0xdc,0x00,0x02,0x83,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_store_dwordx4 off, a[2:5], s3 offset:-1
+
+// GFX90A: scratch_store_dwordx4 off, a[252:255], s3 offset:-1 ; encoding: [0xff,0x5f,0x7c,0xdc,0x00,0xfc,0x83,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_store_dwordx4 off, a[252:255], s3 offset:-1
+
+// GFX90A: scratch_store_dwordx4 off, a[2:5], s101 offset:-1 ; encoding: [0xff,0x5f,0x7c,0xdc,0x00,0x02,0xe5,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_store_dwordx4 off, a[2:5], s101 offset:-1
+
+// GFX90A: scratch_store_dwordx4 off, a[2:5], flat_scratch_lo offset:-1 ; encoding: [0xff,0x5f,0x7c,0xdc,0x00,0x02,0xe6,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_store_dwordx4 off, a[2:5], flat_scratch_lo offset:-1
+
+// GFX90A: scratch_store_dwordx4 off, a[2:5], flat_scratch_hi offset:-1 ; encoding: [0xff,0x5f,0x7c,0xdc,0x00,0x02,0xe7,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_store_dwordx4 off, a[2:5], flat_scratch_hi offset:-1
+
+// GFX90A: scratch_store_dwordx4 off, a[2:5], vcc_lo offset:-1 ; encoding: [0xff,0x5f,0x7c,0xdc,0x00,0x02,0xea,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_store_dwordx4 off, a[2:5], vcc_lo offset:-1
+
+// GFX90A: scratch_store_dwordx4 off, a[2:5], vcc_hi offset:-1 ; encoding: [0xff,0x5f,0x7c,0xdc,0x00,0x02,0xeb,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_store_dwordx4 off, a[2:5], vcc_hi offset:-1
+
+// GFX90A: scratch_store_dwordx4 v0, a[2:5], off offset:-1 ; encoding: [0xff,0x5f,0x7c,0xdc,0x00,0x02,0xff,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_store_dwordx4 v0, a[2:5], off offset:-1
+
+// GFX90A: scratch_store_dwordx4 off, a[2:5], s3 ; encoding: [0x00,0x40,0x7c,0xdc,0x00,0x02,0x83,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_store_dwordx4 off, a[2:5], s3
+
+// GFX90A: scratch_store_dwordx4 off, a[2:5], s3 ; encoding: [0x00,0x40,0x7c,0xdc,0x00,0x02,0x83,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_store_dwordx4 off, a[2:5], s3
+
+// GFX90A: scratch_store_dwordx4 off, a[2:5], s3 offset:4095 ; encoding: [0xff,0x4f,0x7c,0xdc,0x00,0x02,0x83,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_store_dwordx4 off, a[2:5], s3 offset:4095
+
+// GFX90A: scratch_store_dwordx4 off, a[2:5], s3 offset:-4096 ; encoding: [0x00,0x50,0x7c,0xdc,0x00,0x02,0x83,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_store_dwordx4 off, a[2:5], s3 offset:-4096
+
+// GFX90A: scratch_store_dwordx4 off, a[2:5], s3 offset:-1 glc ; encoding: [0xff,0x5f,0x7d,0xdc,0x00,0x02,0x83,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_store_dwordx4 off, a[2:5], s3 offset:-1 glc
+
+// GFX90A: scratch_store_dwordx4 off, a[2:5], s3 offset:-1 slc ; encoding: [0xff,0x5f,0x7e,0xdc,0x00,0x02,0x83,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_store_dwordx4 off, a[2:5], s3 offset:-1 slc
+
+// GFX90A: scratch_load_ubyte_d16 a5, off, s2 offset:-1 ; encoding: [0xff,0x5f,0x80,0xdc,0x00,0x00,0x82,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_ubyte_d16 a5, off, s2 offset:-1
+
+// GFX90A: scratch_load_ubyte_d16 a255, off, s2 offset:-1 ; encoding: [0xff,0x5f,0x80,0xdc,0x00,0x00,0x82,0xff]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_ubyte_d16 a255, off, s2 offset:-1
+
+// GFX90A: scratch_load_ubyte_d16 a5, off, s101 offset:-1 ; encoding: [0xff,0x5f,0x80,0xdc,0x00,0x00,0xe5,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_ubyte_d16 a5, off, s101 offset:-1
+
+// GFX90A: scratch_load_ubyte_d16 a5, off, flat_scratch_lo offset:-1 ; encoding: [0xff,0x5f,0x80,0xdc,0x00,0x00,0xe6,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_ubyte_d16 a5, off, flat_scratch_lo offset:-1
+
+// GFX90A: scratch_load_ubyte_d16 a5, off, flat_scratch_hi offset:-1 ; encoding: [0xff,0x5f,0x80,0xdc,0x00,0x00,0xe7,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_ubyte_d16 a5, off, flat_scratch_hi offset:-1
+
+// GFX90A: scratch_load_ubyte_d16 a5, off, vcc_lo offset:-1 ; encoding: [0xff,0x5f,0x80,0xdc,0x00,0x00,0xea,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_ubyte_d16 a5, off, vcc_lo offset:-1
+
+// GFX90A: scratch_load_ubyte_d16 a5, off, vcc_hi offset:-1 ; encoding: [0xff,0x5f,0x80,0xdc,0x00,0x00,0xeb,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_ubyte_d16 a5, off, vcc_hi offset:-1
+
+// GFX90A: scratch_load_ubyte_d16 a5, v0, off offset:-1 ; encoding: [0xff,0x5f,0x80,0xdc,0x00,0x00,0xff,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_ubyte_d16 a5, v0, off offset:-1
+
+// GFX90A: scratch_load_ubyte_d16 a5, off, s2 ; encoding: [0x00,0x40,0x80,0xdc,0x00,0x00,0x82,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_ubyte_d16 a5, off, s2
+
+// GFX90A: scratch_load_ubyte_d16 a5, off, s2 ; encoding: [0x00,0x40,0x80,0xdc,0x00,0x00,0x82,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_ubyte_d16 a5, off, s2
+
+// GFX90A: scratch_load_ubyte_d16 a5, off, s2 offset:4095 ; encoding: [0xff,0x4f,0x80,0xdc,0x00,0x00,0x82,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_ubyte_d16 a5, off, s2 offset:4095
+
+// GFX90A: scratch_load_ubyte_d16 a5, off, s2 offset:-4096 ; encoding: [0x00,0x50,0x80,0xdc,0x00,0x00,0x82,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_ubyte_d16 a5, off, s2 offset:-4096
+
+// GFX90A: scratch_load_ubyte_d16 a5, off, s2 offset:-1 glc ; encoding: [0xff,0x5f,0x81,0xdc,0x00,0x00,0x82,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_ubyte_d16 a5, off, s2 offset:-1 glc
+
+// GFX90A: scratch_load_ubyte_d16 a5, off, s2 offset:-1 slc ; encoding: [0xff,0x5f,0x82,0xdc,0x00,0x00,0x82,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_ubyte_d16 a5, off, s2 offset:-1 slc
+
+// GFX90A: scratch_load_ubyte_d16_hi a5, off, s2 offset:-1 ; encoding: [0xff,0x5f,0x84,0xdc,0x00,0x00,0x82,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_ubyte_d16_hi a5, off, s2 offset:-1
+
+// GFX90A: scratch_load_ubyte_d16_hi a255, off, s2 offset:-1 ; encoding: [0xff,0x5f,0x84,0xdc,0x00,0x00,0x82,0xff]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_ubyte_d16_hi a255, off, s2 offset:-1
+
+// GFX90A: scratch_load_ubyte_d16_hi a5, off, s101 offset:-1 ; encoding: [0xff,0x5f,0x84,0xdc,0x00,0x00,0xe5,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_ubyte_d16_hi a5, off, s101 offset:-1
+
+// GFX90A: scratch_load_ubyte_d16_hi a5, off, flat_scratch_lo offset:-1 ; encoding: [0xff,0x5f,0x84,0xdc,0x00,0x00,0xe6,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_ubyte_d16_hi a5, off, flat_scratch_lo offset:-1
+
+// GFX90A: scratch_load_ubyte_d16_hi a5, off, flat_scratch_hi offset:-1 ; encoding: [0xff,0x5f,0x84,0xdc,0x00,0x00,0xe7,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_ubyte_d16_hi a5, off, flat_scratch_hi offset:-1
+
+// GFX90A: scratch_load_ubyte_d16_hi a5, off, vcc_lo offset:-1 ; encoding: [0xff,0x5f,0x84,0xdc,0x00,0x00,0xea,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_ubyte_d16_hi a5, off, vcc_lo offset:-1
+
+// GFX90A: scratch_load_ubyte_d16_hi a5, off, vcc_hi offset:-1 ; encoding: [0xff,0x5f,0x84,0xdc,0x00,0x00,0xeb,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_ubyte_d16_hi a5, off, vcc_hi offset:-1
+
+// GFX90A: scratch_load_ubyte_d16_hi a5, v0, off offset:-1 ; encoding: [0xff,0x5f,0x84,0xdc,0x00,0x00,0xff,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_ubyte_d16_hi a5, v0, off offset:-1
+
+// GFX90A: scratch_load_ubyte_d16_hi a5, off, s2 ; encoding: [0x00,0x40,0x84,0xdc,0x00,0x00,0x82,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_ubyte_d16_hi a5, off, s2
+
+// GFX90A: scratch_load_ubyte_d16_hi a5, off, s2 ; encoding: [0x00,0x40,0x84,0xdc,0x00,0x00,0x82,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_ubyte_d16_hi a5, off, s2
+
+// GFX90A: scratch_load_ubyte_d16_hi a5, off, s2 offset:4095 ; encoding: [0xff,0x4f,0x84,0xdc,0x00,0x00,0x82,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_ubyte_d16_hi a5, off, s2 offset:4095
+
+// GFX90A: scratch_load_ubyte_d16_hi a5, off, s2 offset:-4096 ; encoding: [0x00,0x50,0x84,0xdc,0x00,0x00,0x82,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_ubyte_d16_hi a5, off, s2 offset:-4096
+
+// GFX90A: scratch_load_ubyte_d16_hi a5, off, s2 offset:-1 glc ; encoding: [0xff,0x5f,0x85,0xdc,0x00,0x00,0x82,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_ubyte_d16_hi a5, off, s2 offset:-1 glc
+
+// GFX90A: scratch_load_ubyte_d16_hi a5, off, s2 offset:-1 slc ; encoding: [0xff,0x5f,0x86,0xdc,0x00,0x00,0x82,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_ubyte_d16_hi a5, off, s2 offset:-1 slc
+
+// GFX90A: scratch_load_sbyte_d16 a5, off, s2 offset:-1 ; encoding: [0xff,0x5f,0x88,0xdc,0x00,0x00,0x82,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_sbyte_d16 a5, off, s2 offset:-1
+
+// GFX90A: scratch_load_sbyte_d16 a255, off, s2 offset:-1 ; encoding: [0xff,0x5f,0x88,0xdc,0x00,0x00,0x82,0xff]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_sbyte_d16 a255, off, s2 offset:-1
+
+// GFX90A: scratch_load_sbyte_d16 a5, off, s101 offset:-1 ; encoding: [0xff,0x5f,0x88,0xdc,0x00,0x00,0xe5,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_sbyte_d16 a5, off, s101 offset:-1
+
+// GFX90A: scratch_load_sbyte_d16 a5, off, flat_scratch_lo offset:-1 ; encoding: [0xff,0x5f,0x88,0xdc,0x00,0x00,0xe6,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_sbyte_d16 a5, off, flat_scratch_lo offset:-1
+
+// GFX90A: scratch_load_sbyte_d16 a5, off, flat_scratch_hi offset:-1 ; encoding: [0xff,0x5f,0x88,0xdc,0x00,0x00,0xe7,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_sbyte_d16 a5, off, flat_scratch_hi offset:-1
+
+// GFX90A: scratch_load_sbyte_d16 a5, off, vcc_lo offset:-1 ; encoding: [0xff,0x5f,0x88,0xdc,0x00,0x00,0xea,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_sbyte_d16 a5, off, vcc_lo offset:-1
+
+// GFX90A: scratch_load_sbyte_d16 a5, off, vcc_hi offset:-1 ; encoding: [0xff,0x5f,0x88,0xdc,0x00,0x00,0xeb,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_sbyte_d16 a5, off, vcc_hi offset:-1
+
+// GFX90A: scratch_load_sbyte_d16 a5, v0, off offset:-1 ; encoding: [0xff,0x5f,0x88,0xdc,0x00,0x00,0xff,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_sbyte_d16 a5, v0, off offset:-1
+
+// GFX90A: scratch_load_sbyte_d16 a5, off, s2 ; encoding: [0x00,0x40,0x88,0xdc,0x00,0x00,0x82,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_sbyte_d16 a5, off, s2
+
+// GFX90A: scratch_load_sbyte_d16 a5, off, s2 ; encoding: [0x00,0x40,0x88,0xdc,0x00,0x00,0x82,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_sbyte_d16 a5, off, s2
+
+// GFX90A: scratch_load_sbyte_d16 a5, off, s2 offset:4095 ; encoding: [0xff,0x4f,0x88,0xdc,0x00,0x00,0x82,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_sbyte_d16 a5, off, s2 offset:4095
+
+// GFX90A: scratch_load_sbyte_d16 a5, off, s2 offset:-4096 ; encoding: [0x00,0x50,0x88,0xdc,0x00,0x00,0x82,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_sbyte_d16 a5, off, s2 offset:-4096
+
+// GFX90A: scratch_load_sbyte_d16 a5, off, s2 offset:-1 glc ; encoding: [0xff,0x5f,0x89,0xdc,0x00,0x00,0x82,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_sbyte_d16 a5, off, s2 offset:-1 glc
+
+// GFX90A: scratch_load_sbyte_d16 a5, off, s2 offset:-1 slc ; encoding: [0xff,0x5f,0x8a,0xdc,0x00,0x00,0x82,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_sbyte_d16 a5, off, s2 offset:-1 slc
+
+// GFX90A: scratch_load_sbyte_d16_hi a5, off, s2 offset:-1 ; encoding: [0xff,0x5f,0x8c,0xdc,0x00,0x00,0x82,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_sbyte_d16_hi a5, off, s2 offset:-1
+
+// GFX90A: scratch_load_sbyte_d16_hi a255, off, s2 offset:-1 ; encoding: [0xff,0x5f,0x8c,0xdc,0x00,0x00,0x82,0xff]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_sbyte_d16_hi a255, off, s2 offset:-1
+
+// GFX90A: scratch_load_sbyte_d16_hi a5, off, s101 offset:-1 ; encoding: [0xff,0x5f,0x8c,0xdc,0x00,0x00,0xe5,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_sbyte_d16_hi a5, off, s101 offset:-1
+
+// GFX90A: scratch_load_sbyte_d16_hi a5, off, flat_scratch_lo offset:-1 ; encoding: [0xff,0x5f,0x8c,0xdc,0x00,0x00,0xe6,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_sbyte_d16_hi a5, off, flat_scratch_lo offset:-1
+
+// GFX90A: scratch_load_sbyte_d16_hi a5, off, flat_scratch_hi offset:-1 ; encoding: [0xff,0x5f,0x8c,0xdc,0x00,0x00,0xe7,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_sbyte_d16_hi a5, off, flat_scratch_hi offset:-1
+
+// GFX90A: scratch_load_sbyte_d16_hi a5, off, vcc_lo offset:-1 ; encoding: [0xff,0x5f,0x8c,0xdc,0x00,0x00,0xea,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_sbyte_d16_hi a5, off, vcc_lo offset:-1
+
+// GFX90A: scratch_load_sbyte_d16_hi a5, off, vcc_hi offset:-1 ; encoding: [0xff,0x5f,0x8c,0xdc,0x00,0x00,0xeb,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_sbyte_d16_hi a5, off, vcc_hi offset:-1
+
+// GFX90A: scratch_load_sbyte_d16_hi a5, v0, off offset:-1 ; encoding: [0xff,0x5f,0x8c,0xdc,0x00,0x00,0xff,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_sbyte_d16_hi a5, v0, off offset:-1
+
+// GFX90A: scratch_load_sbyte_d16_hi a5, off, s2 ; encoding: [0x00,0x40,0x8c,0xdc,0x00,0x00,0x82,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_sbyte_d16_hi a5, off, s2
+
+// GFX90A: scratch_load_sbyte_d16_hi a5, off, s2 ; encoding: [0x00,0x40,0x8c,0xdc,0x00,0x00,0x82,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_sbyte_d16_hi a5, off, s2
+
+// GFX90A: scratch_load_sbyte_d16_hi a5, off, s2 offset:4095 ; encoding: [0xff,0x4f,0x8c,0xdc,0x00,0x00,0x82,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_sbyte_d16_hi a5, off, s2 offset:4095
+
+// GFX90A: scratch_load_sbyte_d16_hi a5, off, s2 offset:-4096 ; encoding: [0x00,0x50,0x8c,0xdc,0x00,0x00,0x82,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_sbyte_d16_hi a5, off, s2 offset:-4096
+
+// GFX90A: scratch_load_sbyte_d16_hi a5, off, s2 offset:-1 glc ; encoding: [0xff,0x5f,0x8d,0xdc,0x00,0x00,0x82,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_sbyte_d16_hi a5, off, s2 offset:-1 glc
+
+// GFX90A: scratch_load_sbyte_d16_hi a5, off, s2 offset:-1 slc ; encoding: [0xff,0x5f,0x8e,0xdc,0x00,0x00,0x82,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_sbyte_d16_hi a5, off, s2 offset:-1 slc
+
+// GFX90A: scratch_load_short_d16 a5, off, s2 offset:-1 ; encoding: [0xff,0x5f,0x90,0xdc,0x00,0x00,0x82,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_short_d16 a5, off, s2 offset:-1
+
+// GFX90A: scratch_load_short_d16 a255, off, s2 offset:-1 ; encoding: [0xff,0x5f,0x90,0xdc,0x00,0x00,0x82,0xff]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_short_d16 a255, off, s2 offset:-1
+
+// GFX90A: scratch_load_short_d16 a5, off, s101 offset:-1 ; encoding: [0xff,0x5f,0x90,0xdc,0x00,0x00,0xe5,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_short_d16 a5, off, s101 offset:-1
+
+// GFX90A: scratch_load_short_d16 a5, off, flat_scratch_lo offset:-1 ; encoding: [0xff,0x5f,0x90,0xdc,0x00,0x00,0xe6,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_short_d16 a5, off, flat_scratch_lo offset:-1
+
+// GFX90A: scratch_load_short_d16 a5, off, flat_scratch_hi offset:-1 ; encoding: [0xff,0x5f,0x90,0xdc,0x00,0x00,0xe7,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_short_d16 a5, off, flat_scratch_hi offset:-1
+
+// GFX90A: scratch_load_short_d16 a5, off, vcc_lo offset:-1 ; encoding: [0xff,0x5f,0x90,0xdc,0x00,0x00,0xea,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_short_d16 a5, off, vcc_lo offset:-1
+
+// GFX90A: scratch_load_short_d16 a5, off, vcc_hi offset:-1 ; encoding: [0xff,0x5f,0x90,0xdc,0x00,0x00,0xeb,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_short_d16 a5, off, vcc_hi offset:-1
+
+// GFX90A: scratch_load_short_d16 a5, v0, off offset:-1 ; encoding: [0xff,0x5f,0x90,0xdc,0x00,0x00,0xff,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_short_d16 a5, v0, off offset:-1
+
+// GFX90A: scratch_load_short_d16 a5, off, s2 ; encoding: [0x00,0x40,0x90,0xdc,0x00,0x00,0x82,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_short_d16 a5, off, s2
+
+// GFX90A: scratch_load_short_d16 a5, off, s2 ; encoding: [0x00,0x40,0x90,0xdc,0x00,0x00,0x82,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_short_d16 a5, off, s2
+
+// GFX90A: scratch_load_short_d16 a5, off, s2 offset:4095 ; encoding: [0xff,0x4f,0x90,0xdc,0x00,0x00,0x82,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_short_d16 a5, off, s2 offset:4095
+
+// GFX90A: scratch_load_short_d16 a5, off, s2 offset:-4096 ; encoding: [0x00,0x50,0x90,0xdc,0x00,0x00,0x82,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_short_d16 a5, off, s2 offset:-4096
+
+// GFX90A: scratch_load_short_d16 a5, off, s2 offset:-1 glc ; encoding: [0xff,0x5f,0x91,0xdc,0x00,0x00,0x82,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_short_d16 a5, off, s2 offset:-1 glc
+
+// GFX90A: scratch_load_short_d16 a5, off, s2 offset:-1 slc ; encoding: [0xff,0x5f,0x92,0xdc,0x00,0x00,0x82,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_short_d16 a5, off, s2 offset:-1 slc
+
+// GFX90A: scratch_load_short_d16_hi a5, off, s2 offset:-1 ; encoding: [0xff,0x5f,0x94,0xdc,0x00,0x00,0x82,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_short_d16_hi a5, off, s2 offset:-1
+
+// GFX90A: scratch_load_short_d16_hi a255, off, s2 offset:-1 ; encoding: [0xff,0x5f,0x94,0xdc,0x00,0x00,0x82,0xff]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_short_d16_hi a255, off, s2 offset:-1
+
+// GFX90A: scratch_load_short_d16_hi a5, off, s101 offset:-1 ; encoding: [0xff,0x5f,0x94,0xdc,0x00,0x00,0xe5,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_short_d16_hi a5, off, s101 offset:-1
+
+// GFX90A: scratch_load_short_d16_hi a5, off, flat_scratch_lo offset:-1 ; encoding: [0xff,0x5f,0x94,0xdc,0x00,0x00,0xe6,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_short_d16_hi a5, off, flat_scratch_lo offset:-1
+
+// GFX90A: scratch_load_short_d16_hi a5, off, flat_scratch_hi offset:-1 ; encoding: [0xff,0x5f,0x94,0xdc,0x00,0x00,0xe7,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_short_d16_hi a5, off, flat_scratch_hi offset:-1
+
+// GFX90A: scratch_load_short_d16_hi a5, off, vcc_lo offset:-1 ; encoding: [0xff,0x5f,0x94,0xdc,0x00,0x00,0xea,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_short_d16_hi a5, off, vcc_lo offset:-1
+
+// GFX90A: scratch_load_short_d16_hi a5, off, vcc_hi offset:-1 ; encoding: [0xff,0x5f,0x94,0xdc,0x00,0x00,0xeb,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_short_d16_hi a5, off, vcc_hi offset:-1
+
+// GFX90A: scratch_load_short_d16_hi a5, v0, off offset:-1 ; encoding: [0xff,0x5f,0x94,0xdc,0x00,0x00,0xff,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_short_d16_hi a5, v0, off offset:-1
+
+// GFX90A: scratch_load_short_d16_hi a5, off, s2 ; encoding: [0x00,0x40,0x94,0xdc,0x00,0x00,0x82,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_short_d16_hi a5, off, s2
+
+// GFX90A: scratch_load_short_d16_hi a5, off, s2 ; encoding: [0x00,0x40,0x94,0xdc,0x00,0x00,0x82,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_short_d16_hi a5, off, s2
+
+// GFX90A: scratch_load_short_d16_hi a5, off, s2 offset:4095 ; encoding: [0xff,0x4f,0x94,0xdc,0x00,0x00,0x82,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_short_d16_hi a5, off, s2 offset:4095
+
+// GFX90A: scratch_load_short_d16_hi a5, off, s2 offset:-4096 ; encoding: [0x00,0x50,0x94,0xdc,0x00,0x00,0x82,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_short_d16_hi a5, off, s2 offset:-4096
+
+// GFX90A: scratch_load_short_d16_hi a5, off, s2 offset:-1 glc ; encoding: [0xff,0x5f,0x95,0xdc,0x00,0x00,0x82,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_short_d16_hi a5, off, s2 offset:-1 glc
+
+// GFX90A: scratch_load_short_d16_hi a5, off, s2 offset:-1 slc ; encoding: [0xff,0x5f,0x96,0xdc,0x00,0x00,0x82,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+scratch_load_short_d16_hi a5, off, s2 offset:-1 slc
+
+// GFX90A: buffer_load_format_x a5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x00,0xe0,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_format_x a5, off, s[8:11], s3 offset:4095
+
+// GFX90A: buffer_load_format_x a255, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x00,0xe0,0x00,0xff,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_format_x a255, off, s[8:11], s3 offset:4095
+
+// GFX90A: buffer_load_format_x a5, off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x00,0xe0,0x00,0x05,0x83,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_format_x a5, off, s[12:15], s3 offset:4095
+
+// GFX90A: buffer_load_format_x a5, off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x00,0xe0,0x00,0x05,0x98,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_format_x a5, off, s[96:99], s3 offset:4095
+
+// GFX90A: buffer_load_format_x a5, off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x00,0xe0,0x00,0x05,0x82,0x65]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_format_x a5, off, s[8:11], s101 offset:4095
+
+// GFX90A: buffer_load_format_x a5, off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x00,0xe0,0x00,0x05,0x82,0x7c]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_format_x a5, off, s[8:11], m0 offset:4095
+
+// GFX90A: buffer_load_format_x a5, off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x00,0xe0,0x00,0x05,0x82,0x80]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_format_x a5, off, s[8:11], 0 offset:4095
+
+// GFX90A: buffer_load_format_x a5, off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x00,0xe0,0x00,0x05,0x82,0xc1]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_format_x a5, off, s[8:11], -1 offset:4095
+
+// GFX90A: buffer_load_format_x a5, off, s[8:11], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x00,0xe0,0x00,0x05,0x82,0xf0]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_format_x a5, off, s[8:11], 0.5 offset:4095
+
+// GFX90A: buffer_load_format_x a5, off, s[8:11], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x00,0xe0,0x00,0x05,0x82,0xf7]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_format_x a5, off, s[8:11], -4.0 offset:4095
+
+// GFX90A: buffer_load_format_x a5, v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x00,0xe0,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_format_x a5, v0, s[8:11], s3 idxen offset:4095
+
+// GFX90A: buffer_load_format_x a5, v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x00,0xe0,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_format_x a5, v0, s[8:11], s3 offen offset:4095
+
+// GFX90A: buffer_load_format_x a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x00,0xe0,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_format_x a5, off, s[8:11], s3
+
+// GFX90A: buffer_load_format_x a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x00,0xe0,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_format_x a5, off, s[8:11], s3
+
+// GFX90A: buffer_load_format_x a5, off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x00,0xe0,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_format_x a5, off, s[8:11], s3 offset:7
+
+// GFX90A: buffer_load_format_x a5, off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x00,0xe0,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_format_x a5, off, s[8:11], s3 offset:4095 glc
+
+// GFX90A: buffer_load_format_x a5, off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x02,0xe0,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_format_x a5, off, s[8:11], s3 offset:4095 slc
+
+// GFX90A: buffer_load_format_x a5, off, s[8:11], s3 offset:4095 lds ; encoding: [0xff,0x0f,0x01,0xe0,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_format_x a5, off, s[8:11], s3 offset:4095 lds
+
+// GFX90A: buffer_load_format_xy a[6:7], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x04,0xe0,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_format_xy a[6:7], off, s[8:11], s3 offset:4095
+
+// GFX90A: buffer_load_format_xy a[254:255], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x04,0xe0,0x00,0xfe,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_format_xy a[254:255], off, s[8:11], s3 offset:4095
+
+// GFX90A: buffer_load_format_xy a[6:7], off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x04,0xe0,0x00,0x06,0x83,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_format_xy a[6:7], off, s[12:15], s3 offset:4095
+
+// GFX90A: buffer_load_format_xy a[6:7], off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x04,0xe0,0x00,0x06,0x98,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_format_xy a[6:7], off, s[96:99], s3 offset:4095
+
+// GFX90A: buffer_load_format_xy a[6:7], off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x04,0xe0,0x00,0x06,0x82,0x65]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_format_xy a[6:7], off, s[8:11], s101 offset:4095
+
+// GFX90A: buffer_load_format_xy a[6:7], off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x04,0xe0,0x00,0x06,0x82,0x7c]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_format_xy a[6:7], off, s[8:11], m0 offset:4095
+
+// GFX90A: buffer_load_format_xy a[6:7], off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x04,0xe0,0x00,0x06,0x82,0x80]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_format_xy a[6:7], off, s[8:11], 0 offset:4095
+
+// GFX90A: buffer_load_format_xy a[6:7], off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x04,0xe0,0x00,0x06,0x82,0xc1]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_format_xy a[6:7], off, s[8:11], -1 offset:4095
+
+// GFX90A: buffer_load_format_xy a[6:7], off, s[8:11], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x04,0xe0,0x00,0x06,0x82,0xf0]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_format_xy a[6:7], off, s[8:11], 0.5 offset:4095
+
+// GFX90A: buffer_load_format_xy a[6:7], off, s[8:11], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x04,0xe0,0x00,0x06,0x82,0xf7]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_format_xy a[6:7], off, s[8:11], -4.0 offset:4095
+
+// GFX90A: buffer_load_format_xy a[6:7], v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x04,0xe0,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_format_xy a[6:7], v0, s[8:11], s3 idxen offset:4095
+
+// GFX90A: buffer_load_format_xy a[6:7], v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x04,0xe0,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_format_xy a[6:7], v0, s[8:11], s3 offen offset:4095
+
+// GFX90A: buffer_load_format_xy a[6:7], off, s[8:11], s3 ; encoding: [0x00,0x00,0x04,0xe0,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_format_xy a[6:7], off, s[8:11], s3
+
+// GFX90A: buffer_load_format_xy a[6:7], off, s[8:11], s3 ; encoding: [0x00,0x00,0x04,0xe0,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_format_xy a[6:7], off, s[8:11], s3
+
+// GFX90A: buffer_load_format_xy a[6:7], off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x04,0xe0,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_format_xy a[6:7], off, s[8:11], s3 offset:7
+
+// GFX90A: buffer_load_format_xy a[6:7], off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x04,0xe0,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_format_xy a[6:7], off, s[8:11], s3 offset:4095 glc
+
+// GFX90A: buffer_load_format_xy a[6:7], off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x06,0xe0,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_format_xy a[6:7], off, s[8:11], s3 offset:4095 slc
+
+// GFX90A: buffer_load_format_xyz a[6:8], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x08,0xe0,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_format_xyz a[6:8], off, s[8:11], s3 offset:4095
+
+// GFX90A: buffer_load_format_xyz a[252:254], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x08,0xe0,0x00,0xfc,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_format_xyz a[252:254], off, s[8:11], s3 offset:4095
+
+// GFX90A: buffer_load_format_xyz a[6:8], off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x08,0xe0,0x00,0x06,0x83,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_format_xyz a[6:8], off, s[12:15], s3 offset:4095
+
+// GFX90A: buffer_load_format_xyz a[6:8], off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x08,0xe0,0x00,0x06,0x98,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_format_xyz a[6:8], off, s[96:99], s3 offset:4095
+
+// GFX90A: buffer_load_format_xyz a[6:8], off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x08,0xe0,0x00,0x06,0x82,0x65]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_format_xyz a[6:8], off, s[8:11], s101 offset:4095
+
+// GFX90A: buffer_load_format_xyz a[6:8], off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x08,0xe0,0x00,0x06,0x82,0x7c]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_format_xyz a[6:8], off, s[8:11], m0 offset:4095
+
+// GFX90A: buffer_load_format_xyz a[6:8], off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x08,0xe0,0x00,0x06,0x82,0x80]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_format_xyz a[6:8], off, s[8:11], 0 offset:4095
+
+// GFX90A: buffer_load_format_xyz a[6:8], off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x08,0xe0,0x00,0x06,0x82,0xc1]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_format_xyz a[6:8], off, s[8:11], -1 offset:4095
+
+// GFX90A: buffer_load_format_xyz a[6:8], off, s[8:11], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x08,0xe0,0x00,0x06,0x82,0xf0]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_format_xyz a[6:8], off, s[8:11], 0.5 offset:4095
+
+// GFX90A: buffer_load_format_xyz a[6:8], off, s[8:11], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x08,0xe0,0x00,0x06,0x82,0xf7]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_format_xyz a[6:8], off, s[8:11], -4.0 offset:4095
+
+// GFX90A: buffer_load_format_xyz a[6:8], v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x08,0xe0,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_format_xyz a[6:8], v0, s[8:11], s3 idxen offset:4095
+
+// GFX90A: buffer_load_format_xyz a[6:8], v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x08,0xe0,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_format_xyz a[6:8], v0, s[8:11], s3 offen offset:4095
+
+// GFX90A: buffer_load_format_xyz a[6:8], off, s[8:11], s3 ; encoding: [0x00,0x00,0x08,0xe0,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_format_xyz a[6:8], off, s[8:11], s3
+
+// GFX90A: buffer_load_format_xyz a[6:8], off, s[8:11], s3 ; encoding: [0x00,0x00,0x08,0xe0,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_format_xyz a[6:8], off, s[8:11], s3
+
+// GFX90A: buffer_load_format_xyz a[6:8], off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x08,0xe0,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_format_xyz a[6:8], off, s[8:11], s3 offset:7
+
+// GFX90A: buffer_load_format_xyz a[6:8], off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x08,0xe0,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_format_xyz a[6:8], off, s[8:11], s3 offset:4095 glc
+
+// GFX90A: buffer_load_format_xyz a[6:8], off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x0a,0xe0,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_format_xyz a[6:8], off, s[8:11], s3 offset:4095 slc
+
+// GFX90A: buffer_load_format_xyzw a[6:9], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x0c,0xe0,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_format_xyzw a[6:9], off, s[8:11], s3 offset:4095
+
+// GFX90A: buffer_load_format_xyzw a[252:255], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x0c,0xe0,0x00,0xfc,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_format_xyzw a[252:255], off, s[8:11], s3 offset:4095
+
+// GFX90A: buffer_load_format_xyzw a[6:9], off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x0c,0xe0,0x00,0x06,0x83,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_format_xyzw a[6:9], off, s[12:15], s3 offset:4095
+
+// GFX90A: buffer_load_format_xyzw a[6:9], off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x0c,0xe0,0x00,0x06,0x98,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_format_xyzw a[6:9], off, s[96:99], s3 offset:4095
+
+// GFX90A: buffer_load_format_xyzw a[6:9], off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x0c,0xe0,0x00,0x06,0x82,0x65]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_format_xyzw a[6:9], off, s[8:11], s101 offset:4095
+
+// GFX90A: buffer_load_format_xyzw a[6:9], off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x0c,0xe0,0x00,0x06,0x82,0x7c]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_format_xyzw a[6:9], off, s[8:11], m0 offset:4095
+
+// GFX90A: buffer_load_format_xyzw a[6:9], off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x0c,0xe0,0x00,0x06,0x82,0x80]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_format_xyzw a[6:9], off, s[8:11], 0 offset:4095
+
+// GFX90A: buffer_load_format_xyzw a[6:9], off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x0c,0xe0,0x00,0x06,0x82,0xc1]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_format_xyzw a[6:9], off, s[8:11], -1 offset:4095
+
+// GFX90A: buffer_load_format_xyzw a[6:9], off, s[8:11], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x0c,0xe0,0x00,0x06,0x82,0xf0]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_format_xyzw a[6:9], off, s[8:11], 0.5 offset:4095
+
+// GFX90A: buffer_load_format_xyzw a[6:9], off, s[8:11], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x0c,0xe0,0x00,0x06,0x82,0xf7]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_format_xyzw a[6:9], off, s[8:11], -4.0 offset:4095
+
+// GFX90A: buffer_load_format_xyzw a[6:9], v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x0c,0xe0,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_format_xyzw a[6:9], v0, s[8:11], s3 idxen offset:4095
+
+// GFX90A: buffer_load_format_xyzw a[6:9], v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x0c,0xe0,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_format_xyzw a[6:9], v0, s[8:11], s3 offen offset:4095
+
+// GFX90A: buffer_load_format_xyzw a[6:9], off, s[8:11], s3 ; encoding: [0x00,0x00,0x0c,0xe0,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_format_xyzw a[6:9], off, s[8:11], s3
+
+// GFX90A: buffer_load_format_xyzw a[6:9], off, s[8:11], s3 ; encoding: [0x00,0x00,0x0c,0xe0,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_format_xyzw a[6:9], off, s[8:11], s3
+
+// GFX90A: buffer_load_format_xyzw a[6:9], off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x0c,0xe0,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_format_xyzw a[6:9], off, s[8:11], s3 offset:7
+
+// GFX90A: buffer_load_format_xyzw a[6:9], off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x0c,0xe0,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_format_xyzw a[6:9], off, s[8:11], s3 offset:4095 glc
+
+// GFX90A: buffer_load_format_xyzw a[6:9], off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x0e,0xe0,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_format_xyzw a[6:9], off, s[8:11], s3 offset:4095 slc
+
+// GFX90A: buffer_store_format_x a1, off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x10,0xe0,0x00,0x01,0x83,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_format_x a1, off, s[12:15], s4 offset:4095
+
+// GFX90A: buffer_store_format_x a255, off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x10,0xe0,0x00,0xff,0x83,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_format_x a255, off, s[12:15], s4 offset:4095
+
+// GFX90A: buffer_store_format_x a1, off, s[16:19], s4 offset:4095 ; encoding: [0xff,0x0f,0x10,0xe0,0x00,0x01,0x84,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_format_x a1, off, s[16:19], s4 offset:4095
+
+// GFX90A: buffer_store_format_x a1, off, s[96:99], s4 offset:4095 ; encoding: [0xff,0x0f,0x10,0xe0,0x00,0x01,0x98,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_format_x a1, off, s[96:99], s4 offset:4095
+
+// GFX90A: buffer_store_format_x a1, off, s[12:15], s101 offset:4095 ; encoding: [0xff,0x0f,0x10,0xe0,0x00,0x01,0x83,0x65]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_format_x a1, off, s[12:15], s101 offset:4095
+
+// GFX90A: buffer_store_format_x a1, off, s[12:15], m0 offset:4095 ; encoding: [0xff,0x0f,0x10,0xe0,0x00,0x01,0x83,0x7c]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_format_x a1, off, s[12:15], m0 offset:4095
+
+// GFX90A: buffer_store_format_x a1, off, s[12:15], 0 offset:4095 ; encoding: [0xff,0x0f,0x10,0xe0,0x00,0x01,0x83,0x80]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_format_x a1, off, s[12:15], 0 offset:4095
+
+// GFX90A: buffer_store_format_x a1, off, s[12:15], -1 offset:4095 ; encoding: [0xff,0x0f,0x10,0xe0,0x00,0x01,0x83,0xc1]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_format_x a1, off, s[12:15], -1 offset:4095
+
+// GFX90A: buffer_store_format_x a1, off, s[12:15], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x10,0xe0,0x00,0x01,0x83,0xf0]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_format_x a1, off, s[12:15], 0.5 offset:4095
+
+// GFX90A: buffer_store_format_x a1, off, s[12:15], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x10,0xe0,0x00,0x01,0x83,0xf7]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_format_x a1, off, s[12:15], -4.0 offset:4095
+
+// GFX90A: buffer_store_format_x a1, v0, s[12:15], s4 idxen offset:4095 ; encoding: [0xff,0x2f,0x10,0xe0,0x00,0x01,0x83,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_format_x a1, v0, s[12:15], s4 idxen offset:4095
+
+// GFX90A: buffer_store_format_x a1, v0, s[12:15], s4 offen offset:4095 ; encoding: [0xff,0x1f,0x10,0xe0,0x00,0x01,0x83,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_format_x a1, v0, s[12:15], s4 offen offset:4095
+
+// GFX90A: buffer_store_format_x a1, off, s[12:15], s4 ; encoding: [0x00,0x00,0x10,0xe0,0x00,0x01,0x83,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_format_x a1, off, s[12:15], s4
+
+// GFX90A: buffer_store_format_x a1, off, s[12:15], s4 ; encoding: [0x00,0x00,0x10,0xe0,0x00,0x01,0x83,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_format_x a1, off, s[12:15], s4
+
+// GFX90A: buffer_store_format_x a1, off, s[12:15], s4 offset:7 ; encoding: [0x07,0x00,0x10,0xe0,0x00,0x01,0x83,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_format_x a1, off, s[12:15], s4 offset:7
+
+// GFX90A: buffer_store_format_x a1, off, s[12:15], s4 offset:4095 glc ; encoding: [0xff,0x4f,0x10,0xe0,0x00,0x01,0x83,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_format_x a1, off, s[12:15], s4 offset:4095 glc
+
+// GFX90A: buffer_store_format_x a1, off, s[12:15], s4 offset:4095 slc ; encoding: [0xff,0x0f,0x12,0xe0,0x00,0x01,0x83,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_format_x a1, off, s[12:15], s4 offset:4095 slc
+
+// GFX90A: buffer_store_format_xy a[2:3], off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x14,0xe0,0x00,0x02,0x83,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_format_xy a[2:3], off, s[12:15], s4 offset:4095
+
+// GFX90A: buffer_store_format_xy a[254:255], off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x14,0xe0,0x00,0xfe,0x83,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_format_xy a[254:255], off, s[12:15], s4 offset:4095
+
+// GFX90A: buffer_store_format_xy a[2:3], off, s[16:19], s4 offset:4095 ; encoding: [0xff,0x0f,0x14,0xe0,0x00,0x02,0x84,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_format_xy a[2:3], off, s[16:19], s4 offset:4095
+
+// GFX90A: buffer_store_format_xy a[2:3], off, s[96:99], s4 offset:4095 ; encoding: [0xff,0x0f,0x14,0xe0,0x00,0x02,0x98,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_format_xy a[2:3], off, s[96:99], s4 offset:4095
+
+// GFX90A: buffer_store_format_xy a[2:3], off, s[12:15], s101 offset:4095 ; encoding: [0xff,0x0f,0x14,0xe0,0x00,0x02,0x83,0x65]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_format_xy a[2:3], off, s[12:15], s101 offset:4095
+
+// GFX90A: buffer_store_format_xy a[2:3], off, s[12:15], m0 offset:4095 ; encoding: [0xff,0x0f,0x14,0xe0,0x00,0x02,0x83,0x7c]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_format_xy a[2:3], off, s[12:15], m0 offset:4095
+
+// GFX90A: buffer_store_format_xy a[2:3], off, s[12:15], 0 offset:4095 ; encoding: [0xff,0x0f,0x14,0xe0,0x00,0x02,0x83,0x80]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_format_xy a[2:3], off, s[12:15], 0 offset:4095
+
+// GFX90A: buffer_store_format_xy a[2:3], off, s[12:15], -1 offset:4095 ; encoding: [0xff,0x0f,0x14,0xe0,0x00,0x02,0x83,0xc1]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_format_xy a[2:3], off, s[12:15], -1 offset:4095
+
+// GFX90A: buffer_store_format_xy a[2:3], off, s[12:15], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x14,0xe0,0x00,0x02,0x83,0xf0]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_format_xy a[2:3], off, s[12:15], 0.5 offset:4095
+
+// GFX90A: buffer_store_format_xy a[2:3], off, s[12:15], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x14,0xe0,0x00,0x02,0x83,0xf7]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_format_xy a[2:3], off, s[12:15], -4.0 offset:4095
+
+// GFX90A: buffer_store_format_xy a[2:3], v0, s[12:15], s4 idxen offset:4095 ; encoding: [0xff,0x2f,0x14,0xe0,0x00,0x02,0x83,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_format_xy a[2:3], v0, s[12:15], s4 idxen offset:4095
+
+// GFX90A: buffer_store_format_xy a[2:3], v0, s[12:15], s4 offen offset:4095 ; encoding: [0xff,0x1f,0x14,0xe0,0x00,0x02,0x83,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_format_xy a[2:3], v0, s[12:15], s4 offen offset:4095
+
+// GFX90A: buffer_store_format_xy a[2:3], off, s[12:15], s4 ; encoding: [0x00,0x00,0x14,0xe0,0x00,0x02,0x83,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_format_xy a[2:3], off, s[12:15], s4
+
+// GFX90A: buffer_store_format_xy a[2:3], off, s[12:15], s4 ; encoding: [0x00,0x00,0x14,0xe0,0x00,0x02,0x83,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_format_xy a[2:3], off, s[12:15], s4
+
+// GFX90A: buffer_store_format_xy a[2:3], off, s[12:15], s4 offset:7 ; encoding: [0x07,0x00,0x14,0xe0,0x00,0x02,0x83,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_format_xy a[2:3], off, s[12:15], s4 offset:7
+
+// GFX90A: buffer_store_format_xy a[2:3], off, s[12:15], s4 offset:4095 glc ; encoding: [0xff,0x4f,0x14,0xe0,0x00,0x02,0x83,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_format_xy a[2:3], off, s[12:15], s4 offset:4095 glc
+
+// GFX90A: buffer_store_format_xy a[2:3], off, s[12:15], s4 offset:4095 slc ; encoding: [0xff,0x0f,0x16,0xe0,0x00,0x02,0x83,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_format_xy a[2:3], off, s[12:15], s4 offset:4095 slc
+
+// GFX90A: buffer_store_format_xyz a[2:4], off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x18,0xe0,0x00,0x02,0x83,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_format_xyz a[2:4], off, s[12:15], s4 offset:4095
+
+// GFX90A: buffer_store_format_xyz a[252:254], off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x18,0xe0,0x00,0xfc,0x83,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_format_xyz a[252:254], off, s[12:15], s4 offset:4095
+
+// GFX90A: buffer_store_format_xyz a[2:4], off, s[16:19], s4 offset:4095 ; encoding: [0xff,0x0f,0x18,0xe0,0x00,0x02,0x84,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_format_xyz a[2:4], off, s[16:19], s4 offset:4095
+
+// GFX90A: buffer_store_format_xyz a[2:4], off, s[96:99], s4 offset:4095 ; encoding: [0xff,0x0f,0x18,0xe0,0x00,0x02,0x98,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_format_xyz a[2:4], off, s[96:99], s4 offset:4095
+
+// GFX90A: buffer_store_format_xyz a[2:4], off, s[12:15], s101 offset:4095 ; encoding: [0xff,0x0f,0x18,0xe0,0x00,0x02,0x83,0x65]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_format_xyz a[2:4], off, s[12:15], s101 offset:4095
+
+// GFX90A: buffer_store_format_xyz a[2:4], off, s[12:15], m0 offset:4095 ; encoding: [0xff,0x0f,0x18,0xe0,0x00,0x02,0x83,0x7c]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_format_xyz a[2:4], off, s[12:15], m0 offset:4095
+
+// GFX90A: buffer_store_format_xyz a[2:4], off, s[12:15], 0 offset:4095 ; encoding: [0xff,0x0f,0x18,0xe0,0x00,0x02,0x83,0x80]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_format_xyz a[2:4], off, s[12:15], 0 offset:4095
+
+// GFX90A: buffer_store_format_xyz a[2:4], off, s[12:15], -1 offset:4095 ; encoding: [0xff,0x0f,0x18,0xe0,0x00,0x02,0x83,0xc1]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_format_xyz a[2:4], off, s[12:15], -1 offset:4095
+
+// GFX90A: buffer_store_format_xyz a[2:4], off, s[12:15], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x18,0xe0,0x00,0x02,0x83,0xf0]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_format_xyz a[2:4], off, s[12:15], 0.5 offset:4095
+
+// GFX90A: buffer_store_format_xyz a[2:4], off, s[12:15], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x18,0xe0,0x00,0x02,0x83,0xf7]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_format_xyz a[2:4], off, s[12:15], -4.0 offset:4095
+
+// GFX90A: buffer_store_format_xyz a[2:4], v0, s[12:15], s4 idxen offset:4095 ; encoding: [0xff,0x2f,0x18,0xe0,0x00,0x02,0x83,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_format_xyz a[2:4], v0, s[12:15], s4 idxen offset:4095
+
+// GFX90A: buffer_store_format_xyz a[2:4], v0, s[12:15], s4 offen offset:4095 ; encoding: [0xff,0x1f,0x18,0xe0,0x00,0x02,0x83,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_format_xyz a[2:4], v0, s[12:15], s4 offen offset:4095
+
+// GFX90A: buffer_store_format_xyz a[2:4], off, s[12:15], s4 ; encoding: [0x00,0x00,0x18,0xe0,0x00,0x02,0x83,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_format_xyz a[2:4], off, s[12:15], s4
+
+// GFX90A: buffer_store_format_xyz a[2:4], off, s[12:15], s4 ; encoding: [0x00,0x00,0x18,0xe0,0x00,0x02,0x83,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_format_xyz a[2:4], off, s[12:15], s4
+
+// GFX90A: buffer_store_format_xyz a[2:4], off, s[12:15], s4 offset:7 ; encoding: [0x07,0x00,0x18,0xe0,0x00,0x02,0x83,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_format_xyz a[2:4], off, s[12:15], s4 offset:7
+
+// GFX90A: buffer_store_format_xyz a[2:4], off, s[12:15], s4 offset:4095 glc ; encoding: [0xff,0x4f,0x18,0xe0,0x00,0x02,0x83,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_format_xyz a[2:4], off, s[12:15], s4 offset:4095 glc
+
+// GFX90A: buffer_store_format_xyz a[2:4], off, s[12:15], s4 offset:4095 slc ; encoding: [0xff,0x0f,0x1a,0xe0,0x00,0x02,0x83,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_format_xyz a[2:4], off, s[12:15], s4 offset:4095 slc
+
+// GFX90A: buffer_store_format_xyzw a[2:5], off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x1c,0xe0,0x00,0x02,0x83,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_format_xyzw a[2:5], off, s[12:15], s4 offset:4095
+
+// GFX90A: buffer_store_format_xyzw a[252:255], off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x1c,0xe0,0x00,0xfc,0x83,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_format_xyzw a[252:255], off, s[12:15], s4 offset:4095
+
+// GFX90A: buffer_store_format_xyzw a[2:5], off, s[16:19], s4 offset:4095 ; encoding: [0xff,0x0f,0x1c,0xe0,0x00,0x02,0x84,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_format_xyzw a[2:5], off, s[16:19], s4 offset:4095
+
+// GFX90A: buffer_store_format_xyzw a[2:5], off, s[96:99], s4 offset:4095 ; encoding: [0xff,0x0f,0x1c,0xe0,0x00,0x02,0x98,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_format_xyzw a[2:5], off, s[96:99], s4 offset:4095
+
+// GFX90A: buffer_store_format_xyzw a[2:5], off, s[12:15], s101 offset:4095 ; encoding: [0xff,0x0f,0x1c,0xe0,0x00,0x02,0x83,0x65]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_format_xyzw a[2:5], off, s[12:15], s101 offset:4095
+
+// GFX90A: buffer_store_format_xyzw a[2:5], off, s[12:15], m0 offset:4095 ; encoding: [0xff,0x0f,0x1c,0xe0,0x00,0x02,0x83,0x7c]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_format_xyzw a[2:5], off, s[12:15], m0 offset:4095
+
+// GFX90A: buffer_store_format_xyzw a[2:5], off, s[12:15], 0 offset:4095 ; encoding: [0xff,0x0f,0x1c,0xe0,0x00,0x02,0x83,0x80]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_format_xyzw a[2:5], off, s[12:15], 0 offset:4095
+
+// GFX90A: buffer_store_format_xyzw a[2:5], off, s[12:15], -1 offset:4095 ; encoding: [0xff,0x0f,0x1c,0xe0,0x00,0x02,0x83,0xc1]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_format_xyzw a[2:5], off, s[12:15], -1 offset:4095
+
+// GFX90A: buffer_store_format_xyzw a[2:5], off, s[12:15], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x1c,0xe0,0x00,0x02,0x83,0xf0]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_format_xyzw a[2:5], off, s[12:15], 0.5 offset:4095
+
+// GFX90A: buffer_store_format_xyzw a[2:5], off, s[12:15], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x1c,0xe0,0x00,0x02,0x83,0xf7]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_format_xyzw a[2:5], off, s[12:15], -4.0 offset:4095
+
+// GFX90A: buffer_store_format_xyzw a[2:5], v0, s[12:15], s4 idxen offset:4095 ; encoding: [0xff,0x2f,0x1c,0xe0,0x00,0x02,0x83,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_format_xyzw a[2:5], v0, s[12:15], s4 idxen offset:4095
+
+// GFX90A: buffer_store_format_xyzw a[2:5], v0, s[12:15], s4 offen offset:4095 ; encoding: [0xff,0x1f,0x1c,0xe0,0x00,0x02,0x83,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_format_xyzw a[2:5], v0, s[12:15], s4 offen offset:4095
+
+// GFX90A: buffer_store_format_xyzw a[2:5], off, s[12:15], s4 ; encoding: [0x00,0x00,0x1c,0xe0,0x00,0x02,0x83,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_format_xyzw a[2:5], off, s[12:15], s4
+
+// GFX90A: buffer_store_format_xyzw a[2:5], off, s[12:15], s4 ; encoding: [0x00,0x00,0x1c,0xe0,0x00,0x02,0x83,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_format_xyzw a[2:5], off, s[12:15], s4
+
+// GFX90A: buffer_store_format_xyzw a[2:5], off, s[12:15], s4 offset:7 ; encoding: [0x07,0x00,0x1c,0xe0,0x00,0x02,0x83,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_format_xyzw a[2:5], off, s[12:15], s4 offset:7
+
+// GFX90A: buffer_store_format_xyzw a[2:5], off, s[12:15], s4 offset:4095 glc ; encoding: [0xff,0x4f,0x1c,0xe0,0x00,0x02,0x83,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_format_xyzw a[2:5], off, s[12:15], s4 offset:4095 glc
+
+// GFX90A: buffer_store_format_xyzw a[2:5], off, s[12:15], s4 offset:4095 slc ; encoding: [0xff,0x0f,0x1e,0xe0,0x00,0x02,0x83,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_format_xyzw a[2:5], off, s[12:15], s4 offset:4095 slc
+
+// GFX90A: buffer_load_format_d16_x a5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x20,0xe0,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_format_d16_x a5, off, s[8:11], s3 offset:4095
+
+// GFX90A: buffer_load_format_d16_x a255, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x20,0xe0,0x00,0xff,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_format_d16_x a255, off, s[8:11], s3 offset:4095
+
+// GFX90A: buffer_load_format_d16_x a5, off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x20,0xe0,0x00,0x05,0x83,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_format_d16_x a5, off, s[12:15], s3 offset:4095
+
+// GFX90A: buffer_load_format_d16_x a5, off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x20,0xe0,0x00,0x05,0x98,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_format_d16_x a5, off, s[96:99], s3 offset:4095
+
+// GFX90A: buffer_load_format_d16_x a5, off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x20,0xe0,0x00,0x05,0x82,0x65]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_format_d16_x a5, off, s[8:11], s101 offset:4095
+
+// GFX90A: buffer_load_format_d16_x a5, off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x20,0xe0,0x00,0x05,0x82,0x7c]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_format_d16_x a5, off, s[8:11], m0 offset:4095
+
+// GFX90A: buffer_load_format_d16_x a5, off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x20,0xe0,0x00,0x05,0x82,0x80]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_format_d16_x a5, off, s[8:11], 0 offset:4095
+
+// GFX90A: buffer_load_format_d16_x a5, off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x20,0xe0,0x00,0x05,0x82,0xc1]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_format_d16_x a5, off, s[8:11], -1 offset:4095
+
+// GFX90A: buffer_load_format_d16_x a5, off, s[8:11], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x20,0xe0,0x00,0x05,0x82,0xf0]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_format_d16_x a5, off, s[8:11], 0.5 offset:4095
+
+// GFX90A: buffer_load_format_d16_x a5, off, s[8:11], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x20,0xe0,0x00,0x05,0x82,0xf7]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_format_d16_x a5, off, s[8:11], -4.0 offset:4095
+
+// GFX90A: buffer_load_format_d16_x a5, v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x20,0xe0,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_format_d16_x a5, v0, s[8:11], s3 idxen offset:4095
+
+// GFX90A: buffer_load_format_d16_x a5, v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x20,0xe0,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_format_d16_x a5, v0, s[8:11], s3 offen offset:4095
+
+// GFX90A: buffer_load_format_d16_x a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x20,0xe0,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_format_d16_x a5, off, s[8:11], s3
+
+// GFX90A: buffer_load_format_d16_x a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x20,0xe0,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_format_d16_x a5, off, s[8:11], s3
+
+// GFX90A: buffer_load_format_d16_x a5, off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x20,0xe0,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_format_d16_x a5, off, s[8:11], s3 offset:7
+
+// GFX90A: buffer_load_format_d16_x a5, off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x20,0xe0,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_format_d16_x a5, off, s[8:11], s3 offset:4095 glc
+
+// GFX90A: buffer_load_format_d16_x a5, off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x22,0xe0,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_format_d16_x a5, off, s[8:11], s3 offset:4095 slc
+
+// GFX90A: buffer_load_format_d16_xy a5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x24,0xe0,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_format_d16_xy a5, off, s[8:11], s3 offset:4095
+
+// GFX90A: buffer_load_format_d16_xy a255, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x24,0xe0,0x00,0xff,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_format_d16_xy a255, off, s[8:11], s3 offset:4095
+
+// GFX90A: buffer_load_format_d16_xy a5, off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x24,0xe0,0x00,0x05,0x83,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_format_d16_xy a5, off, s[12:15], s3 offset:4095
+
+// GFX90A: buffer_load_format_d16_xy a5, off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x24,0xe0,0x00,0x05,0x98,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_format_d16_xy a5, off, s[96:99], s3 offset:4095
+
+// GFX90A: buffer_load_format_d16_xy a5, off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x24,0xe0,0x00,0x05,0x82,0x65]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_format_d16_xy a5, off, s[8:11], s101 offset:4095
+
+// GFX90A: buffer_load_format_d16_xy a5, off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x24,0xe0,0x00,0x05,0x82,0x7c]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_format_d16_xy a5, off, s[8:11], m0 offset:4095
+
+// GFX90A: buffer_load_format_d16_xy a5, off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x24,0xe0,0x00,0x05,0x82,0x80]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_format_d16_xy a5, off, s[8:11], 0 offset:4095
+
+// GFX90A: buffer_load_format_d16_xy a5, off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x24,0xe0,0x00,0x05,0x82,0xc1]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_format_d16_xy a5, off, s[8:11], -1 offset:4095
+
+// GFX90A: buffer_load_format_d16_xy a5, off, s[8:11], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x24,0xe0,0x00,0x05,0x82,0xf0]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_format_d16_xy a5, off, s[8:11], 0.5 offset:4095
+
+// GFX90A: buffer_load_format_d16_xy a5, off, s[8:11], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x24,0xe0,0x00,0x05,0x82,0xf7]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_format_d16_xy a5, off, s[8:11], -4.0 offset:4095
+
+// GFX90A: buffer_load_format_d16_xy a5, v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x24,0xe0,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_format_d16_xy a5, v0, s[8:11], s3 idxen offset:4095
+
+// GFX90A: buffer_load_format_d16_xy a5, v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x24,0xe0,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_format_d16_xy a5, v0, s[8:11], s3 offen offset:4095
+
+// GFX90A: buffer_load_format_d16_xy a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x24,0xe0,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_format_d16_xy a5, off, s[8:11], s3
+
+// GFX90A: buffer_load_format_d16_xy a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x24,0xe0,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_format_d16_xy a5, off, s[8:11], s3
+
+// GFX90A: buffer_load_format_d16_xy a5, off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x24,0xe0,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_format_d16_xy a5, off, s[8:11], s3 offset:7
+
+// GFX90A: buffer_load_format_d16_xy a5, off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x24,0xe0,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_format_d16_xy a5, off, s[8:11], s3 offset:4095 glc
+
+// GFX90A: buffer_load_format_d16_xy a5, off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x26,0xe0,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_format_d16_xy a5, off, s[8:11], s3 offset:4095 slc
+
+// GFX90A: buffer_load_format_d16_xyz a[6:7], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x28,0xe0,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_format_d16_xyz a[6:7], off, s[8:11], s3 offset:4095
+
+// GFX90A: buffer_load_format_d16_xyz a[254:255], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x28,0xe0,0x00,0xfe,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_format_d16_xyz a[254:255], off, s[8:11], s3 offset:4095
+
+// GFX90A: buffer_load_format_d16_xyz a[6:7], off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x28,0xe0,0x00,0x06,0x83,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_format_d16_xyz a[6:7], off, s[12:15], s3 offset:4095
+
+// GFX90A: buffer_load_format_d16_xyz a[6:7], off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x28,0xe0,0x00,0x06,0x98,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_format_d16_xyz a[6:7], off, s[96:99], s3 offset:4095
+
+// GFX90A: buffer_load_format_d16_xyz a[6:7], off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x28,0xe0,0x00,0x06,0x82,0x65]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_format_d16_xyz a[6:7], off, s[8:11], s101 offset:4095
+
+// GFX90A: buffer_load_format_d16_xyz a[6:7], off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x28,0xe0,0x00,0x06,0x82,0x7c]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_format_d16_xyz a[6:7], off, s[8:11], m0 offset:4095
+
+// GFX90A: buffer_load_format_d16_xyz a[6:7], off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x28,0xe0,0x00,0x06,0x82,0x80]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_format_d16_xyz a[6:7], off, s[8:11], 0 offset:4095
+
+// GFX90A: buffer_load_format_d16_xyz a[6:7], off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x28,0xe0,0x00,0x06,0x82,0xc1]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_format_d16_xyz a[6:7], off, s[8:11], -1 offset:4095
+
+// GFX90A: buffer_load_format_d16_xyz a[6:7], off, s[8:11], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x28,0xe0,0x00,0x06,0x82,0xf0]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_format_d16_xyz a[6:7], off, s[8:11], 0.5 offset:4095
+
+// GFX90A: buffer_load_format_d16_xyz a[6:7], off, s[8:11], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x28,0xe0,0x00,0x06,0x82,0xf7]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_format_d16_xyz a[6:7], off, s[8:11], -4.0 offset:4095
+
+// GFX90A: buffer_load_format_d16_xyz a[6:7], v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x28,0xe0,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_format_d16_xyz a[6:7], v0, s[8:11], s3 idxen offset:4095
+
+// GFX90A: buffer_load_format_d16_xyz a[6:7], v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x28,0xe0,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_format_d16_xyz a[6:7], v0, s[8:11], s3 offen offset:4095
+
+// GFX90A: buffer_load_format_d16_xyz a[6:7], off, s[8:11], s3 ; encoding: [0x00,0x00,0x28,0xe0,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_format_d16_xyz a[6:7], off, s[8:11], s3
+
+// GFX90A: buffer_load_format_d16_xyz a[6:7], off, s[8:11], s3 ; encoding: [0x00,0x00,0x28,0xe0,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_format_d16_xyz a[6:7], off, s[8:11], s3
+
+// GFX90A: buffer_load_format_d16_xyz a[6:7], off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x28,0xe0,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_format_d16_xyz a[6:7], off, s[8:11], s3 offset:7
+
+// GFX90A: buffer_load_format_d16_xyz a[6:7], off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x28,0xe0,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_format_d16_xyz a[6:7], off, s[8:11], s3 offset:4095 glc
+
+// GFX90A: buffer_load_format_d16_xyz a[6:7], off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x2a,0xe0,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_format_d16_xyz a[6:7], off, s[8:11], s3 offset:4095 slc
+
+// GFX90A: buffer_load_format_d16_xyzw a[6:7], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x2c,0xe0,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_format_d16_xyzw a[6:7], off, s[8:11], s3 offset:4095
+
+// GFX90A: buffer_load_format_d16_xyzw a[254:255], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x2c,0xe0,0x00,0xfe,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_format_d16_xyzw a[254:255], off, s[8:11], s3 offset:4095
+
+// GFX90A: buffer_load_format_d16_xyzw a[6:7], off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x2c,0xe0,0x00,0x06,0x83,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_format_d16_xyzw a[6:7], off, s[12:15], s3 offset:4095
+
+// GFX90A: buffer_load_format_d16_xyzw a[6:7], off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x2c,0xe0,0x00,0x06,0x98,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_format_d16_xyzw a[6:7], off, s[96:99], s3 offset:4095
+
+// GFX90A: buffer_load_format_d16_xyzw a[6:7], off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x2c,0xe0,0x00,0x06,0x82,0x65]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_format_d16_xyzw a[6:7], off, s[8:11], s101 offset:4095
+
+// GFX90A: buffer_load_format_d16_xyzw a[6:7], off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x2c,0xe0,0x00,0x06,0x82,0x7c]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_format_d16_xyzw a[6:7], off, s[8:11], m0 offset:4095
+
+// GFX90A: buffer_load_format_d16_xyzw a[6:7], off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x2c,0xe0,0x00,0x06,0x82,0x80]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_format_d16_xyzw a[6:7], off, s[8:11], 0 offset:4095
+
+// GFX90A: buffer_load_format_d16_xyzw a[6:7], off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x2c,0xe0,0x00,0x06,0x82,0xc1]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_format_d16_xyzw a[6:7], off, s[8:11], -1 offset:4095
+
+// GFX90A: buffer_load_format_d16_xyzw a[6:7], off, s[8:11], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x2c,0xe0,0x00,0x06,0x82,0xf0]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_format_d16_xyzw a[6:7], off, s[8:11], 0.5 offset:4095
+
+// GFX90A: buffer_load_format_d16_xyzw a[6:7], off, s[8:11], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x2c,0xe0,0x00,0x06,0x82,0xf7]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_format_d16_xyzw a[6:7], off, s[8:11], -4.0 offset:4095
+
+// GFX90A: buffer_load_format_d16_xyzw a[6:7], v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x2c,0xe0,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_format_d16_xyzw a[6:7], v0, s[8:11], s3 idxen offset:4095
+
+// GFX90A: buffer_load_format_d16_xyzw a[6:7], v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x2c,0xe0,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_format_d16_xyzw a[6:7], v0, s[8:11], s3 offen offset:4095
+
+// GFX90A: buffer_load_format_d16_xyzw a[6:7], off, s[8:11], s3 ; encoding: [0x00,0x00,0x2c,0xe0,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_format_d16_xyzw a[6:7], off, s[8:11], s3
+
+// GFX90A: buffer_load_format_d16_xyzw a[6:7], off, s[8:11], s3 ; encoding: [0x00,0x00,0x2c,0xe0,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_format_d16_xyzw a[6:7], off, s[8:11], s3
+
+// GFX90A: buffer_load_format_d16_xyzw a[6:7], off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x2c,0xe0,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_format_d16_xyzw a[6:7], off, s[8:11], s3 offset:7
+
+// GFX90A: buffer_load_format_d16_xyzw a[6:7], off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x2c,0xe0,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_format_d16_xyzw a[6:7], off, s[8:11], s3 offset:4095 glc
+
+// GFX90A: buffer_load_format_d16_xyzw a[6:7], off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x2e,0xe0,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_format_d16_xyzw a[6:7], off, s[8:11], s3 offset:4095 slc
+
+// GFX90A: buffer_store_format_d16_x a1, off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x30,0xe0,0x00,0x01,0x83,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_format_d16_x a1, off, s[12:15], s4 offset:4095
+
+// GFX90A: buffer_store_format_d16_x a255, off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x30,0xe0,0x00,0xff,0x83,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_format_d16_x a255, off, s[12:15], s4 offset:4095
+
+// GFX90A: buffer_store_format_d16_x a1, off, s[16:19], s4 offset:4095 ; encoding: [0xff,0x0f,0x30,0xe0,0x00,0x01,0x84,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_format_d16_x a1, off, s[16:19], s4 offset:4095
+
+// GFX90A: buffer_store_format_d16_x a1, off, s[96:99], s4 offset:4095 ; encoding: [0xff,0x0f,0x30,0xe0,0x00,0x01,0x98,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_format_d16_x a1, off, s[96:99], s4 offset:4095
+
+// GFX90A: buffer_store_format_d16_x a1, off, s[12:15], s101 offset:4095 ; encoding: [0xff,0x0f,0x30,0xe0,0x00,0x01,0x83,0x65]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_format_d16_x a1, off, s[12:15], s101 offset:4095
+
+// GFX90A: buffer_store_format_d16_x a1, off, s[12:15], m0 offset:4095 ; encoding: [0xff,0x0f,0x30,0xe0,0x00,0x01,0x83,0x7c]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_format_d16_x a1, off, s[12:15], m0 offset:4095
+
+// GFX90A: buffer_store_format_d16_x a1, off, s[12:15], 0 offset:4095 ; encoding: [0xff,0x0f,0x30,0xe0,0x00,0x01,0x83,0x80]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_format_d16_x a1, off, s[12:15], 0 offset:4095
+
+// GFX90A: buffer_store_format_d16_x a1, off, s[12:15], -1 offset:4095 ; encoding: [0xff,0x0f,0x30,0xe0,0x00,0x01,0x83,0xc1]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_format_d16_x a1, off, s[12:15], -1 offset:4095
+
+// GFX90A: buffer_store_format_d16_x a1, off, s[12:15], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x30,0xe0,0x00,0x01,0x83,0xf0]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_format_d16_x a1, off, s[12:15], 0.5 offset:4095
+
+// GFX90A: buffer_store_format_d16_x a1, off, s[12:15], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x30,0xe0,0x00,0x01,0x83,0xf7]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_format_d16_x a1, off, s[12:15], -4.0 offset:4095
+
+// GFX90A: buffer_store_format_d16_x a1, v0, s[12:15], s4 idxen offset:4095 ; encoding: [0xff,0x2f,0x30,0xe0,0x00,0x01,0x83,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_format_d16_x a1, v0, s[12:15], s4 idxen offset:4095
+
+// GFX90A: buffer_store_format_d16_x a1, v0, s[12:15], s4 offen offset:4095 ; encoding: [0xff,0x1f,0x30,0xe0,0x00,0x01,0x83,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_format_d16_x a1, v0, s[12:15], s4 offen offset:4095
+
+// GFX90A: buffer_store_format_d16_x a1, off, s[12:15], s4 ; encoding: [0x00,0x00,0x30,0xe0,0x00,0x01,0x83,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_format_d16_x a1, off, s[12:15], s4
+
+// GFX90A: buffer_store_format_d16_x a1, off, s[12:15], s4 ; encoding: [0x00,0x00,0x30,0xe0,0x00,0x01,0x83,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_format_d16_x a1, off, s[12:15], s4
+
+// GFX90A: buffer_store_format_d16_x a1, off, s[12:15], s4 offset:7 ; encoding: [0x07,0x00,0x30,0xe0,0x00,0x01,0x83,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_format_d16_x a1, off, s[12:15], s4 offset:7
+
+// GFX90A: buffer_store_format_d16_x a1, off, s[12:15], s4 offset:4095 glc ; encoding: [0xff,0x4f,0x30,0xe0,0x00,0x01,0x83,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_format_d16_x a1, off, s[12:15], s4 offset:4095 glc
+
+// GFX90A: buffer_store_format_d16_x a1, off, s[12:15], s4 offset:4095 slc ; encoding: [0xff,0x0f,0x32,0xe0,0x00,0x01,0x83,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_format_d16_x a1, off, s[12:15], s4 offset:4095 slc
+
+// GFX90A: buffer_store_format_d16_xy a1, off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x34,0xe0,0x00,0x01,0x83,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_format_d16_xy a1, off, s[12:15], s4 offset:4095
+
+// GFX90A: buffer_store_format_d16_xy a255, off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x34,0xe0,0x00,0xff,0x83,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_format_d16_xy a255, off, s[12:15], s4 offset:4095
+
+// GFX90A: buffer_store_format_d16_xy a1, off, s[16:19], s4 offset:4095 ; encoding: [0xff,0x0f,0x34,0xe0,0x00,0x01,0x84,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_format_d16_xy a1, off, s[16:19], s4 offset:4095
+
+// GFX90A: buffer_store_format_d16_xy a1, off, s[96:99], s4 offset:4095 ; encoding: [0xff,0x0f,0x34,0xe0,0x00,0x01,0x98,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_format_d16_xy a1, off, s[96:99], s4 offset:4095
+
+// GFX90A: buffer_store_format_d16_xy a1, off, s[12:15], s101 offset:4095 ; encoding: [0xff,0x0f,0x34,0xe0,0x00,0x01,0x83,0x65]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_format_d16_xy a1, off, s[12:15], s101 offset:4095
+
+// GFX90A: buffer_store_format_d16_xy a1, off, s[12:15], m0 offset:4095 ; encoding: [0xff,0x0f,0x34,0xe0,0x00,0x01,0x83,0x7c]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_format_d16_xy a1, off, s[12:15], m0 offset:4095
+
+// GFX90A: buffer_store_format_d16_xy a1, off, s[12:15], 0 offset:4095 ; encoding: [0xff,0x0f,0x34,0xe0,0x00,0x01,0x83,0x80]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_format_d16_xy a1, off, s[12:15], 0 offset:4095
+
+// GFX90A: buffer_store_format_d16_xy a1, off, s[12:15], -1 offset:4095 ; encoding: [0xff,0x0f,0x34,0xe0,0x00,0x01,0x83,0xc1]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_format_d16_xy a1, off, s[12:15], -1 offset:4095
+
+// GFX90A: buffer_store_format_d16_xy a1, off, s[12:15], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x34,0xe0,0x00,0x01,0x83,0xf0]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_format_d16_xy a1, off, s[12:15], 0.5 offset:4095
+
+// GFX90A: buffer_store_format_d16_xy a1, off, s[12:15], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x34,0xe0,0x00,0x01,0x83,0xf7]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_format_d16_xy a1, off, s[12:15], -4.0 offset:4095
+
+// GFX90A: buffer_store_format_d16_xy a1, v0, s[12:15], s4 idxen offset:4095 ; encoding: [0xff,0x2f,0x34,0xe0,0x00,0x01,0x83,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_format_d16_xy a1, v0, s[12:15], s4 idxen offset:4095
+
+// GFX90A: buffer_store_format_d16_xy a1, v0, s[12:15], s4 offen offset:4095 ; encoding: [0xff,0x1f,0x34,0xe0,0x00,0x01,0x83,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_format_d16_xy a1, v0, s[12:15], s4 offen offset:4095
+
+// GFX90A: buffer_store_format_d16_xy a1, off, s[12:15], s4 ; encoding: [0x00,0x00,0x34,0xe0,0x00,0x01,0x83,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_format_d16_xy a1, off, s[12:15], s4
+
+// GFX90A: buffer_store_format_d16_xy a1, off, s[12:15], s4 ; encoding: [0x00,0x00,0x34,0xe0,0x00,0x01,0x83,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_format_d16_xy a1, off, s[12:15], s4
+
+// GFX90A: buffer_store_format_d16_xy a1, off, s[12:15], s4 offset:7 ; encoding: [0x07,0x00,0x34,0xe0,0x00,0x01,0x83,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_format_d16_xy a1, off, s[12:15], s4 offset:7
+
+// GFX90A: buffer_store_format_d16_xy a1, off, s[12:15], s4 offset:4095 glc ; encoding: [0xff,0x4f,0x34,0xe0,0x00,0x01,0x83,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_format_d16_xy a1, off, s[12:15], s4 offset:4095 glc
+
+// GFX90A: buffer_store_format_d16_xy a1, off, s[12:15], s4 offset:4095 slc ; encoding: [0xff,0x0f,0x36,0xe0,0x00,0x01,0x83,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_format_d16_xy a1, off, s[12:15], s4 offset:4095 slc
+
+// GFX90A: buffer_store_format_d16_xyz a[2:3], off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x38,0xe0,0x00,0x02,0x83,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_format_d16_xyz a[2:3], off, s[12:15], s4 offset:4095
+
+// GFX90A: buffer_store_format_d16_xyz a[254:255], off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x38,0xe0,0x00,0xfe,0x83,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_format_d16_xyz a[254:255], off, s[12:15], s4 offset:4095
+
+// GFX90A: buffer_store_format_d16_xyz a[2:3], off, s[16:19], s4 offset:4095 ; encoding: [0xff,0x0f,0x38,0xe0,0x00,0x02,0x84,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_format_d16_xyz a[2:3], off, s[16:19], s4 offset:4095
+
+// GFX90A: buffer_store_format_d16_xyz a[2:3], off, s[96:99], s4 offset:4095 ; encoding: [0xff,0x0f,0x38,0xe0,0x00,0x02,0x98,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_format_d16_xyz a[2:3], off, s[96:99], s4 offset:4095
+
+// GFX90A: buffer_store_format_d16_xyz a[2:3], off, s[12:15], s101 offset:4095 ; encoding: [0xff,0x0f,0x38,0xe0,0x00,0x02,0x83,0x65]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_format_d16_xyz a[2:3], off, s[12:15], s101 offset:4095
+
+// GFX90A: buffer_store_format_d16_xyz a[2:3], off, s[12:15], m0 offset:4095 ; encoding: [0xff,0x0f,0x38,0xe0,0x00,0x02,0x83,0x7c]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_format_d16_xyz a[2:3], off, s[12:15], m0 offset:4095
+
+// GFX90A: buffer_store_format_d16_xyz a[2:3], off, s[12:15], 0 offset:4095 ; encoding: [0xff,0x0f,0x38,0xe0,0x00,0x02,0x83,0x80]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_format_d16_xyz a[2:3], off, s[12:15], 0 offset:4095
+
+// GFX90A: buffer_store_format_d16_xyz a[2:3], off, s[12:15], -1 offset:4095 ; encoding: [0xff,0x0f,0x38,0xe0,0x00,0x02,0x83,0xc1]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_format_d16_xyz a[2:3], off, s[12:15], -1 offset:4095
+
+// GFX90A: buffer_store_format_d16_xyz a[2:3], off, s[12:15], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x38,0xe0,0x00,0x02,0x83,0xf0]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_format_d16_xyz a[2:3], off, s[12:15], 0.5 offset:4095
+
+// GFX90A: buffer_store_format_d16_xyz a[2:3], off, s[12:15], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x38,0xe0,0x00,0x02,0x83,0xf7]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_format_d16_xyz a[2:3], off, s[12:15], -4.0 offset:4095
+
+// GFX90A: buffer_store_format_d16_xyz a[2:3], v0, s[12:15], s4 idxen offset:4095 ; encoding: [0xff,0x2f,0x38,0xe0,0x00,0x02,0x83,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_format_d16_xyz a[2:3], v0, s[12:15], s4 idxen offset:4095
+
+// GFX90A: buffer_store_format_d16_xyz a[2:3], v0, s[12:15], s4 offen offset:4095 ; encoding: [0xff,0x1f,0x38,0xe0,0x00,0x02,0x83,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_format_d16_xyz a[2:3], v0, s[12:15], s4 offen offset:4095
+
+// GFX90A: buffer_store_format_d16_xyz a[2:3], off, s[12:15], s4 ; encoding: [0x00,0x00,0x38,0xe0,0x00,0x02,0x83,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_format_d16_xyz a[2:3], off, s[12:15], s4
+
+// GFX90A: buffer_store_format_d16_xyz a[2:3], off, s[12:15], s4 ; encoding: [0x00,0x00,0x38,0xe0,0x00,0x02,0x83,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_format_d16_xyz a[2:3], off, s[12:15], s4
+
+// GFX90A: buffer_store_format_d16_xyz a[2:3], off, s[12:15], s4 offset:7 ; encoding: [0x07,0x00,0x38,0xe0,0x00,0x02,0x83,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_format_d16_xyz a[2:3], off, s[12:15], s4 offset:7
+
+// GFX90A: buffer_store_format_d16_xyz a[2:3], off, s[12:15], s4 offset:4095 glc ; encoding: [0xff,0x4f,0x38,0xe0,0x00,0x02,0x83,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_format_d16_xyz a[2:3], off, s[12:15], s4 offset:4095 glc
+
+// GFX90A: buffer_store_format_d16_xyz a[2:3], off, s[12:15], s4 offset:4095 slc ; encoding: [0xff,0x0f,0x3a,0xe0,0x00,0x02,0x83,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_format_d16_xyz a[2:3], off, s[12:15], s4 offset:4095 slc
+
+// GFX90A: buffer_store_format_d16_xyzw a[2:3], off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x3c,0xe0,0x00,0x02,0x83,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_format_d16_xyzw a[2:3], off, s[12:15], s4 offset:4095
+
+// GFX90A: buffer_store_format_d16_xyzw a[254:255], off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x3c,0xe0,0x00,0xfe,0x83,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_format_d16_xyzw a[254:255], off, s[12:15], s4 offset:4095
+
+// GFX90A: buffer_store_format_d16_xyzw a[2:3], off, s[16:19], s4 offset:4095 ; encoding: [0xff,0x0f,0x3c,0xe0,0x00,0x02,0x84,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_format_d16_xyzw a[2:3], off, s[16:19], s4 offset:4095
+
+// GFX90A: buffer_store_format_d16_xyzw a[2:3], off, s[96:99], s4 offset:4095 ; encoding: [0xff,0x0f,0x3c,0xe0,0x00,0x02,0x98,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_format_d16_xyzw a[2:3], off, s[96:99], s4 offset:4095
+
+// GFX90A: buffer_store_format_d16_xyzw a[2:3], off, s[12:15], s101 offset:4095 ; encoding: [0xff,0x0f,0x3c,0xe0,0x00,0x02,0x83,0x65]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_format_d16_xyzw a[2:3], off, s[12:15], s101 offset:4095
+
+// GFX90A: buffer_store_format_d16_xyzw a[2:3], off, s[12:15], m0 offset:4095 ; encoding: [0xff,0x0f,0x3c,0xe0,0x00,0x02,0x83,0x7c]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_format_d16_xyzw a[2:3], off, s[12:15], m0 offset:4095
+
+// GFX90A: buffer_store_format_d16_xyzw a[2:3], off, s[12:15], 0 offset:4095 ; encoding: [0xff,0x0f,0x3c,0xe0,0x00,0x02,0x83,0x80]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_format_d16_xyzw a[2:3], off, s[12:15], 0 offset:4095
+
+// GFX90A: buffer_store_format_d16_xyzw a[2:3], off, s[12:15], -1 offset:4095 ; encoding: [0xff,0x0f,0x3c,0xe0,0x00,0x02,0x83,0xc1]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_format_d16_xyzw a[2:3], off, s[12:15], -1 offset:4095
+
+// GFX90A: buffer_store_format_d16_xyzw a[2:3], off, s[12:15], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x3c,0xe0,0x00,0x02,0x83,0xf0]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_format_d16_xyzw a[2:3], off, s[12:15], 0.5 offset:4095
+
+// GFX90A: buffer_store_format_d16_xyzw a[2:3], off, s[12:15], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x3c,0xe0,0x00,0x02,0x83,0xf7]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_format_d16_xyzw a[2:3], off, s[12:15], -4.0 offset:4095
+
+// GFX90A: buffer_store_format_d16_xyzw a[2:3], v0, s[12:15], s4 idxen offset:4095 ; encoding: [0xff,0x2f,0x3c,0xe0,0x00,0x02,0x83,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_format_d16_xyzw a[2:3], v0, s[12:15], s4 idxen offset:4095
+
+// GFX90A: buffer_store_format_d16_xyzw a[2:3], v0, s[12:15], s4 offen offset:4095 ; encoding: [0xff,0x1f,0x3c,0xe0,0x00,0x02,0x83,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_format_d16_xyzw a[2:3], v0, s[12:15], s4 offen offset:4095
+
+// GFX90A: buffer_store_format_d16_xyzw a[2:3], off, s[12:15], s4 ; encoding: [0x00,0x00,0x3c,0xe0,0x00,0x02,0x83,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_format_d16_xyzw a[2:3], off, s[12:15], s4
+
+// GFX90A: buffer_store_format_d16_xyzw a[2:3], off, s[12:15], s4 ; encoding: [0x00,0x00,0x3c,0xe0,0x00,0x02,0x83,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_format_d16_xyzw a[2:3], off, s[12:15], s4
+
+// GFX90A: buffer_store_format_d16_xyzw a[2:3], off, s[12:15], s4 offset:7 ; encoding: [0x07,0x00,0x3c,0xe0,0x00,0x02,0x83,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_format_d16_xyzw a[2:3], off, s[12:15], s4 offset:7
+
+// GFX90A: buffer_store_format_d16_xyzw a[2:3], off, s[12:15], s4 offset:4095 glc ; encoding: [0xff,0x4f,0x3c,0xe0,0x00,0x02,0x83,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_format_d16_xyzw a[2:3], off, s[12:15], s4 offset:4095 glc
+
+// GFX90A: buffer_store_format_d16_xyzw a[2:3], off, s[12:15], s4 offset:4095 slc ; encoding: [0xff,0x0f,0x3e,0xe0,0x00,0x02,0x83,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_format_d16_xyzw a[2:3], off, s[12:15], s4 offset:4095 slc
+
+// GFX90A: buffer_load_ubyte a5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x40,0xe0,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_ubyte a5, off, s[8:11], s3 offset:4095
+
+// GFX90A: buffer_load_ubyte a255, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x40,0xe0,0x00,0xff,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_ubyte a255, off, s[8:11], s3 offset:4095
+
+// GFX90A: buffer_load_ubyte a5, off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x40,0xe0,0x00,0x05,0x83,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_ubyte a5, off, s[12:15], s3 offset:4095
+
+// GFX90A: buffer_load_ubyte a5, off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x40,0xe0,0x00,0x05,0x98,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_ubyte a5, off, s[96:99], s3 offset:4095
+
+// GFX90A: buffer_load_ubyte a5, off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x40,0xe0,0x00,0x05,0x82,0x65]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_ubyte a5, off, s[8:11], s101 offset:4095
+
+// GFX90A: buffer_load_ubyte a5, off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x40,0xe0,0x00,0x05,0x82,0x7c]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_ubyte a5, off, s[8:11], m0 offset:4095
+
+// GFX90A: buffer_load_ubyte a5, off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x40,0xe0,0x00,0x05,0x82,0x80]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_ubyte a5, off, s[8:11], 0 offset:4095
+
+// GFX90A: buffer_load_ubyte a5, off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x40,0xe0,0x00,0x05,0x82,0xc1]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_ubyte a5, off, s[8:11], -1 offset:4095
+
+// GFX90A: buffer_load_ubyte a5, off, s[8:11], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x40,0xe0,0x00,0x05,0x82,0xf0]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_ubyte a5, off, s[8:11], 0.5 offset:4095
+
+// GFX90A: buffer_load_ubyte a5, off, s[8:11], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x40,0xe0,0x00,0x05,0x82,0xf7]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_ubyte a5, off, s[8:11], -4.0 offset:4095
+
+// GFX90A: buffer_load_ubyte a5, v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x40,0xe0,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_ubyte a5, v0, s[8:11], s3 idxen offset:4095
+
+// GFX90A: buffer_load_ubyte a5, v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x40,0xe0,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_ubyte a5, v0, s[8:11], s3 offen offset:4095
+
+// GFX90A: buffer_load_ubyte a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x40,0xe0,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_ubyte a5, off, s[8:11], s3
+
+// GFX90A: buffer_load_ubyte a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x40,0xe0,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_ubyte a5, off, s[8:11], s3
+
+// GFX90A: buffer_load_ubyte a5, off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x40,0xe0,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_ubyte a5, off, s[8:11], s3 offset:7
+
+// GFX90A: buffer_load_ubyte a5, off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x40,0xe0,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_ubyte a5, off, s[8:11], s3 offset:4095 glc
+
+// GFX90A: buffer_load_ubyte a5, off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x42,0xe0,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_ubyte a5, off, s[8:11], s3 offset:4095 slc
+
+// GFX90A: buffer_load_ubyte a5, off, s[8:11], s3 offset:4095 lds ; encoding: [0xff,0x0f,0x41,0xe0,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_ubyte a5, off, s[8:11], s3 offset:4095 lds
+
+// GFX90A: buffer_load_sbyte a5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x44,0xe0,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_sbyte a5, off, s[8:11], s3 offset:4095
+
+// GFX90A: buffer_load_sbyte a255, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x44,0xe0,0x00,0xff,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_sbyte a255, off, s[8:11], s3 offset:4095
+
+// GFX90A: buffer_load_sbyte a5, off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x44,0xe0,0x00,0x05,0x83,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_sbyte a5, off, s[12:15], s3 offset:4095
+
+// GFX90A: buffer_load_sbyte a5, off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x44,0xe0,0x00,0x05,0x98,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_sbyte a5, off, s[96:99], s3 offset:4095
+
+// GFX90A: buffer_load_sbyte a5, off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x44,0xe0,0x00,0x05,0x82,0x65]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_sbyte a5, off, s[8:11], s101 offset:4095
+
+// GFX90A: buffer_load_sbyte a5, off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x44,0xe0,0x00,0x05,0x82,0x7c]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_sbyte a5, off, s[8:11], m0 offset:4095
+
+// GFX90A: buffer_load_sbyte a5, off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x44,0xe0,0x00,0x05,0x82,0x80]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_sbyte a5, off, s[8:11], 0 offset:4095
+
+// GFX90A: buffer_load_sbyte a5, off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x44,0xe0,0x00,0x05,0x82,0xc1]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_sbyte a5, off, s[8:11], -1 offset:4095
+
+// GFX90A: buffer_load_sbyte a5, off, s[8:11], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x44,0xe0,0x00,0x05,0x82,0xf0]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_sbyte a5, off, s[8:11], 0.5 offset:4095
+
+// GFX90A: buffer_load_sbyte a5, off, s[8:11], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x44,0xe0,0x00,0x05,0x82,0xf7]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_sbyte a5, off, s[8:11], -4.0 offset:4095
+
+// GFX90A: buffer_load_sbyte a5, v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x44,0xe0,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_sbyte a5, v0, s[8:11], s3 idxen offset:4095
+
+// GFX90A: buffer_load_sbyte a5, v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x44,0xe0,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_sbyte a5, v0, s[8:11], s3 offen offset:4095
+
+// GFX90A: buffer_load_sbyte a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x44,0xe0,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_sbyte a5, off, s[8:11], s3
+
+// GFX90A: buffer_load_sbyte a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x44,0xe0,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_sbyte a5, off, s[8:11], s3
+
+// GFX90A: buffer_load_sbyte a5, off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x44,0xe0,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_sbyte a5, off, s[8:11], s3 offset:7
+
+// GFX90A: buffer_load_sbyte a5, off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x44,0xe0,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_sbyte a5, off, s[8:11], s3 offset:4095 glc
+
+// GFX90A: buffer_load_sbyte a5, off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x46,0xe0,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_sbyte a5, off, s[8:11], s3 offset:4095 slc
+
+// GFX90A: buffer_load_sbyte a5, off, s[8:11], s3 offset:4095 lds ; encoding: [0xff,0x0f,0x45,0xe0,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_sbyte a5, off, s[8:11], s3 offset:4095 lds
+
+// GFX90A: buffer_load_ushort a5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x48,0xe0,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_ushort a5, off, s[8:11], s3 offset:4095
+
+// GFX90A: buffer_load_ushort a255, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x48,0xe0,0x00,0xff,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_ushort a255, off, s[8:11], s3 offset:4095
+
+// GFX90A: buffer_load_ushort a5, off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x48,0xe0,0x00,0x05,0x83,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_ushort a5, off, s[12:15], s3 offset:4095
+
+// GFX90A: buffer_load_ushort a5, off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x48,0xe0,0x00,0x05,0x98,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_ushort a5, off, s[96:99], s3 offset:4095
+
+// GFX90A: buffer_load_ushort a5, off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x48,0xe0,0x00,0x05,0x82,0x65]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_ushort a5, off, s[8:11], s101 offset:4095
+
+// GFX90A: buffer_load_ushort a5, off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x48,0xe0,0x00,0x05,0x82,0x7c]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_ushort a5, off, s[8:11], m0 offset:4095
+
+// GFX90A: buffer_load_ushort a5, off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x48,0xe0,0x00,0x05,0x82,0x80]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_ushort a5, off, s[8:11], 0 offset:4095
+
+// GFX90A: buffer_load_ushort a5, off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x48,0xe0,0x00,0x05,0x82,0xc1]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_ushort a5, off, s[8:11], -1 offset:4095
+
+// GFX90A: buffer_load_ushort a5, off, s[8:11], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x48,0xe0,0x00,0x05,0x82,0xf0]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_ushort a5, off, s[8:11], 0.5 offset:4095
+
+// GFX90A: buffer_load_ushort a5, off, s[8:11], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x48,0xe0,0x00,0x05,0x82,0xf7]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_ushort a5, off, s[8:11], -4.0 offset:4095
+
+// GFX90A: buffer_load_ushort a5, v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x48,0xe0,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_ushort a5, v0, s[8:11], s3 idxen offset:4095
+
+// GFX90A: buffer_load_ushort a5, v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x48,0xe0,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_ushort a5, v0, s[8:11], s3 offen offset:4095
+
+// GFX90A: buffer_load_ushort a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x48,0xe0,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_ushort a5, off, s[8:11], s3
+
+// GFX90A: buffer_load_ushort a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x48,0xe0,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_ushort a5, off, s[8:11], s3
+
+// GFX90A: buffer_load_ushort a5, off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x48,0xe0,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_ushort a5, off, s[8:11], s3 offset:7
+
+// GFX90A: buffer_load_ushort a5, off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x48,0xe0,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_ushort a5, off, s[8:11], s3 offset:4095 glc
+
+// GFX90A: buffer_load_ushort a5, off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x4a,0xe0,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_ushort a5, off, s[8:11], s3 offset:4095 slc
+
+// GFX90A: buffer_load_ushort a5, off, s[8:11], s3 offset:4095 lds ; encoding: [0xff,0x0f,0x49,0xe0,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_ushort a5, off, s[8:11], s3 offset:4095 lds
+
+// GFX90A: buffer_load_sshort a5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x4c,0xe0,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_sshort a5, off, s[8:11], s3 offset:4095
+
+// GFX90A: buffer_load_sshort a255, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x4c,0xe0,0x00,0xff,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_sshort a255, off, s[8:11], s3 offset:4095
+
+// GFX90A: buffer_load_sshort a5, off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x4c,0xe0,0x00,0x05,0x83,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_sshort a5, off, s[12:15], s3 offset:4095
+
+// GFX90A: buffer_load_sshort a5, off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x4c,0xe0,0x00,0x05,0x98,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_sshort a5, off, s[96:99], s3 offset:4095
+
+// GFX90A: buffer_load_sshort a5, off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x4c,0xe0,0x00,0x05,0x82,0x65]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_sshort a5, off, s[8:11], s101 offset:4095
+
+// GFX90A: buffer_load_sshort a5, off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x4c,0xe0,0x00,0x05,0x82,0x7c]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_sshort a5, off, s[8:11], m0 offset:4095
+
+// GFX90A: buffer_load_sshort a5, off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x4c,0xe0,0x00,0x05,0x82,0x80]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_sshort a5, off, s[8:11], 0 offset:4095
+
+// GFX90A: buffer_load_sshort a5, off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x4c,0xe0,0x00,0x05,0x82,0xc1]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_sshort a5, off, s[8:11], -1 offset:4095
+
+// GFX90A: buffer_load_sshort a5, off, s[8:11], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x4c,0xe0,0x00,0x05,0x82,0xf0]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_sshort a5, off, s[8:11], 0.5 offset:4095
+
+// GFX90A: buffer_load_sshort a5, off, s[8:11], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x4c,0xe0,0x00,0x05,0x82,0xf7]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_sshort a5, off, s[8:11], -4.0 offset:4095
+
+// GFX90A: buffer_load_sshort a5, v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x4c,0xe0,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_sshort a5, v0, s[8:11], s3 idxen offset:4095
+
+// GFX90A: buffer_load_sshort a5, v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x4c,0xe0,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_sshort a5, v0, s[8:11], s3 offen offset:4095
+
+// GFX90A: buffer_load_sshort a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x4c,0xe0,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_sshort a5, off, s[8:11], s3
+
+// GFX90A: buffer_load_sshort a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x4c,0xe0,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_sshort a5, off, s[8:11], s3
+
+// GFX90A: buffer_load_sshort a5, off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x4c,0xe0,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_sshort a5, off, s[8:11], s3 offset:7
+
+// GFX90A: buffer_load_sshort a5, off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x4c,0xe0,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_sshort a5, off, s[8:11], s3 offset:4095 glc
+
+// GFX90A: buffer_load_sshort a5, off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x4e,0xe0,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_sshort a5, off, s[8:11], s3 offset:4095 slc
+
+// GFX90A: buffer_load_sshort a5, off, s[8:11], s3 offset:4095 lds ; encoding: [0xff,0x0f,0x4d,0xe0,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_sshort a5, off, s[8:11], s3 offset:4095 lds
+
+// GFX90A: buffer_load_dword a5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x50,0xe0,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_dword a5, off, s[8:11], s3 offset:4095
+
+// GFX90A: buffer_load_dword a255, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x50,0xe0,0x00,0xff,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_dword a255, off, s[8:11], s3 offset:4095
+
+// GFX90A: buffer_load_dword a5, off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x50,0xe0,0x00,0x05,0x83,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_dword a5, off, s[12:15], s3 offset:4095
+
+// GFX90A: buffer_load_dword a5, off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x50,0xe0,0x00,0x05,0x98,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_dword a5, off, s[96:99], s3 offset:4095
+
+// GFX90A: buffer_load_dword a5, off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x50,0xe0,0x00,0x05,0x82,0x65]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_dword a5, off, s[8:11], s101 offset:4095
+
+// GFX90A: buffer_load_dword a5, off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x50,0xe0,0x00,0x05,0x82,0x7c]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_dword a5, off, s[8:11], m0 offset:4095
+
+// GFX90A: buffer_load_dword a5, off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x50,0xe0,0x00,0x05,0x82,0x80]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_dword a5, off, s[8:11], 0 offset:4095
+
+// GFX90A: buffer_load_dword a5, off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x50,0xe0,0x00,0x05,0x82,0xc1]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_dword a5, off, s[8:11], -1 offset:4095
+
+// GFX90A: buffer_load_dword a5, off, s[8:11], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x50,0xe0,0x00,0x05,0x82,0xf0]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_dword a5, off, s[8:11], 0.5 offset:4095
+
+// GFX90A: buffer_load_dword a5, off, s[8:11], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x50,0xe0,0x00,0x05,0x82,0xf7]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_dword a5, off, s[8:11], -4.0 offset:4095
+
+// GFX90A: buffer_load_dword a5, v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x50,0xe0,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_dword a5, v0, s[8:11], s3 idxen offset:4095
+
+// GFX90A: buffer_load_dword a5, v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x50,0xe0,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_dword a5, v0, s[8:11], s3 offen offset:4095
+
+// GFX90A: buffer_load_dword a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x50,0xe0,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_dword a5, off, s[8:11], s3
+
+// GFX90A: buffer_load_dword a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x50,0xe0,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_dword a5, off, s[8:11], s3
+
+// GFX90A: buffer_load_dword a5, off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x50,0xe0,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_dword a5, off, s[8:11], s3 offset:7
+
+// GFX90A: buffer_load_dword a5, off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x50,0xe0,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_dword a5, off, s[8:11], s3 offset:4095 glc
+
+// GFX90A: buffer_load_dword a5, off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x52,0xe0,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_dword a5, off, s[8:11], s3 offset:4095 slc
+
+// GFX90A: buffer_load_dword a5, off, s[8:11], s3 offset:4095 lds ; encoding: [0xff,0x0f,0x51,0xe0,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_dword a5, off, s[8:11], s3 offset:4095 lds
+
+// GFX90A: buffer_load_dwordx2 a[6:7], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x54,0xe0,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_dwordx2 a[6:7], off, s[8:11], s3 offset:4095
+
+// GFX90A: buffer_load_dwordx2 a[254:255], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x54,0xe0,0x00,0xfe,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_dwordx2 a[254:255], off, s[8:11], s3 offset:4095
+
+// GFX90A: buffer_load_dwordx2 a[6:7], off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x54,0xe0,0x00,0x06,0x83,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_dwordx2 a[6:7], off, s[12:15], s3 offset:4095
+
+// GFX90A: buffer_load_dwordx2 a[6:7], off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x54,0xe0,0x00,0x06,0x98,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_dwordx2 a[6:7], off, s[96:99], s3 offset:4095
+
+// GFX90A: buffer_load_dwordx2 a[6:7], off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x54,0xe0,0x00,0x06,0x82,0x65]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_dwordx2 a[6:7], off, s[8:11], s101 offset:4095
+
+// GFX90A: buffer_load_dwordx2 a[6:7], off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x54,0xe0,0x00,0x06,0x82,0x7c]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_dwordx2 a[6:7], off, s[8:11], m0 offset:4095
+
+// GFX90A: buffer_load_dwordx2 a[6:7], off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x54,0xe0,0x00,0x06,0x82,0x80]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_dwordx2 a[6:7], off, s[8:11], 0 offset:4095
+
+// GFX90A: buffer_load_dwordx2 a[6:7], off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x54,0xe0,0x00,0x06,0x82,0xc1]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_dwordx2 a[6:7], off, s[8:11], -1 offset:4095
+
+// GFX90A: buffer_load_dwordx2 a[6:7], off, s[8:11], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x54,0xe0,0x00,0x06,0x82,0xf0]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_dwordx2 a[6:7], off, s[8:11], 0.5 offset:4095
+
+// GFX90A: buffer_load_dwordx2 a[6:7], off, s[8:11], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x54,0xe0,0x00,0x06,0x82,0xf7]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_dwordx2 a[6:7], off, s[8:11], -4.0 offset:4095
+
+// GFX90A: buffer_load_dwordx2 a[6:7], v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x54,0xe0,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_dwordx2 a[6:7], v0, s[8:11], s3 idxen offset:4095
+
+// GFX90A: buffer_load_dwordx2 a[6:7], v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x54,0xe0,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_dwordx2 a[6:7], v0, s[8:11], s3 offen offset:4095
+
+// GFX90A: buffer_load_dwordx2 a[6:7], off, s[8:11], s3 ; encoding: [0x00,0x00,0x54,0xe0,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_dwordx2 a[6:7], off, s[8:11], s3
+
+// GFX90A: buffer_load_dwordx2 a[6:7], off, s[8:11], s3 ; encoding: [0x00,0x00,0x54,0xe0,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_dwordx2 a[6:7], off, s[8:11], s3
+
+// GFX90A: buffer_load_dwordx2 a[6:7], off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x54,0xe0,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_dwordx2 a[6:7], off, s[8:11], s3 offset:7
+
+// GFX90A: buffer_load_dwordx2 a[6:7], off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x54,0xe0,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_dwordx2 a[6:7], off, s[8:11], s3 offset:4095 glc
+
+// GFX90A: buffer_load_dwordx2 a[6:7], off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x56,0xe0,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_dwordx2 a[6:7], off, s[8:11], s3 offset:4095 slc
+
+// GFX90A: buffer_load_dwordx3 a[6:8], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x58,0xe0,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_dwordx3 a[6:8], off, s[8:11], s3 offset:4095
+
+// GFX90A: buffer_load_dwordx3 a[252:254], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x58,0xe0,0x00,0xfc,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_dwordx3 a[252:254], off, s[8:11], s3 offset:4095
+
+// GFX90A: buffer_load_dwordx3 a[6:8], off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x58,0xe0,0x00,0x06,0x83,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_dwordx3 a[6:8], off, s[12:15], s3 offset:4095
+
+// GFX90A: buffer_load_dwordx3 a[6:8], off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x58,0xe0,0x00,0x06,0x98,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_dwordx3 a[6:8], off, s[96:99], s3 offset:4095
+
+// GFX90A: buffer_load_dwordx3 a[6:8], off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x58,0xe0,0x00,0x06,0x82,0x65]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_dwordx3 a[6:8], off, s[8:11], s101 offset:4095
+
+// GFX90A: buffer_load_dwordx3 a[6:8], off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x58,0xe0,0x00,0x06,0x82,0x7c]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_dwordx3 a[6:8], off, s[8:11], m0 offset:4095
+
+// GFX90A: buffer_load_dwordx3 a[6:8], off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x58,0xe0,0x00,0x06,0x82,0x80]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_dwordx3 a[6:8], off, s[8:11], 0 offset:4095
+
+// GFX90A: buffer_load_dwordx3 a[6:8], off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x58,0xe0,0x00,0x06,0x82,0xc1]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_dwordx3 a[6:8], off, s[8:11], -1 offset:4095
+
+// GFX90A: buffer_load_dwordx3 a[6:8], off, s[8:11], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x58,0xe0,0x00,0x06,0x82,0xf0]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_dwordx3 a[6:8], off, s[8:11], 0.5 offset:4095
+
+// GFX90A: buffer_load_dwordx3 a[6:8], off, s[8:11], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x58,0xe0,0x00,0x06,0x82,0xf7]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_dwordx3 a[6:8], off, s[8:11], -4.0 offset:4095
+
+// GFX90A: buffer_load_dwordx3 a[6:8], v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x58,0xe0,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_dwordx3 a[6:8], v0, s[8:11], s3 idxen offset:4095
+
+// GFX90A: buffer_load_dwordx3 a[6:8], v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x58,0xe0,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_dwordx3 a[6:8], v0, s[8:11], s3 offen offset:4095
+
+// GFX90A: buffer_load_dwordx3 a[6:8], off, s[8:11], s3 ; encoding: [0x00,0x00,0x58,0xe0,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_dwordx3 a[6:8], off, s[8:11], s3
+
+// GFX90A: buffer_load_dwordx3 a[6:8], off, s[8:11], s3 ; encoding: [0x00,0x00,0x58,0xe0,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_dwordx3 a[6:8], off, s[8:11], s3
+
+// GFX90A: buffer_load_dwordx3 a[6:8], off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x58,0xe0,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_dwordx3 a[6:8], off, s[8:11], s3 offset:7
+
+// GFX90A: buffer_load_dwordx3 a[6:8], off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x58,0xe0,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_dwordx3 a[6:8], off, s[8:11], s3 offset:4095 glc
+
+// GFX90A: buffer_load_dwordx3 a[6:8], off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x5a,0xe0,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_dwordx3 a[6:8], off, s[8:11], s3 offset:4095 slc
+
+// GFX90A: buffer_load_dwordx4 a[6:9], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x5c,0xe0,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_dwordx4 a[6:9], off, s[8:11], s3 offset:4095
+
+// GFX90A: buffer_load_dwordx4 a[252:255], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x5c,0xe0,0x00,0xfc,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_dwordx4 a[252:255], off, s[8:11], s3 offset:4095
+
+// GFX90A: buffer_load_dwordx4 a[6:9], off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x5c,0xe0,0x00,0x06,0x83,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_dwordx4 a[6:9], off, s[12:15], s3 offset:4095
+
+// GFX90A: buffer_load_dwordx4 a[6:9], off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x5c,0xe0,0x00,0x06,0x98,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_dwordx4 a[6:9], off, s[96:99], s3 offset:4095
+
+// GFX90A: buffer_load_dwordx4 a[6:9], off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x5c,0xe0,0x00,0x06,0x82,0x65]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_dwordx4 a[6:9], off, s[8:11], s101 offset:4095
+
+// GFX90A: buffer_load_dwordx4 a[6:9], off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x5c,0xe0,0x00,0x06,0x82,0x7c]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_dwordx4 a[6:9], off, s[8:11], m0 offset:4095
+
+// GFX90A: buffer_load_dwordx4 a[6:9], off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x5c,0xe0,0x00,0x06,0x82,0x80]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_dwordx4 a[6:9], off, s[8:11], 0 offset:4095
+
+// GFX90A: buffer_load_dwordx4 a[6:9], off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x5c,0xe0,0x00,0x06,0x82,0xc1]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_dwordx4 a[6:9], off, s[8:11], -1 offset:4095
+
+// GFX90A: buffer_load_dwordx4 a[6:9], off, s[8:11], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x5c,0xe0,0x00,0x06,0x82,0xf0]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_dwordx4 a[6:9], off, s[8:11], 0.5 offset:4095
+
+// GFX90A: buffer_load_dwordx4 a[6:9], off, s[8:11], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x5c,0xe0,0x00,0x06,0x82,0xf7]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_dwordx4 a[6:9], off, s[8:11], -4.0 offset:4095
+
+// GFX90A: buffer_load_dwordx4 a[6:9], v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x5c,0xe0,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_dwordx4 a[6:9], v0, s[8:11], s3 idxen offset:4095
+
+// GFX90A: buffer_load_dwordx4 a[6:9], v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x5c,0xe0,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_dwordx4 a[6:9], v0, s[8:11], s3 offen offset:4095
+
+// GFX90A: buffer_load_dwordx4 a[6:9], off, s[8:11], s3 ; encoding: [0x00,0x00,0x5c,0xe0,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_dwordx4 a[6:9], off, s[8:11], s3
+
+// GFX90A: buffer_load_dwordx4 a[6:9], off, s[8:11], s3 ; encoding: [0x00,0x00,0x5c,0xe0,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_dwordx4 a[6:9], off, s[8:11], s3
+
+// GFX90A: buffer_load_dwordx4 a[6:9], off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x5c,0xe0,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_dwordx4 a[6:9], off, s[8:11], s3 offset:7
+
+// GFX90A: buffer_load_dwordx4 a[6:9], off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x5c,0xe0,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_dwordx4 a[6:9], off, s[8:11], s3 offset:4095 glc
+
+// GFX90A: buffer_load_dwordx4 a[6:9], off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x5e,0xe0,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_dwordx4 a[6:9], off, s[8:11], s3 offset:4095 slc
+
+// GFX90A: buffer_store_byte a1, off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x60,0xe0,0x00,0x01,0x83,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_byte a1, off, s[12:15], s4 offset:4095
+
+// GFX90A: buffer_store_byte a255, off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x60,0xe0,0x00,0xff,0x83,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_byte a255, off, s[12:15], s4 offset:4095
+
+// GFX90A: buffer_store_byte a1, off, s[16:19], s4 offset:4095 ; encoding: [0xff,0x0f,0x60,0xe0,0x00,0x01,0x84,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_byte a1, off, s[16:19], s4 offset:4095
+
+// GFX90A: buffer_store_byte a1, off, s[96:99], s4 offset:4095 ; encoding: [0xff,0x0f,0x60,0xe0,0x00,0x01,0x98,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_byte a1, off, s[96:99], s4 offset:4095
+
+// GFX90A: buffer_store_byte a1, off, s[12:15], s101 offset:4095 ; encoding: [0xff,0x0f,0x60,0xe0,0x00,0x01,0x83,0x65]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_byte a1, off, s[12:15], s101 offset:4095
+
+// GFX90A: buffer_store_byte a1, off, s[12:15], m0 offset:4095 ; encoding: [0xff,0x0f,0x60,0xe0,0x00,0x01,0x83,0x7c]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_byte a1, off, s[12:15], m0 offset:4095
+
+// GFX90A: buffer_store_byte a1, off, s[12:15], 0 offset:4095 ; encoding: [0xff,0x0f,0x60,0xe0,0x00,0x01,0x83,0x80]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_byte a1, off, s[12:15], 0 offset:4095
+
+// GFX90A: buffer_store_byte a1, off, s[12:15], -1 offset:4095 ; encoding: [0xff,0x0f,0x60,0xe0,0x00,0x01,0x83,0xc1]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_byte a1, off, s[12:15], -1 offset:4095
+
+// GFX90A: buffer_store_byte a1, off, s[12:15], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x60,0xe0,0x00,0x01,0x83,0xf0]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_byte a1, off, s[12:15], 0.5 offset:4095
+
+// GFX90A: buffer_store_byte a1, off, s[12:15], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x60,0xe0,0x00,0x01,0x83,0xf7]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_byte a1, off, s[12:15], -4.0 offset:4095
+
+// GFX90A: buffer_store_byte a1, v0, s[12:15], s4 idxen offset:4095 ; encoding: [0xff,0x2f,0x60,0xe0,0x00,0x01,0x83,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_byte a1, v0, s[12:15], s4 idxen offset:4095
+
+// GFX90A: buffer_store_byte a1, v0, s[12:15], s4 offen offset:4095 ; encoding: [0xff,0x1f,0x60,0xe0,0x00,0x01,0x83,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_byte a1, v0, s[12:15], s4 offen offset:4095
+
+// GFX90A: buffer_store_byte a1, off, s[12:15], s4 ; encoding: [0x00,0x00,0x60,0xe0,0x00,0x01,0x83,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_byte a1, off, s[12:15], s4
+
+// GFX90A: buffer_store_byte a1, off, s[12:15], s4 ; encoding: [0x00,0x00,0x60,0xe0,0x00,0x01,0x83,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_byte a1, off, s[12:15], s4
+
+// GFX90A: buffer_store_byte a1, off, s[12:15], s4 offset:7 ; encoding: [0x07,0x00,0x60,0xe0,0x00,0x01,0x83,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_byte a1, off, s[12:15], s4 offset:7
+
+// GFX90A: buffer_store_byte a1, off, s[12:15], s4 offset:4095 glc ; encoding: [0xff,0x4f,0x60,0xe0,0x00,0x01,0x83,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_byte a1, off, s[12:15], s4 offset:4095 glc
+
+// GFX90A: buffer_store_byte a1, off, s[12:15], s4 offset:4095 slc ; encoding: [0xff,0x0f,0x62,0xe0,0x00,0x01,0x83,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_byte a1, off, s[12:15], s4 offset:4095 slc
+
+// GFX90A: buffer_store_byte_d16_hi a1, off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x64,0xe0,0x00,0x01,0x83,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_byte_d16_hi a1, off, s[12:15], s4 offset:4095
+
+// GFX90A: buffer_store_byte_d16_hi a255, off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x64,0xe0,0x00,0xff,0x83,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_byte_d16_hi a255, off, s[12:15], s4 offset:4095
+
+// GFX90A: buffer_store_byte_d16_hi a1, off, s[16:19], s4 offset:4095 ; encoding: [0xff,0x0f,0x64,0xe0,0x00,0x01,0x84,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_byte_d16_hi a1, off, s[16:19], s4 offset:4095
+
+// GFX90A: buffer_store_byte_d16_hi a1, off, s[96:99], s4 offset:4095 ; encoding: [0xff,0x0f,0x64,0xe0,0x00,0x01,0x98,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_byte_d16_hi a1, off, s[96:99], s4 offset:4095
+
+// GFX90A: buffer_store_byte_d16_hi a1, off, s[12:15], s101 offset:4095 ; encoding: [0xff,0x0f,0x64,0xe0,0x00,0x01,0x83,0x65]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_byte_d16_hi a1, off, s[12:15], s101 offset:4095
+
+// GFX90A: buffer_store_byte_d16_hi a1, off, s[12:15], m0 offset:4095 ; encoding: [0xff,0x0f,0x64,0xe0,0x00,0x01,0x83,0x7c]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_byte_d16_hi a1, off, s[12:15], m0 offset:4095
+
+// GFX90A: buffer_store_byte_d16_hi a1, off, s[12:15], 0 offset:4095 ; encoding: [0xff,0x0f,0x64,0xe0,0x00,0x01,0x83,0x80]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_byte_d16_hi a1, off, s[12:15], 0 offset:4095
+
+// GFX90A: buffer_store_byte_d16_hi a1, off, s[12:15], -1 offset:4095 ; encoding: [0xff,0x0f,0x64,0xe0,0x00,0x01,0x83,0xc1]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_byte_d16_hi a1, off, s[12:15], -1 offset:4095
+
+// GFX90A: buffer_store_byte_d16_hi a1, off, s[12:15], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x64,0xe0,0x00,0x01,0x83,0xf0]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_byte_d16_hi a1, off, s[12:15], 0.5 offset:4095
+
+// GFX90A: buffer_store_byte_d16_hi a1, off, s[12:15], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x64,0xe0,0x00,0x01,0x83,0xf7]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_byte_d16_hi a1, off, s[12:15], -4.0 offset:4095
+
+// GFX90A: buffer_store_byte_d16_hi a1, v0, s[12:15], s4 idxen offset:4095 ; encoding: [0xff,0x2f,0x64,0xe0,0x00,0x01,0x83,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_byte_d16_hi a1, v0, s[12:15], s4 idxen offset:4095
+
+// GFX90A: buffer_store_byte_d16_hi a1, v0, s[12:15], s4 offen offset:4095 ; encoding: [0xff,0x1f,0x64,0xe0,0x00,0x01,0x83,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_byte_d16_hi a1, v0, s[12:15], s4 offen offset:4095
+
+// GFX90A: buffer_store_byte_d16_hi a1, off, s[12:15], s4 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x01,0x83,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_byte_d16_hi a1, off, s[12:15], s4
+
+// GFX90A: buffer_store_byte_d16_hi a1, off, s[12:15], s4 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x01,0x83,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_byte_d16_hi a1, off, s[12:15], s4
+
+// GFX90A: buffer_store_byte_d16_hi a1, off, s[12:15], s4 offset:7 ; encoding: [0x07,0x00,0x64,0xe0,0x00,0x01,0x83,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_byte_d16_hi a1, off, s[12:15], s4 offset:7
+
+// GFX90A: buffer_store_byte_d16_hi a1, off, s[12:15], s4 offset:4095 glc ; encoding: [0xff,0x4f,0x64,0xe0,0x00,0x01,0x83,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_byte_d16_hi a1, off, s[12:15], s4 offset:4095 glc
+
+// GFX90A: buffer_store_byte_d16_hi a1, off, s[12:15], s4 offset:4095 slc ; encoding: [0xff,0x0f,0x66,0xe0,0x00,0x01,0x83,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_byte_d16_hi a1, off, s[12:15], s4 offset:4095 slc
+
+// GFX90A: buffer_store_short a1, off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x68,0xe0,0x00,0x01,0x83,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_short a1, off, s[12:15], s4 offset:4095
+
+// GFX90A: buffer_store_short a255, off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x68,0xe0,0x00,0xff,0x83,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_short a255, off, s[12:15], s4 offset:4095
+
+// GFX90A: buffer_store_short a1, off, s[16:19], s4 offset:4095 ; encoding: [0xff,0x0f,0x68,0xe0,0x00,0x01,0x84,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_short a1, off, s[16:19], s4 offset:4095
+
+// GFX90A: buffer_store_short a1, off, s[96:99], s4 offset:4095 ; encoding: [0xff,0x0f,0x68,0xe0,0x00,0x01,0x98,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_short a1, off, s[96:99], s4 offset:4095
+
+// GFX90A: buffer_store_short a1, off, s[12:15], s101 offset:4095 ; encoding: [0xff,0x0f,0x68,0xe0,0x00,0x01,0x83,0x65]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_short a1, off, s[12:15], s101 offset:4095
+
+// GFX90A: buffer_store_short a1, off, s[12:15], m0 offset:4095 ; encoding: [0xff,0x0f,0x68,0xe0,0x00,0x01,0x83,0x7c]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_short a1, off, s[12:15], m0 offset:4095
+
+// GFX90A: buffer_store_short a1, off, s[12:15], 0 offset:4095 ; encoding: [0xff,0x0f,0x68,0xe0,0x00,0x01,0x83,0x80]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_short a1, off, s[12:15], 0 offset:4095
+
+// GFX90A: buffer_store_short a1, off, s[12:15], -1 offset:4095 ; encoding: [0xff,0x0f,0x68,0xe0,0x00,0x01,0x83,0xc1]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_short a1, off, s[12:15], -1 offset:4095
+
+// GFX90A: buffer_store_short a1, off, s[12:15], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x68,0xe0,0x00,0x01,0x83,0xf0]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_short a1, off, s[12:15], 0.5 offset:4095
+
+// GFX90A: buffer_store_short a1, off, s[12:15], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x68,0xe0,0x00,0x01,0x83,0xf7]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_short a1, off, s[12:15], -4.0 offset:4095
+
+// GFX90A: buffer_store_short a1, v0, s[12:15], s4 idxen offset:4095 ; encoding: [0xff,0x2f,0x68,0xe0,0x00,0x01,0x83,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_short a1, v0, s[12:15], s4 idxen offset:4095
+
+// GFX90A: buffer_store_short a1, v0, s[12:15], s4 offen offset:4095 ; encoding: [0xff,0x1f,0x68,0xe0,0x00,0x01,0x83,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_short a1, v0, s[12:15], s4 offen offset:4095
+
+// GFX90A: buffer_store_short a1, off, s[12:15], s4 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x01,0x83,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_short a1, off, s[12:15], s4
+
+// GFX90A: buffer_store_short a1, off, s[12:15], s4 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x01,0x83,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_short a1, off, s[12:15], s4
+
+// GFX90A: buffer_store_short a1, off, s[12:15], s4 offset:7 ; encoding: [0x07,0x00,0x68,0xe0,0x00,0x01,0x83,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_short a1, off, s[12:15], s4 offset:7
+
+// GFX90A: buffer_store_short a1, off, s[12:15], s4 offset:4095 glc ; encoding: [0xff,0x4f,0x68,0xe0,0x00,0x01,0x83,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_short a1, off, s[12:15], s4 offset:4095 glc
+
+// GFX90A: buffer_store_short a1, off, s[12:15], s4 offset:4095 slc ; encoding: [0xff,0x0f,0x6a,0xe0,0x00,0x01,0x83,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_short a1, off, s[12:15], s4 offset:4095 slc
+
+// GFX90A: buffer_store_short_d16_hi a1, off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x6c,0xe0,0x00,0x01,0x83,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_short_d16_hi a1, off, s[12:15], s4 offset:4095
+
+// GFX90A: buffer_store_short_d16_hi a255, off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x6c,0xe0,0x00,0xff,0x83,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_short_d16_hi a255, off, s[12:15], s4 offset:4095
+
+// GFX90A: buffer_store_short_d16_hi a1, off, s[16:19], s4 offset:4095 ; encoding: [0xff,0x0f,0x6c,0xe0,0x00,0x01,0x84,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_short_d16_hi a1, off, s[16:19], s4 offset:4095
+
+// GFX90A: buffer_store_short_d16_hi a1, off, s[96:99], s4 offset:4095 ; encoding: [0xff,0x0f,0x6c,0xe0,0x00,0x01,0x98,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_short_d16_hi a1, off, s[96:99], s4 offset:4095
+
+// GFX90A: buffer_store_short_d16_hi a1, off, s[12:15], s101 offset:4095 ; encoding: [0xff,0x0f,0x6c,0xe0,0x00,0x01,0x83,0x65]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_short_d16_hi a1, off, s[12:15], s101 offset:4095
+
+// GFX90A: buffer_store_short_d16_hi a1, off, s[12:15], m0 offset:4095 ; encoding: [0xff,0x0f,0x6c,0xe0,0x00,0x01,0x83,0x7c]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_short_d16_hi a1, off, s[12:15], m0 offset:4095
+
+// GFX90A: buffer_store_short_d16_hi a1, off, s[12:15], 0 offset:4095 ; encoding: [0xff,0x0f,0x6c,0xe0,0x00,0x01,0x83,0x80]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_short_d16_hi a1, off, s[12:15], 0 offset:4095
+
+// GFX90A: buffer_store_short_d16_hi a1, off, s[12:15], -1 offset:4095 ; encoding: [0xff,0x0f,0x6c,0xe0,0x00,0x01,0x83,0xc1]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_short_d16_hi a1, off, s[12:15], -1 offset:4095
+
+// GFX90A: buffer_store_short_d16_hi a1, off, s[12:15], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x6c,0xe0,0x00,0x01,0x83,0xf0]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_short_d16_hi a1, off, s[12:15], 0.5 offset:4095
+
+// GFX90A: buffer_store_short_d16_hi a1, off, s[12:15], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x6c,0xe0,0x00,0x01,0x83,0xf7]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_short_d16_hi a1, off, s[12:15], -4.0 offset:4095
+
+// GFX90A: buffer_store_short_d16_hi a1, v0, s[12:15], s4 idxen offset:4095 ; encoding: [0xff,0x2f,0x6c,0xe0,0x00,0x01,0x83,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_short_d16_hi a1, v0, s[12:15], s4 idxen offset:4095
+
+// GFX90A: buffer_store_short_d16_hi a1, v0, s[12:15], s4 offen offset:4095 ; encoding: [0xff,0x1f,0x6c,0xe0,0x00,0x01,0x83,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_short_d16_hi a1, v0, s[12:15], s4 offen offset:4095
+
+// GFX90A: buffer_store_short_d16_hi a1, off, s[12:15], s4 ; encoding: [0x00,0x00,0x6c,0xe0,0x00,0x01,0x83,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_short_d16_hi a1, off, s[12:15], s4
+
+// GFX90A: buffer_store_short_d16_hi a1, off, s[12:15], s4 ; encoding: [0x00,0x00,0x6c,0xe0,0x00,0x01,0x83,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_short_d16_hi a1, off, s[12:15], s4
+
+// GFX90A: buffer_store_short_d16_hi a1, off, s[12:15], s4 offset:7 ; encoding: [0x07,0x00,0x6c,0xe0,0x00,0x01,0x83,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_short_d16_hi a1, off, s[12:15], s4 offset:7
+
+// GFX90A: buffer_store_short_d16_hi a1, off, s[12:15], s4 offset:4095 glc ; encoding: [0xff,0x4f,0x6c,0xe0,0x00,0x01,0x83,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_short_d16_hi a1, off, s[12:15], s4 offset:4095 glc
+
+// GFX90A: buffer_store_short_d16_hi a1, off, s[12:15], s4 offset:4095 slc ; encoding: [0xff,0x0f,0x6e,0xe0,0x00,0x01,0x83,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_short_d16_hi a1, off, s[12:15], s4 offset:4095 slc
+
+// GFX90A: buffer_store_dword a1, off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x70,0xe0,0x00,0x01,0x83,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_dword a1, off, s[12:15], s4 offset:4095
+
+// GFX90A: buffer_store_dword a255, off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x70,0xe0,0x00,0xff,0x83,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_dword a255, off, s[12:15], s4 offset:4095
+
+// GFX90A: buffer_store_dword a1, off, s[16:19], s4 offset:4095 ; encoding: [0xff,0x0f,0x70,0xe0,0x00,0x01,0x84,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_dword a1, off, s[16:19], s4 offset:4095
+
+// GFX90A: buffer_store_dword a1, off, s[96:99], s4 offset:4095 ; encoding: [0xff,0x0f,0x70,0xe0,0x00,0x01,0x98,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_dword a1, off, s[96:99], s4 offset:4095
+
+// GFX90A: buffer_store_dword a1, off, s[12:15], s101 offset:4095 ; encoding: [0xff,0x0f,0x70,0xe0,0x00,0x01,0x83,0x65]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_dword a1, off, s[12:15], s101 offset:4095
+
+// GFX90A: buffer_store_dword a1, off, s[12:15], m0 offset:4095 ; encoding: [0xff,0x0f,0x70,0xe0,0x00,0x01,0x83,0x7c]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_dword a1, off, s[12:15], m0 offset:4095
+
+// GFX90A: buffer_store_dword a1, off, s[12:15], 0 offset:4095 ; encoding: [0xff,0x0f,0x70,0xe0,0x00,0x01,0x83,0x80]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_dword a1, off, s[12:15], 0 offset:4095
+
+// GFX90A: buffer_store_dword a1, off, s[12:15], -1 offset:4095 ; encoding: [0xff,0x0f,0x70,0xe0,0x00,0x01,0x83,0xc1]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_dword a1, off, s[12:15], -1 offset:4095
+
+// GFX90A: buffer_store_dword a1, off, s[12:15], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x70,0xe0,0x00,0x01,0x83,0xf0]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_dword a1, off, s[12:15], 0.5 offset:4095
+
+// GFX90A: buffer_store_dword a1, off, s[12:15], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x70,0xe0,0x00,0x01,0x83,0xf7]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_dword a1, off, s[12:15], -4.0 offset:4095
+
+// GFX90A: buffer_store_dword a1, v0, s[12:15], s4 idxen offset:4095 ; encoding: [0xff,0x2f,0x70,0xe0,0x00,0x01,0x83,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_dword a1, v0, s[12:15], s4 idxen offset:4095
+
+// GFX90A: buffer_store_dword a1, v0, s[12:15], s4 offen offset:4095 ; encoding: [0xff,0x1f,0x70,0xe0,0x00,0x01,0x83,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_dword a1, v0, s[12:15], s4 offen offset:4095
+
+// GFX90A: buffer_store_dword a1, off, s[12:15], s4 ; encoding: [0x00,0x00,0x70,0xe0,0x00,0x01,0x83,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_dword a1, off, s[12:15], s4
+
+// GFX90A: buffer_store_dword a1, off, s[12:15], s4 ; encoding: [0x00,0x00,0x70,0xe0,0x00,0x01,0x83,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_dword a1, off, s[12:15], s4
+
+// GFX90A: buffer_store_dword a1, off, s[12:15], s4 offset:7 ; encoding: [0x07,0x00,0x70,0xe0,0x00,0x01,0x83,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_dword a1, off, s[12:15], s4 offset:7
+
+// GFX90A: buffer_store_dword a1, off, s[12:15], s4 offset:4095 glc ; encoding: [0xff,0x4f,0x70,0xe0,0x00,0x01,0x83,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_dword a1, off, s[12:15], s4 offset:4095 glc
+
+// GFX90A: buffer_store_dword a1, off, s[12:15], s4 offset:4095 slc ; encoding: [0xff,0x0f,0x72,0xe0,0x00,0x01,0x83,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_dword a1, off, s[12:15], s4 offset:4095 slc
+
+// GFX90A: buffer_store_dwordx2 a[2:3], off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x74,0xe0,0x00,0x02,0x83,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_dwordx2 a[2:3], off, s[12:15], s4 offset:4095
+
+// GFX90A: buffer_store_dwordx2 a[254:255], off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x74,0xe0,0x00,0xfe,0x83,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_dwordx2 a[254:255], off, s[12:15], s4 offset:4095
+
+// GFX90A: buffer_store_dwordx2 a[2:3], off, s[16:19], s4 offset:4095 ; encoding: [0xff,0x0f,0x74,0xe0,0x00,0x02,0x84,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_dwordx2 a[2:3], off, s[16:19], s4 offset:4095
+
+// GFX90A: buffer_store_dwordx2 a[2:3], off, s[96:99], s4 offset:4095 ; encoding: [0xff,0x0f,0x74,0xe0,0x00,0x02,0x98,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_dwordx2 a[2:3], off, s[96:99], s4 offset:4095
+
+// GFX90A: buffer_store_dwordx2 a[2:3], off, s[12:15], s101 offset:4095 ; encoding: [0xff,0x0f,0x74,0xe0,0x00,0x02,0x83,0x65]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_dwordx2 a[2:3], off, s[12:15], s101 offset:4095
+
+// GFX90A: buffer_store_dwordx2 a[2:3], off, s[12:15], m0 offset:4095 ; encoding: [0xff,0x0f,0x74,0xe0,0x00,0x02,0x83,0x7c]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_dwordx2 a[2:3], off, s[12:15], m0 offset:4095
+
+// GFX90A: buffer_store_dwordx2 a[2:3], off, s[12:15], 0 offset:4095 ; encoding: [0xff,0x0f,0x74,0xe0,0x00,0x02,0x83,0x80]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_dwordx2 a[2:3], off, s[12:15], 0 offset:4095
+
+// GFX90A: buffer_store_dwordx2 a[2:3], off, s[12:15], -1 offset:4095 ; encoding: [0xff,0x0f,0x74,0xe0,0x00,0x02,0x83,0xc1]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_dwordx2 a[2:3], off, s[12:15], -1 offset:4095
+
+// GFX90A: buffer_store_dwordx2 a[2:3], off, s[12:15], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x74,0xe0,0x00,0x02,0x83,0xf0]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_dwordx2 a[2:3], off, s[12:15], 0.5 offset:4095
+
+// GFX90A: buffer_store_dwordx2 a[2:3], off, s[12:15], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x74,0xe0,0x00,0x02,0x83,0xf7]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_dwordx2 a[2:3], off, s[12:15], -4.0 offset:4095
+
+// GFX90A: buffer_store_dwordx2 a[2:3], v0, s[12:15], s4 idxen offset:4095 ; encoding: [0xff,0x2f,0x74,0xe0,0x00,0x02,0x83,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_dwordx2 a[2:3], v0, s[12:15], s4 idxen offset:4095
+
+// GFX90A: buffer_store_dwordx2 a[2:3], v0, s[12:15], s4 offen offset:4095 ; encoding: [0xff,0x1f,0x74,0xe0,0x00,0x02,0x83,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_dwordx2 a[2:3], v0, s[12:15], s4 offen offset:4095
+
+// GFX90A: buffer_store_dwordx2 a[2:3], off, s[12:15], s4 ; encoding: [0x00,0x00,0x74,0xe0,0x00,0x02,0x83,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_dwordx2 a[2:3], off, s[12:15], s4
+
+// GFX90A: buffer_store_dwordx2 a[2:3], off, s[12:15], s4 ; encoding: [0x00,0x00,0x74,0xe0,0x00,0x02,0x83,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_dwordx2 a[2:3], off, s[12:15], s4
+
+// GFX90A: buffer_store_dwordx2 a[2:3], off, s[12:15], s4 offset:7 ; encoding: [0x07,0x00,0x74,0xe0,0x00,0x02,0x83,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_dwordx2 a[2:3], off, s[12:15], s4 offset:7
+
+// GFX90A: buffer_store_dwordx2 a[2:3], off, s[12:15], s4 offset:4095 glc ; encoding: [0xff,0x4f,0x74,0xe0,0x00,0x02,0x83,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_dwordx2 a[2:3], off, s[12:15], s4 offset:4095 glc
+
+// GFX90A: buffer_store_dwordx2 a[2:3], off, s[12:15], s4 offset:4095 slc ; encoding: [0xff,0x0f,0x76,0xe0,0x00,0x02,0x83,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_dwordx2 a[2:3], off, s[12:15], s4 offset:4095 slc
+
+// GFX90A: buffer_store_dwordx3 a[2:4], off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x78,0xe0,0x00,0x02,0x83,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_dwordx3 a[2:4], off, s[12:15], s4 offset:4095
+
+// GFX90A: buffer_store_dwordx3 a[252:254], off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x78,0xe0,0x00,0xfc,0x83,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_dwordx3 a[252:254], off, s[12:15], s4 offset:4095
+
+// GFX90A: buffer_store_dwordx3 a[2:4], off, s[16:19], s4 offset:4095 ; encoding: [0xff,0x0f,0x78,0xe0,0x00,0x02,0x84,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_dwordx3 a[2:4], off, s[16:19], s4 offset:4095
+
+// GFX90A: buffer_store_dwordx3 a[2:4], off, s[96:99], s4 offset:4095 ; encoding: [0xff,0x0f,0x78,0xe0,0x00,0x02,0x98,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_dwordx3 a[2:4], off, s[96:99], s4 offset:4095
+
+// GFX90A: buffer_store_dwordx3 a[2:4], off, s[12:15], s101 offset:4095 ; encoding: [0xff,0x0f,0x78,0xe0,0x00,0x02,0x83,0x65]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_dwordx3 a[2:4], off, s[12:15], s101 offset:4095
+
+// GFX90A: buffer_store_dwordx3 a[2:4], off, s[12:15], m0 offset:4095 ; encoding: [0xff,0x0f,0x78,0xe0,0x00,0x02,0x83,0x7c]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_dwordx3 a[2:4], off, s[12:15], m0 offset:4095
+
+// GFX90A: buffer_store_dwordx3 a[2:4], off, s[12:15], 0 offset:4095 ; encoding: [0xff,0x0f,0x78,0xe0,0x00,0x02,0x83,0x80]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_dwordx3 a[2:4], off, s[12:15], 0 offset:4095
+
+// GFX90A: buffer_store_dwordx3 a[2:4], off, s[12:15], -1 offset:4095 ; encoding: [0xff,0x0f,0x78,0xe0,0x00,0x02,0x83,0xc1]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_dwordx3 a[2:4], off, s[12:15], -1 offset:4095
+
+// GFX90A: buffer_store_dwordx3 a[2:4], off, s[12:15], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x78,0xe0,0x00,0x02,0x83,0xf0]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_dwordx3 a[2:4], off, s[12:15], 0.5 offset:4095
+
+// GFX90A: buffer_store_dwordx3 a[2:4], off, s[12:15], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x78,0xe0,0x00,0x02,0x83,0xf7]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_dwordx3 a[2:4], off, s[12:15], -4.0 offset:4095
+
+// GFX90A: buffer_store_dwordx3 a[2:4], v0, s[12:15], s4 idxen offset:4095 ; encoding: [0xff,0x2f,0x78,0xe0,0x00,0x02,0x83,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_dwordx3 a[2:4], v0, s[12:15], s4 idxen offset:4095
+
+// GFX90A: buffer_store_dwordx3 a[2:4], v0, s[12:15], s4 offen offset:4095 ; encoding: [0xff,0x1f,0x78,0xe0,0x00,0x02,0x83,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_dwordx3 a[2:4], v0, s[12:15], s4 offen offset:4095
+
+// GFX90A: buffer_store_dwordx3 a[2:4], off, s[12:15], s4 ; encoding: [0x00,0x00,0x78,0xe0,0x00,0x02,0x83,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_dwordx3 a[2:4], off, s[12:15], s4
+
+// GFX90A: buffer_store_dwordx3 a[2:4], off, s[12:15], s4 ; encoding: [0x00,0x00,0x78,0xe0,0x00,0x02,0x83,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_dwordx3 a[2:4], off, s[12:15], s4
+
+// GFX90A: buffer_store_dwordx3 a[2:4], off, s[12:15], s4 offset:7 ; encoding: [0x07,0x00,0x78,0xe0,0x00,0x02,0x83,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_dwordx3 a[2:4], off, s[12:15], s4 offset:7
+
+// GFX90A: buffer_store_dwordx3 a[2:4], off, s[12:15], s4 offset:4095 glc ; encoding: [0xff,0x4f,0x78,0xe0,0x00,0x02,0x83,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_dwordx3 a[2:4], off, s[12:15], s4 offset:4095 glc
+
+// GFX90A: buffer_store_dwordx3 a[2:4], off, s[12:15], s4 offset:4095 slc ; encoding: [0xff,0x0f,0x7a,0xe0,0x00,0x02,0x83,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_dwordx3 a[2:4], off, s[12:15], s4 offset:4095 slc
+
+// GFX90A: buffer_store_dwordx4 a[2:5], off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x7c,0xe0,0x00,0x02,0x83,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_dwordx4 a[2:5], off, s[12:15], s4 offset:4095
+
+// GFX90A: buffer_store_dwordx4 a[252:255], off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x7c,0xe0,0x00,0xfc,0x83,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_dwordx4 a[252:255], off, s[12:15], s4 offset:4095
+
+// GFX90A: buffer_store_dwordx4 a[2:5], off, s[16:19], s4 offset:4095 ; encoding: [0xff,0x0f,0x7c,0xe0,0x00,0x02,0x84,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_dwordx4 a[2:5], off, s[16:19], s4 offset:4095
+
+// GFX90A: buffer_store_dwordx4 a[2:5], off, s[96:99], s4 offset:4095 ; encoding: [0xff,0x0f,0x7c,0xe0,0x00,0x02,0x98,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_dwordx4 a[2:5], off, s[96:99], s4 offset:4095
+
+// GFX90A: buffer_store_dwordx4 a[2:5], off, s[12:15], s101 offset:4095 ; encoding: [0xff,0x0f,0x7c,0xe0,0x00,0x02,0x83,0x65]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_dwordx4 a[2:5], off, s[12:15], s101 offset:4095
+
+// GFX90A: buffer_store_dwordx4 a[2:5], off, s[12:15], m0 offset:4095 ; encoding: [0xff,0x0f,0x7c,0xe0,0x00,0x02,0x83,0x7c]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_dwordx4 a[2:5], off, s[12:15], m0 offset:4095
+
+// GFX90A: buffer_store_dwordx4 a[2:5], off, s[12:15], 0 offset:4095 ; encoding: [0xff,0x0f,0x7c,0xe0,0x00,0x02,0x83,0x80]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_dwordx4 a[2:5], off, s[12:15], 0 offset:4095
+
+// GFX90A: buffer_store_dwordx4 a[2:5], off, s[12:15], -1 offset:4095 ; encoding: [0xff,0x0f,0x7c,0xe0,0x00,0x02,0x83,0xc1]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_dwordx4 a[2:5], off, s[12:15], -1 offset:4095
+
+// GFX90A: buffer_store_dwordx4 a[2:5], off, s[12:15], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x7c,0xe0,0x00,0x02,0x83,0xf0]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_dwordx4 a[2:5], off, s[12:15], 0.5 offset:4095
+
+// GFX90A: buffer_store_dwordx4 a[2:5], off, s[12:15], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x7c,0xe0,0x00,0x02,0x83,0xf7]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_dwordx4 a[2:5], off, s[12:15], -4.0 offset:4095
+
+// GFX90A: buffer_store_dwordx4 a[2:5], v0, s[12:15], s4 idxen offset:4095 ; encoding: [0xff,0x2f,0x7c,0xe0,0x00,0x02,0x83,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_dwordx4 a[2:5], v0, s[12:15], s4 idxen offset:4095
+
+// GFX90A: buffer_store_dwordx4 a[2:5], v0, s[12:15], s4 offen offset:4095 ; encoding: [0xff,0x1f,0x7c,0xe0,0x00,0x02,0x83,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_dwordx4 a[2:5], v0, s[12:15], s4 offen offset:4095
+
+// GFX90A: buffer_store_dwordx4 a[2:5], off, s[12:15], s4 ; encoding: [0x00,0x00,0x7c,0xe0,0x00,0x02,0x83,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_dwordx4 a[2:5], off, s[12:15], s4
+
+// GFX90A: buffer_store_dwordx4 a[2:5], off, s[12:15], s4 ; encoding: [0x00,0x00,0x7c,0xe0,0x00,0x02,0x83,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_dwordx4 a[2:5], off, s[12:15], s4
+
+// GFX90A: buffer_store_dwordx4 a[2:5], off, s[12:15], s4 offset:7 ; encoding: [0x07,0x00,0x7c,0xe0,0x00,0x02,0x83,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_dwordx4 a[2:5], off, s[12:15], s4 offset:7
+
+// GFX90A: buffer_store_dwordx4 a[2:5], off, s[12:15], s4 offset:4095 glc ; encoding: [0xff,0x4f,0x7c,0xe0,0x00,0x02,0x83,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_dwordx4 a[2:5], off, s[12:15], s4 offset:4095 glc
+
+// GFX90A: buffer_store_dwordx4 a[2:5], off, s[12:15], s4 offset:4095 slc ; encoding: [0xff,0x0f,0x7e,0xe0,0x00,0x02,0x83,0x04]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_store_dwordx4 a[2:5], off, s[12:15], s4 offset:4095 slc
+
+// GFX90A: buffer_load_ubyte_d16 a5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x80,0xe0,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_ubyte_d16 a5, off, s[8:11], s3 offset:4095
+
+// GFX90A: buffer_load_ubyte_d16 a255, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x80,0xe0,0x00,0xff,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_ubyte_d16 a255, off, s[8:11], s3 offset:4095
+
+// GFX90A: buffer_load_ubyte_d16 a5, off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x80,0xe0,0x00,0x05,0x83,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_ubyte_d16 a5, off, s[12:15], s3 offset:4095
+
+// GFX90A: buffer_load_ubyte_d16 a5, off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x80,0xe0,0x00,0x05,0x98,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_ubyte_d16 a5, off, s[96:99], s3 offset:4095
+
+// GFX90A: buffer_load_ubyte_d16 a5, off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x80,0xe0,0x00,0x05,0x82,0x65]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_ubyte_d16 a5, off, s[8:11], s101 offset:4095
+
+// GFX90A: buffer_load_ubyte_d16 a5, off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x80,0xe0,0x00,0x05,0x82,0x7c]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_ubyte_d16 a5, off, s[8:11], m0 offset:4095
+
+// GFX90A: buffer_load_ubyte_d16 a5, off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x80,0xe0,0x00,0x05,0x82,0x80]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_ubyte_d16 a5, off, s[8:11], 0 offset:4095
+
+// GFX90A: buffer_load_ubyte_d16 a5, off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x80,0xe0,0x00,0x05,0x82,0xc1]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_ubyte_d16 a5, off, s[8:11], -1 offset:4095
+
+// GFX90A: buffer_load_ubyte_d16 a5, off, s[8:11], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x80,0xe0,0x00,0x05,0x82,0xf0]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_ubyte_d16 a5, off, s[8:11], 0.5 offset:4095
+
+// GFX90A: buffer_load_ubyte_d16 a5, off, s[8:11], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x80,0xe0,0x00,0x05,0x82,0xf7]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_ubyte_d16 a5, off, s[8:11], -4.0 offset:4095
+
+// GFX90A: buffer_load_ubyte_d16 a5, v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x80,0xe0,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_ubyte_d16 a5, v0, s[8:11], s3 idxen offset:4095
+
+// GFX90A: buffer_load_ubyte_d16 a5, v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x80,0xe0,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_ubyte_d16 a5, v0, s[8:11], s3 offen offset:4095
+
+// GFX90A: buffer_load_ubyte_d16 a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x80,0xe0,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_ubyte_d16 a5, off, s[8:11], s3
+
+// GFX90A: buffer_load_ubyte_d16 a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x80,0xe0,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_ubyte_d16 a5, off, s[8:11], s3
+
+// GFX90A: buffer_load_ubyte_d16 a5, off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x80,0xe0,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_ubyte_d16 a5, off, s[8:11], s3 offset:7
+
+// GFX90A: buffer_load_ubyte_d16 a5, off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x80,0xe0,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_ubyte_d16 a5, off, s[8:11], s3 offset:4095 glc
+
+// GFX90A: buffer_load_ubyte_d16 a5, off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x82,0xe0,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_ubyte_d16 a5, off, s[8:11], s3 offset:4095 slc
+
+// GFX90A: buffer_load_ubyte_d16_hi a5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x84,0xe0,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_ubyte_d16_hi a5, off, s[8:11], s3 offset:4095
+
+// GFX90A: buffer_load_ubyte_d16_hi a255, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x84,0xe0,0x00,0xff,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_ubyte_d16_hi a255, off, s[8:11], s3 offset:4095
+
+// GFX90A: buffer_load_ubyte_d16_hi a5, off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x84,0xe0,0x00,0x05,0x83,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_ubyte_d16_hi a5, off, s[12:15], s3 offset:4095
+
+// GFX90A: buffer_load_ubyte_d16_hi a5, off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x84,0xe0,0x00,0x05,0x98,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_ubyte_d16_hi a5, off, s[96:99], s3 offset:4095
+
+// GFX90A: buffer_load_ubyte_d16_hi a5, off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x84,0xe0,0x00,0x05,0x82,0x65]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_ubyte_d16_hi a5, off, s[8:11], s101 offset:4095
+
+// GFX90A: buffer_load_ubyte_d16_hi a5, off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x84,0xe0,0x00,0x05,0x82,0x7c]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_ubyte_d16_hi a5, off, s[8:11], m0 offset:4095
+
+// GFX90A: buffer_load_ubyte_d16_hi a5, off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x84,0xe0,0x00,0x05,0x82,0x80]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_ubyte_d16_hi a5, off, s[8:11], 0 offset:4095
+
+// GFX90A: buffer_load_ubyte_d16_hi a5, off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x84,0xe0,0x00,0x05,0x82,0xc1]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_ubyte_d16_hi a5, off, s[8:11], -1 offset:4095
+
+// GFX90A: buffer_load_ubyte_d16_hi a5, off, s[8:11], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x84,0xe0,0x00,0x05,0x82,0xf0]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_ubyte_d16_hi a5, off, s[8:11], 0.5 offset:4095
+
+// GFX90A: buffer_load_ubyte_d16_hi a5, off, s[8:11], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x84,0xe0,0x00,0x05,0x82,0xf7]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_ubyte_d16_hi a5, off, s[8:11], -4.0 offset:4095
+
+// GFX90A: buffer_load_ubyte_d16_hi a5, v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x84,0xe0,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_ubyte_d16_hi a5, v0, s[8:11], s3 idxen offset:4095
+
+// GFX90A: buffer_load_ubyte_d16_hi a5, v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x84,0xe0,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_ubyte_d16_hi a5, v0, s[8:11], s3 offen offset:4095
+
+// GFX90A: buffer_load_ubyte_d16_hi a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x84,0xe0,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_ubyte_d16_hi a5, off, s[8:11], s3
+
+// GFX90A: buffer_load_ubyte_d16_hi a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x84,0xe0,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_ubyte_d16_hi a5, off, s[8:11], s3
+
+// GFX90A: buffer_load_ubyte_d16_hi a5, off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x84,0xe0,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_ubyte_d16_hi a5, off, s[8:11], s3 offset:7
+
+// GFX90A: buffer_load_ubyte_d16_hi a5, off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x84,0xe0,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_ubyte_d16_hi a5, off, s[8:11], s3 offset:4095 glc
+
+// GFX90A: buffer_load_ubyte_d16_hi a5, off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x86,0xe0,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_ubyte_d16_hi a5, off, s[8:11], s3 offset:4095 slc
+
+// GFX90A: buffer_load_sbyte_d16 a5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x88,0xe0,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_sbyte_d16 a5, off, s[8:11], s3 offset:4095
+
+// GFX90A: buffer_load_sbyte_d16 a255, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x88,0xe0,0x00,0xff,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_sbyte_d16 a255, off, s[8:11], s3 offset:4095
+
+// GFX90A: buffer_load_sbyte_d16 a5, off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x88,0xe0,0x00,0x05,0x83,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_sbyte_d16 a5, off, s[12:15], s3 offset:4095
+
+// GFX90A: buffer_load_sbyte_d16 a5, off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x88,0xe0,0x00,0x05,0x98,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_sbyte_d16 a5, off, s[96:99], s3 offset:4095
+
+// GFX90A: buffer_load_sbyte_d16 a5, off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x88,0xe0,0x00,0x05,0x82,0x65]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_sbyte_d16 a5, off, s[8:11], s101 offset:4095
+
+// GFX90A: buffer_load_sbyte_d16 a5, off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x88,0xe0,0x00,0x05,0x82,0x7c]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_sbyte_d16 a5, off, s[8:11], m0 offset:4095
+
+// GFX90A: buffer_load_sbyte_d16 a5, off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x88,0xe0,0x00,0x05,0x82,0x80]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_sbyte_d16 a5, off, s[8:11], 0 offset:4095
+
+// GFX90A: buffer_load_sbyte_d16 a5, off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x88,0xe0,0x00,0x05,0x82,0xc1]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_sbyte_d16 a5, off, s[8:11], -1 offset:4095
+
+// GFX90A: buffer_load_sbyte_d16 a5, off, s[8:11], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x88,0xe0,0x00,0x05,0x82,0xf0]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_sbyte_d16 a5, off, s[8:11], 0.5 offset:4095
+
+// GFX90A: buffer_load_sbyte_d16 a5, off, s[8:11], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x88,0xe0,0x00,0x05,0x82,0xf7]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_sbyte_d16 a5, off, s[8:11], -4.0 offset:4095
+
+// GFX90A: buffer_load_sbyte_d16 a5, v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x88,0xe0,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_sbyte_d16 a5, v0, s[8:11], s3 idxen offset:4095
+
+// GFX90A: buffer_load_sbyte_d16 a5, v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x88,0xe0,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_sbyte_d16 a5, v0, s[8:11], s3 offen offset:4095
+
+// GFX90A: buffer_load_sbyte_d16 a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x88,0xe0,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_sbyte_d16 a5, off, s[8:11], s3
+
+// GFX90A: buffer_load_sbyte_d16 a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x88,0xe0,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_sbyte_d16 a5, off, s[8:11], s3
+
+// GFX90A: buffer_load_sbyte_d16 a5, off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x88,0xe0,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_sbyte_d16 a5, off, s[8:11], s3 offset:7
+
+// GFX90A: buffer_load_sbyte_d16 a5, off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x88,0xe0,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_sbyte_d16 a5, off, s[8:11], s3 offset:4095 glc
+
+// GFX90A: buffer_load_sbyte_d16 a5, off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x8a,0xe0,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_sbyte_d16 a5, off, s[8:11], s3 offset:4095 slc
+
+// GFX90A: buffer_load_sbyte_d16_hi a5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x8c,0xe0,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_sbyte_d16_hi a5, off, s[8:11], s3 offset:4095
+
+// GFX90A: buffer_load_sbyte_d16_hi a255, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x8c,0xe0,0x00,0xff,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_sbyte_d16_hi a255, off, s[8:11], s3 offset:4095
+
+// GFX90A: buffer_load_sbyte_d16_hi a5, off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x8c,0xe0,0x00,0x05,0x83,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_sbyte_d16_hi a5, off, s[12:15], s3 offset:4095
+
+// GFX90A: buffer_load_sbyte_d16_hi a5, off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x8c,0xe0,0x00,0x05,0x98,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_sbyte_d16_hi a5, off, s[96:99], s3 offset:4095
+
+// GFX90A: buffer_load_sbyte_d16_hi a5, off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x8c,0xe0,0x00,0x05,0x82,0x65]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_sbyte_d16_hi a5, off, s[8:11], s101 offset:4095
+
+// GFX90A: buffer_load_sbyte_d16_hi a5, off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x8c,0xe0,0x00,0x05,0x82,0x7c]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_sbyte_d16_hi a5, off, s[8:11], m0 offset:4095
+
+// GFX90A: buffer_load_sbyte_d16_hi a5, off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x8c,0xe0,0x00,0x05,0x82,0x80]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_sbyte_d16_hi a5, off, s[8:11], 0 offset:4095
+
+// GFX90A: buffer_load_sbyte_d16_hi a5, off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x8c,0xe0,0x00,0x05,0x82,0xc1]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_sbyte_d16_hi a5, off, s[8:11], -1 offset:4095
+
+// GFX90A: buffer_load_sbyte_d16_hi a5, off, s[8:11], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x8c,0xe0,0x00,0x05,0x82,0xf0]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_sbyte_d16_hi a5, off, s[8:11], 0.5 offset:4095
+
+// GFX90A: buffer_load_sbyte_d16_hi a5, off, s[8:11], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x8c,0xe0,0x00,0x05,0x82,0xf7]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_sbyte_d16_hi a5, off, s[8:11], -4.0 offset:4095
+
+// GFX90A: buffer_load_sbyte_d16_hi a5, v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x8c,0xe0,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_sbyte_d16_hi a5, v0, s[8:11], s3 idxen offset:4095
+
+// GFX90A: buffer_load_sbyte_d16_hi a5, v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x8c,0xe0,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_sbyte_d16_hi a5, v0, s[8:11], s3 offen offset:4095
+
+// GFX90A: buffer_load_sbyte_d16_hi a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x8c,0xe0,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_sbyte_d16_hi a5, off, s[8:11], s3
+
+// GFX90A: buffer_load_sbyte_d16_hi a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x8c,0xe0,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_sbyte_d16_hi a5, off, s[8:11], s3
+
+// GFX90A: buffer_load_sbyte_d16_hi a5, off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x8c,0xe0,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_sbyte_d16_hi a5, off, s[8:11], s3 offset:7
+
+// GFX90A: buffer_load_sbyte_d16_hi a5, off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x8c,0xe0,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_sbyte_d16_hi a5, off, s[8:11], s3 offset:4095 glc
+
+// GFX90A: buffer_load_sbyte_d16_hi a5, off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x8e,0xe0,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_sbyte_d16_hi a5, off, s[8:11], s3 offset:4095 slc
+
+// GFX90A: buffer_load_short_d16 a5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x90,0xe0,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_short_d16 a5, off, s[8:11], s3 offset:4095
+
+// GFX90A: buffer_load_short_d16 a255, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x90,0xe0,0x00,0xff,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_short_d16 a255, off, s[8:11], s3 offset:4095
+
+// GFX90A: buffer_load_short_d16 a5, off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x90,0xe0,0x00,0x05,0x83,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_short_d16 a5, off, s[12:15], s3 offset:4095
+
+// GFX90A: buffer_load_short_d16 a5, off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x90,0xe0,0x00,0x05,0x98,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_short_d16 a5, off, s[96:99], s3 offset:4095
+
+// GFX90A: buffer_load_short_d16 a5, off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x90,0xe0,0x00,0x05,0x82,0x65]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_short_d16 a5, off, s[8:11], s101 offset:4095
+
+// GFX90A: buffer_load_short_d16 a5, off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x90,0xe0,0x00,0x05,0x82,0x7c]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_short_d16 a5, off, s[8:11], m0 offset:4095
+
+// GFX90A: buffer_load_short_d16 a5, off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x90,0xe0,0x00,0x05,0x82,0x80]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_short_d16 a5, off, s[8:11], 0 offset:4095
+
+// GFX90A: buffer_load_short_d16 a5, off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x90,0xe0,0x00,0x05,0x82,0xc1]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_short_d16 a5, off, s[8:11], -1 offset:4095
+
+// GFX90A: buffer_load_short_d16 a5, off, s[8:11], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x90,0xe0,0x00,0x05,0x82,0xf0]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_short_d16 a5, off, s[8:11], 0.5 offset:4095
+
+// GFX90A: buffer_load_short_d16 a5, off, s[8:11], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x90,0xe0,0x00,0x05,0x82,0xf7]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_short_d16 a5, off, s[8:11], -4.0 offset:4095
+
+// GFX90A: buffer_load_short_d16 a5, v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x90,0xe0,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_short_d16 a5, v0, s[8:11], s3 idxen offset:4095
+
+// GFX90A: buffer_load_short_d16 a5, v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x90,0xe0,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_short_d16 a5, v0, s[8:11], s3 offen offset:4095
+
+// GFX90A: buffer_load_short_d16 a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x90,0xe0,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_short_d16 a5, off, s[8:11], s3
+
+// GFX90A: buffer_load_short_d16 a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x90,0xe0,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_short_d16 a5, off, s[8:11], s3
+
+// GFX90A: buffer_load_short_d16 a5, off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x90,0xe0,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_short_d16 a5, off, s[8:11], s3 offset:7
+
+// GFX90A: buffer_load_short_d16 a5, off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x90,0xe0,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_short_d16 a5, off, s[8:11], s3 offset:4095 glc
+
+// GFX90A: buffer_load_short_d16 a5, off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x92,0xe0,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_short_d16 a5, off, s[8:11], s3 offset:4095 slc
+
+// GFX90A: buffer_load_short_d16_hi a5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x94,0xe0,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_short_d16_hi a5, off, s[8:11], s3 offset:4095
+
+// GFX90A: buffer_load_short_d16_hi a255, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x94,0xe0,0x00,0xff,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_short_d16_hi a255, off, s[8:11], s3 offset:4095
+
+// GFX90A: buffer_load_short_d16_hi a5, off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x94,0xe0,0x00,0x05,0x83,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_short_d16_hi a5, off, s[12:15], s3 offset:4095
+
+// GFX90A: buffer_load_short_d16_hi a5, off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x94,0xe0,0x00,0x05,0x98,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_short_d16_hi a5, off, s[96:99], s3 offset:4095
+
+// GFX90A: buffer_load_short_d16_hi a5, off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x94,0xe0,0x00,0x05,0x82,0x65]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_short_d16_hi a5, off, s[8:11], s101 offset:4095
+
+// GFX90A: buffer_load_short_d16_hi a5, off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x94,0xe0,0x00,0x05,0x82,0x7c]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_short_d16_hi a5, off, s[8:11], m0 offset:4095
+
+// GFX90A: buffer_load_short_d16_hi a5, off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x94,0xe0,0x00,0x05,0x82,0x80]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_short_d16_hi a5, off, s[8:11], 0 offset:4095
+
+// GFX90A: buffer_load_short_d16_hi a5, off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x94,0xe0,0x00,0x05,0x82,0xc1]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_short_d16_hi a5, off, s[8:11], -1 offset:4095
+
+// GFX90A: buffer_load_short_d16_hi a5, off, s[8:11], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x94,0xe0,0x00,0x05,0x82,0xf0]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_short_d16_hi a5, off, s[8:11], 0.5 offset:4095
+
+// GFX90A: buffer_load_short_d16_hi a5, off, s[8:11], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x94,0xe0,0x00,0x05,0x82,0xf7]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_short_d16_hi a5, off, s[8:11], -4.0 offset:4095
+
+// GFX90A: buffer_load_short_d16_hi a5, v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x94,0xe0,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_short_d16_hi a5, v0, s[8:11], s3 idxen offset:4095
+
+// GFX90A: buffer_load_short_d16_hi a5, v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x94,0xe0,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_short_d16_hi a5, v0, s[8:11], s3 offen offset:4095
+
+// GFX90A: buffer_load_short_d16_hi a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x94,0xe0,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_short_d16_hi a5, off, s[8:11], s3
+
+// GFX90A: buffer_load_short_d16_hi a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x94,0xe0,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_short_d16_hi a5, off, s[8:11], s3
+
+// GFX90A: buffer_load_short_d16_hi a5, off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x94,0xe0,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_short_d16_hi a5, off, s[8:11], s3 offset:7
+
+// GFX90A: buffer_load_short_d16_hi a5, off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x94,0xe0,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_short_d16_hi a5, off, s[8:11], s3 offset:4095 glc
+
+// GFX90A: buffer_load_short_d16_hi a5, off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x96,0xe0,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_load_short_d16_hi a5, off, s[8:11], s3 offset:4095 slc
+
+// GFX90A: buffer_atomic_swap a5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x00,0xe1,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_swap a5, off, s[8:11], s3 offset:4095
+
+// GFX90A: buffer_atomic_swap a255, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x00,0xe1,0x00,0xff,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_swap a255, off, s[8:11], s3 offset:4095
+
+// GFX90A: buffer_atomic_swap a5, off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x00,0xe1,0x00,0x05,0x83,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_swap a5, off, s[12:15], s3 offset:4095
+
+// GFX90A: buffer_atomic_swap a5, off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x00,0xe1,0x00,0x05,0x98,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_swap a5, off, s[96:99], s3 offset:4095
+
+// GFX90A: buffer_atomic_swap a5, off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x00,0xe1,0x00,0x05,0x82,0x65]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_swap a5, off, s[8:11], s101 offset:4095
+
+// GFX90A: buffer_atomic_swap a5, off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x00,0xe1,0x00,0x05,0x82,0x7c]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_swap a5, off, s[8:11], m0 offset:4095
+
+// GFX90A: buffer_atomic_swap a5, off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x00,0xe1,0x00,0x05,0x82,0x80]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_swap a5, off, s[8:11], 0 offset:4095
+
+// GFX90A: buffer_atomic_swap a5, off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x00,0xe1,0x00,0x05,0x82,0xc1]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_swap a5, off, s[8:11], -1 offset:4095
+
+// GFX90A: buffer_atomic_swap a5, v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x00,0xe1,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_swap a5, v0, s[8:11], s3 idxen offset:4095
+
+// GFX90A: buffer_atomic_swap a5, v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x00,0xe1,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_swap a5, v0, s[8:11], s3 offen offset:4095
+
+// GFX90A: buffer_atomic_swap a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x00,0xe1,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_swap a5, off, s[8:11], s3
+
+// GFX90A: buffer_atomic_swap a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x00,0xe1,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_swap a5, off, s[8:11], s3
+
+// GFX90A: buffer_atomic_swap a5, off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x00,0xe1,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_swap a5, off, s[8:11], s3 offset:7
+
+// GFX90A: buffer_atomic_swap a5, off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x00,0xe1,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_swap a5, off, s[8:11], s3 offset:4095 glc
+
+// GFX90A: buffer_atomic_swap a5, off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x02,0xe1,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_swap a5, off, s[8:11], s3 offset:4095 slc
+
+// GFX90A: buffer_atomic_cmpswap a[6:7], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x04,0xe1,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_cmpswap a[6:7], off, s[8:11], s3 offset:4095
+
+// GFX90A: buffer_atomic_cmpswap a[254:255], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x04,0xe1,0x00,0xfe,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_cmpswap a[254:255], off, s[8:11], s3 offset:4095
+
+// GFX90A: buffer_atomic_cmpswap a[6:7], off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x04,0xe1,0x00,0x06,0x83,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_cmpswap a[6:7], off, s[12:15], s3 offset:4095
+
+// GFX90A: buffer_atomic_cmpswap a[6:7], off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x04,0xe1,0x00,0x06,0x98,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_cmpswap a[6:7], off, s[96:99], s3 offset:4095
+
+// GFX90A: buffer_atomic_cmpswap a[6:7], off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x04,0xe1,0x00,0x06,0x82,0x65]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_cmpswap a[6:7], off, s[8:11], s101 offset:4095
+
+// GFX90A: buffer_atomic_cmpswap a[6:7], off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x04,0xe1,0x00,0x06,0x82,0x7c]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_cmpswap a[6:7], off, s[8:11], m0 offset:4095
+
+// GFX90A: buffer_atomic_cmpswap a[6:7], off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x04,0xe1,0x00,0x06,0x82,0x80]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_cmpswap a[6:7], off, s[8:11], 0 offset:4095
+
+// GFX90A: buffer_atomic_cmpswap a[6:7], off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x04,0xe1,0x00,0x06,0x82,0xc1]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_cmpswap a[6:7], off, s[8:11], -1 offset:4095
+
+// GFX90A: buffer_atomic_cmpswap a[6:7], v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x04,0xe1,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_cmpswap a[6:7], v0, s[8:11], s3 idxen offset:4095
+
+// GFX90A: buffer_atomic_cmpswap a[6:7], v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x04,0xe1,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_cmpswap a[6:7], v0, s[8:11], s3 offen offset:4095
+
+// GFX90A: buffer_atomic_cmpswap a[6:7], off, s[8:11], s3 ; encoding: [0x00,0x00,0x04,0xe1,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_cmpswap a[6:7], off, s[8:11], s3
+
+// GFX90A: buffer_atomic_cmpswap a[6:7], off, s[8:11], s3 ; encoding: [0x00,0x00,0x04,0xe1,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_cmpswap a[6:7], off, s[8:11], s3
+
+// GFX90A: buffer_atomic_cmpswap a[6:7], off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x04,0xe1,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_cmpswap a[6:7], off, s[8:11], s3 offset:7
+
+// GFX90A: buffer_atomic_cmpswap a[6:7], off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x04,0xe1,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_cmpswap a[6:7], off, s[8:11], s3 offset:4095 glc
+
+// GFX90A: buffer_atomic_cmpswap a[6:7], off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x06,0xe1,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_cmpswap a[6:7], off, s[8:11], s3 offset:4095 slc
+
+// GFX90A: buffer_atomic_add a5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x08,0xe1,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_add a5, off, s[8:11], s3 offset:4095
+
+// GFX90A: buffer_atomic_add a255, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x08,0xe1,0x00,0xff,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_add a255, off, s[8:11], s3 offset:4095
+
+// GFX90A: buffer_atomic_add a5, off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x08,0xe1,0x00,0x05,0x83,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_add a5, off, s[12:15], s3 offset:4095
+
+// GFX90A: buffer_atomic_add a5, off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x08,0xe1,0x00,0x05,0x98,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_add a5, off, s[96:99], s3 offset:4095
+
+// GFX90A: buffer_atomic_add a5, off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x08,0xe1,0x00,0x05,0x82,0x65]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_add a5, off, s[8:11], s101 offset:4095
+
+// GFX90A: buffer_atomic_add a5, off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x08,0xe1,0x00,0x05,0x82,0x7c]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_add a5, off, s[8:11], m0 offset:4095
+
+// GFX90A: buffer_atomic_add a5, off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x08,0xe1,0x00,0x05,0x82,0x80]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_add a5, off, s[8:11], 0 offset:4095
+
+// GFX90A: buffer_atomic_add a5, off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x08,0xe1,0x00,0x05,0x82,0xc1]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_add a5, off, s[8:11], -1 offset:4095
+
+// GFX90A: buffer_atomic_add a5, v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x08,0xe1,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_add a5, v0, s[8:11], s3 idxen offset:4095
+
+// GFX90A: buffer_atomic_add a5, v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x08,0xe1,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_add a5, v0, s[8:11], s3 offen offset:4095
+
+// GFX90A: buffer_atomic_add a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x08,0xe1,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_add a5, off, s[8:11], s3
+
+// GFX90A: buffer_atomic_add a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x08,0xe1,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_add a5, off, s[8:11], s3
+
+// GFX90A: buffer_atomic_add a5, off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x08,0xe1,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_add a5, off, s[8:11], s3 offset:7
+
+// GFX90A: buffer_atomic_add a5, off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x08,0xe1,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_add a5, off, s[8:11], s3 offset:4095 glc
+
+// GFX90A: buffer_atomic_add a5, off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x0a,0xe1,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_add a5, off, s[8:11], s3 offset:4095 slc
+
+// GFX90A: buffer_atomic_sub a5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x0c,0xe1,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_sub a5, off, s[8:11], s3 offset:4095
+
+// GFX90A: buffer_atomic_sub a255, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x0c,0xe1,0x00,0xff,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_sub a255, off, s[8:11], s3 offset:4095
+
+// GFX90A: buffer_atomic_sub a5, off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x0c,0xe1,0x00,0x05,0x83,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_sub a5, off, s[12:15], s3 offset:4095
+
+// GFX90A: buffer_atomic_sub a5, off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x0c,0xe1,0x00,0x05,0x98,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_sub a5, off, s[96:99], s3 offset:4095
+
+// GFX90A: buffer_atomic_sub a5, off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x0c,0xe1,0x00,0x05,0x82,0x65]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_sub a5, off, s[8:11], s101 offset:4095
+
+// GFX90A: buffer_atomic_sub a5, off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x0c,0xe1,0x00,0x05,0x82,0x7c]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_sub a5, off, s[8:11], m0 offset:4095
+
+// GFX90A: buffer_atomic_sub a5, off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x0c,0xe1,0x00,0x05,0x82,0x80]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_sub a5, off, s[8:11], 0 offset:4095
+
+// GFX90A: buffer_atomic_sub a5, off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x0c,0xe1,0x00,0x05,0x82,0xc1]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_sub a5, off, s[8:11], -1 offset:4095
+
+// GFX90A: buffer_atomic_sub a5, v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x0c,0xe1,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_sub a5, v0, s[8:11], s3 idxen offset:4095
+
+// GFX90A: buffer_atomic_sub a5, v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x0c,0xe1,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_sub a5, v0, s[8:11], s3 offen offset:4095
+
+// GFX90A: buffer_atomic_sub a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x0c,0xe1,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_sub a5, off, s[8:11], s3
+
+// GFX90A: buffer_atomic_sub a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x0c,0xe1,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_sub a5, off, s[8:11], s3
+
+// GFX90A: buffer_atomic_sub a5, off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x0c,0xe1,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_sub a5, off, s[8:11], s3 offset:7
+
+// GFX90A: buffer_atomic_sub a5, off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x0c,0xe1,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_sub a5, off, s[8:11], s3 offset:4095 glc
+
+// GFX90A: buffer_atomic_sub a5, off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x0e,0xe1,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_sub a5, off, s[8:11], s3 offset:4095 slc
+
+// GFX90A: buffer_atomic_smin a5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x10,0xe1,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_smin a5, off, s[8:11], s3 offset:4095
+
+// GFX90A: buffer_atomic_smin a255, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x10,0xe1,0x00,0xff,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_smin a255, off, s[8:11], s3 offset:4095
+
+// GFX90A: buffer_atomic_smin a5, off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x10,0xe1,0x00,0x05,0x83,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_smin a5, off, s[12:15], s3 offset:4095
+
+// GFX90A: buffer_atomic_smin a5, off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x10,0xe1,0x00,0x05,0x98,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_smin a5, off, s[96:99], s3 offset:4095
+
+// GFX90A: buffer_atomic_smin a5, off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x10,0xe1,0x00,0x05,0x82,0x65]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_smin a5, off, s[8:11], s101 offset:4095
+
+// GFX90A: buffer_atomic_smin a5, off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x10,0xe1,0x00,0x05,0x82,0x7c]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_smin a5, off, s[8:11], m0 offset:4095
+
+// GFX90A: buffer_atomic_smin a5, off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x10,0xe1,0x00,0x05,0x82,0x80]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_smin a5, off, s[8:11], 0 offset:4095
+
+// GFX90A: buffer_atomic_smin a5, off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x10,0xe1,0x00,0x05,0x82,0xc1]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_smin a5, off, s[8:11], -1 offset:4095
+
+// GFX90A: buffer_atomic_smin a5, v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x10,0xe1,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_smin a5, v0, s[8:11], s3 idxen offset:4095
+
+// GFX90A: buffer_atomic_smin a5, v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x10,0xe1,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_smin a5, v0, s[8:11], s3 offen offset:4095
+
+// GFX90A: buffer_atomic_smin a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x10,0xe1,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_smin a5, off, s[8:11], s3
+
+// GFX90A: buffer_atomic_smin a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x10,0xe1,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_smin a5, off, s[8:11], s3
+
+// GFX90A: buffer_atomic_smin a5, off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x10,0xe1,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_smin a5, off, s[8:11], s3 offset:7
+
+// GFX90A: buffer_atomic_smin a5, off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x10,0xe1,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_smin a5, off, s[8:11], s3 offset:4095 glc
+
+// GFX90A: buffer_atomic_smin a5, off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x12,0xe1,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_smin a5, off, s[8:11], s3 offset:4095 slc
+
+// GFX90A: buffer_atomic_umin a5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x14,0xe1,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_umin a5, off, s[8:11], s3 offset:4095
+
+// GFX90A: buffer_atomic_umin a255, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x14,0xe1,0x00,0xff,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_umin a255, off, s[8:11], s3 offset:4095
+
+// GFX90A: buffer_atomic_umin a5, off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x14,0xe1,0x00,0x05,0x83,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_umin a5, off, s[12:15], s3 offset:4095
+
+// GFX90A: buffer_atomic_umin a5, off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x14,0xe1,0x00,0x05,0x98,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_umin a5, off, s[96:99], s3 offset:4095
+
+// GFX90A: buffer_atomic_umin a5, off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x14,0xe1,0x00,0x05,0x82,0x65]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_umin a5, off, s[8:11], s101 offset:4095
+
+// GFX90A: buffer_atomic_umin a5, off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x14,0xe1,0x00,0x05,0x82,0x7c]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_umin a5, off, s[8:11], m0 offset:4095
+
+// GFX90A: buffer_atomic_umin a5, off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x14,0xe1,0x00,0x05,0x82,0x80]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_umin a5, off, s[8:11], 0 offset:4095
+
+// GFX90A: buffer_atomic_umin a5, off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x14,0xe1,0x00,0x05,0x82,0xc1]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_umin a5, off, s[8:11], -1 offset:4095
+
+// GFX90A: buffer_atomic_umin a5, v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x14,0xe1,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_umin a5, v0, s[8:11], s3 idxen offset:4095
+
+// GFX90A: buffer_atomic_umin a5, v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x14,0xe1,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_umin a5, v0, s[8:11], s3 offen offset:4095
+
+// GFX90A: buffer_atomic_umin a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x14,0xe1,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_umin a5, off, s[8:11], s3
+
+// GFX90A: buffer_atomic_umin a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x14,0xe1,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_umin a5, off, s[8:11], s3
+
+// GFX90A: buffer_atomic_umin a5, off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x14,0xe1,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_umin a5, off, s[8:11], s3 offset:7
+
+// GFX90A: buffer_atomic_umin a5, off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x14,0xe1,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_umin a5, off, s[8:11], s3 offset:4095 glc
+
+// GFX90A: buffer_atomic_umin a5, off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x16,0xe1,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_umin a5, off, s[8:11], s3 offset:4095 slc
+
+// GFX90A: buffer_atomic_smax a5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x18,0xe1,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_smax a5, off, s[8:11], s3 offset:4095
+
+// GFX90A: buffer_atomic_smax a255, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x18,0xe1,0x00,0xff,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_smax a255, off, s[8:11], s3 offset:4095
+
+// GFX90A: buffer_atomic_smax a5, off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x18,0xe1,0x00,0x05,0x83,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_smax a5, off, s[12:15], s3 offset:4095
+
+// GFX90A: buffer_atomic_smax a5, off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x18,0xe1,0x00,0x05,0x98,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_smax a5, off, s[96:99], s3 offset:4095
+
+// GFX90A: buffer_atomic_smax a5, off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x18,0xe1,0x00,0x05,0x82,0x65]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_smax a5, off, s[8:11], s101 offset:4095
+
+// GFX90A: buffer_atomic_smax a5, off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x18,0xe1,0x00,0x05,0x82,0x7c]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_smax a5, off, s[8:11], m0 offset:4095
+
+// GFX90A: buffer_atomic_smax a5, off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x18,0xe1,0x00,0x05,0x82,0x80]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_smax a5, off, s[8:11], 0 offset:4095
+
+// GFX90A: buffer_atomic_smax a5, off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x18,0xe1,0x00,0x05,0x82,0xc1]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_smax a5, off, s[8:11], -1 offset:4095
+
+// GFX90A: buffer_atomic_smax a5, v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x18,0xe1,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_smax a5, v0, s[8:11], s3 idxen offset:4095
+
+// GFX90A: buffer_atomic_smax a5, v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x18,0xe1,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_smax a5, v0, s[8:11], s3 offen offset:4095
+
+// GFX90A: buffer_atomic_smax a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x18,0xe1,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_smax a5, off, s[8:11], s3
+
+// GFX90A: buffer_atomic_smax a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x18,0xe1,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_smax a5, off, s[8:11], s3
+
+// GFX90A: buffer_atomic_smax a5, off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x18,0xe1,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_smax a5, off, s[8:11], s3 offset:7
+
+// GFX90A: buffer_atomic_smax a5, off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x18,0xe1,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_smax a5, off, s[8:11], s3 offset:4095 glc
+
+// GFX90A: buffer_atomic_smax a5, off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x1a,0xe1,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_smax a5, off, s[8:11], s3 offset:4095 slc
+
+// GFX90A: buffer_atomic_umax a5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x1c,0xe1,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_umax a5, off, s[8:11], s3 offset:4095
+
+// GFX90A: buffer_atomic_umax a255, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x1c,0xe1,0x00,0xff,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_umax a255, off, s[8:11], s3 offset:4095
+
+// GFX90A: buffer_atomic_umax a5, off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x1c,0xe1,0x00,0x05,0x83,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_umax a5, off, s[12:15], s3 offset:4095
+
+// GFX90A: buffer_atomic_umax a5, off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x1c,0xe1,0x00,0x05,0x98,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_umax a5, off, s[96:99], s3 offset:4095
+
+// GFX90A: buffer_atomic_umax a5, off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x1c,0xe1,0x00,0x05,0x82,0x65]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_umax a5, off, s[8:11], s101 offset:4095
+
+// GFX90A: buffer_atomic_umax a5, off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x1c,0xe1,0x00,0x05,0x82,0x7c]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_umax a5, off, s[8:11], m0 offset:4095
+
+// GFX90A: buffer_atomic_umax a5, off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x1c,0xe1,0x00,0x05,0x82,0x80]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_umax a5, off, s[8:11], 0 offset:4095
+
+// GFX90A: buffer_atomic_umax a5, off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x1c,0xe1,0x00,0x05,0x82,0xc1]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_umax a5, off, s[8:11], -1 offset:4095
+
+// GFX90A: buffer_atomic_umax a5, v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x1c,0xe1,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_umax a5, v0, s[8:11], s3 idxen offset:4095
+
+// GFX90A: buffer_atomic_umax a5, v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x1c,0xe1,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_umax a5, v0, s[8:11], s3 offen offset:4095
+
+// GFX90A: buffer_atomic_umax a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x1c,0xe1,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_umax a5, off, s[8:11], s3
+
+// GFX90A: buffer_atomic_umax a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x1c,0xe1,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_umax a5, off, s[8:11], s3
+
+// GFX90A: buffer_atomic_umax a5, off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x1c,0xe1,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_umax a5, off, s[8:11], s3 offset:7
+
+// GFX90A: buffer_atomic_umax a5, off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x1c,0xe1,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_umax a5, off, s[8:11], s3 offset:4095 glc
+
+// GFX90A: buffer_atomic_umax a5, off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x1e,0xe1,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_umax a5, off, s[8:11], s3 offset:4095 slc
+
+// GFX90A: buffer_atomic_and a5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x20,0xe1,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_and a5, off, s[8:11], s3 offset:4095
+
+// GFX90A: buffer_atomic_and a255, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x20,0xe1,0x00,0xff,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_and a255, off, s[8:11], s3 offset:4095
+
+// GFX90A: buffer_atomic_and a5, off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x20,0xe1,0x00,0x05,0x83,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_and a5, off, s[12:15], s3 offset:4095
+
+// GFX90A: buffer_atomic_and a5, off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x20,0xe1,0x00,0x05,0x98,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_and a5, off, s[96:99], s3 offset:4095
+
+// GFX90A: buffer_atomic_and a5, off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x20,0xe1,0x00,0x05,0x82,0x65]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_and a5, off, s[8:11], s101 offset:4095
+
+// GFX90A: buffer_atomic_and a5, off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x20,0xe1,0x00,0x05,0x82,0x7c]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_and a5, off, s[8:11], m0 offset:4095
+
+// GFX90A: buffer_atomic_and a5, off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x20,0xe1,0x00,0x05,0x82,0x80]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_and a5, off, s[8:11], 0 offset:4095
+
+// GFX90A: buffer_atomic_and a5, off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x20,0xe1,0x00,0x05,0x82,0xc1]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_and a5, off, s[8:11], -1 offset:4095
+
+// GFX90A: buffer_atomic_and a5, v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x20,0xe1,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_and a5, v0, s[8:11], s3 idxen offset:4095
+
+// GFX90A: buffer_atomic_and a5, v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x20,0xe1,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_and a5, v0, s[8:11], s3 offen offset:4095
+
+// GFX90A: buffer_atomic_and a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x20,0xe1,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_and a5, off, s[8:11], s3
+
+// GFX90A: buffer_atomic_and a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x20,0xe1,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_and a5, off, s[8:11], s3
+
+// GFX90A: buffer_atomic_and a5, off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x20,0xe1,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_and a5, off, s[8:11], s3 offset:7
+
+// GFX90A: buffer_atomic_and a5, off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x20,0xe1,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_and a5, off, s[8:11], s3 offset:4095 glc
+
+// GFX90A: buffer_atomic_and a5, off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x22,0xe1,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_and a5, off, s[8:11], s3 offset:4095 slc
+
+// GFX90A: buffer_atomic_or a5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x24,0xe1,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_or a5, off, s[8:11], s3 offset:4095
+
+// GFX90A: buffer_atomic_or a255, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x24,0xe1,0x00,0xff,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_or a255, off, s[8:11], s3 offset:4095
+
+// GFX90A: buffer_atomic_or a5, off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x24,0xe1,0x00,0x05,0x83,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_or a5, off, s[12:15], s3 offset:4095
+
+// GFX90A: buffer_atomic_or a5, off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x24,0xe1,0x00,0x05,0x98,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_or a5, off, s[96:99], s3 offset:4095
+
+// GFX90A: buffer_atomic_or a5, off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x24,0xe1,0x00,0x05,0x82,0x65]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_or a5, off, s[8:11], s101 offset:4095
+
+// GFX90A: buffer_atomic_or a5, off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x24,0xe1,0x00,0x05,0x82,0x7c]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_or a5, off, s[8:11], m0 offset:4095
+
+// GFX90A: buffer_atomic_or a5, off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x24,0xe1,0x00,0x05,0x82,0x80]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_or a5, off, s[8:11], 0 offset:4095
+
+// GFX90A: buffer_atomic_or a5, off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x24,0xe1,0x00,0x05,0x82,0xc1]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_or a5, off, s[8:11], -1 offset:4095
+
+// GFX90A: buffer_atomic_or a5, v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x24,0xe1,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_or a5, v0, s[8:11], s3 idxen offset:4095
+
+// GFX90A: buffer_atomic_or a5, v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x24,0xe1,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_or a5, v0, s[8:11], s3 offen offset:4095
+
+// GFX90A: buffer_atomic_or a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x24,0xe1,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_or a5, off, s[8:11], s3
+
+// GFX90A: buffer_atomic_or a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x24,0xe1,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_or a5, off, s[8:11], s3
+
+// GFX90A: buffer_atomic_or a5, off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x24,0xe1,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_or a5, off, s[8:11], s3 offset:7
+
+// GFX90A: buffer_atomic_or a5, off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x24,0xe1,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_or a5, off, s[8:11], s3 offset:4095 glc
+
+// GFX90A: buffer_atomic_or a5, off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x26,0xe1,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_or a5, off, s[8:11], s3 offset:4095 slc
+
+// GFX90A: buffer_atomic_xor a5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x28,0xe1,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_xor a5, off, s[8:11], s3 offset:4095
+
+// GFX90A: buffer_atomic_xor a255, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x28,0xe1,0x00,0xff,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_xor a255, off, s[8:11], s3 offset:4095
+
+// GFX90A: buffer_atomic_xor a5, off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x28,0xe1,0x00,0x05,0x83,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_xor a5, off, s[12:15], s3 offset:4095
+
+// GFX90A: buffer_atomic_xor a5, off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x28,0xe1,0x00,0x05,0x98,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_xor a5, off, s[96:99], s3 offset:4095
+
+// GFX90A: buffer_atomic_xor a5, off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x28,0xe1,0x00,0x05,0x82,0x65]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_xor a5, off, s[8:11], s101 offset:4095
+
+// GFX90A: buffer_atomic_xor a5, off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x28,0xe1,0x00,0x05,0x82,0x7c]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_xor a5, off, s[8:11], m0 offset:4095
+
+// GFX90A: buffer_atomic_xor a5, off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x28,0xe1,0x00,0x05,0x82,0x80]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_xor a5, off, s[8:11], 0 offset:4095
+
+// GFX90A: buffer_atomic_xor a5, off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x28,0xe1,0x00,0x05,0x82,0xc1]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_xor a5, off, s[8:11], -1 offset:4095
+
+// GFX90A: buffer_atomic_xor a5, v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x28,0xe1,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_xor a5, v0, s[8:11], s3 idxen offset:4095
+
+// GFX90A: buffer_atomic_xor a5, v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x28,0xe1,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_xor a5, v0, s[8:11], s3 offen offset:4095
+
+// GFX90A: buffer_atomic_xor a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x28,0xe1,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_xor a5, off, s[8:11], s3
+
+// GFX90A: buffer_atomic_xor a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x28,0xe1,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_xor a5, off, s[8:11], s3
+
+// GFX90A: buffer_atomic_xor a5, off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x28,0xe1,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_xor a5, off, s[8:11], s3 offset:7
+
+// GFX90A: buffer_atomic_xor a5, off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x28,0xe1,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_xor a5, off, s[8:11], s3 offset:4095 glc
+
+// GFX90A: buffer_atomic_xor a5, off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x2a,0xe1,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_xor a5, off, s[8:11], s3 offset:4095 slc
+
+// GFX90A: buffer_atomic_inc a5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x2c,0xe1,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_inc a5, off, s[8:11], s3 offset:4095
+
+// GFX90A: buffer_atomic_inc a255, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x2c,0xe1,0x00,0xff,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_inc a255, off, s[8:11], s3 offset:4095
+
+// GFX90A: buffer_atomic_inc a5, off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x2c,0xe1,0x00,0x05,0x83,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_inc a5, off, s[12:15], s3 offset:4095
+
+// GFX90A: buffer_atomic_inc a5, off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x2c,0xe1,0x00,0x05,0x98,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_inc a5, off, s[96:99], s3 offset:4095
+
+// GFX90A: buffer_atomic_inc a5, off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x2c,0xe1,0x00,0x05,0x82,0x65]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_inc a5, off, s[8:11], s101 offset:4095
+
+// GFX90A: buffer_atomic_inc a5, off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x2c,0xe1,0x00,0x05,0x82,0x7c]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_inc a5, off, s[8:11], m0 offset:4095
+
+// GFX90A: buffer_atomic_inc a5, off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x2c,0xe1,0x00,0x05,0x82,0x80]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_inc a5, off, s[8:11], 0 offset:4095
+
+// GFX90A: buffer_atomic_inc a5, off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x2c,0xe1,0x00,0x05,0x82,0xc1]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_inc a5, off, s[8:11], -1 offset:4095
+
+// GFX90A: buffer_atomic_inc a5, v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x2c,0xe1,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_inc a5, v0, s[8:11], s3 idxen offset:4095
+
+// GFX90A: buffer_atomic_inc a5, v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x2c,0xe1,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_inc a5, v0, s[8:11], s3 offen offset:4095
+
+// GFX90A: buffer_atomic_inc a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x2c,0xe1,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_inc a5, off, s[8:11], s3
+
+// GFX90A: buffer_atomic_inc a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x2c,0xe1,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_inc a5, off, s[8:11], s3
+
+// GFX90A: buffer_atomic_inc a5, off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x2c,0xe1,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_inc a5, off, s[8:11], s3 offset:7
+
+// GFX90A: buffer_atomic_inc a5, off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x2c,0xe1,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_inc a5, off, s[8:11], s3 offset:4095 glc
+
+// GFX90A: buffer_atomic_inc a5, off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x2e,0xe1,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_inc a5, off, s[8:11], s3 offset:4095 slc
+
+// GFX90A: buffer_atomic_dec a5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x30,0xe1,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_dec a5, off, s[8:11], s3 offset:4095
+
+// GFX90A: buffer_atomic_dec a255, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x30,0xe1,0x00,0xff,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_dec a255, off, s[8:11], s3 offset:4095
+
+// GFX90A: buffer_atomic_dec a5, off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x30,0xe1,0x00,0x05,0x83,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_dec a5, off, s[12:15], s3 offset:4095
+
+// GFX90A: buffer_atomic_dec a5, off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x30,0xe1,0x00,0x05,0x98,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_dec a5, off, s[96:99], s3 offset:4095
+
+// GFX90A: buffer_atomic_dec a5, off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x30,0xe1,0x00,0x05,0x82,0x65]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_dec a5, off, s[8:11], s101 offset:4095
+
+// GFX90A: buffer_atomic_dec a5, off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x30,0xe1,0x00,0x05,0x82,0x7c]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_dec a5, off, s[8:11], m0 offset:4095
+
+// GFX90A: buffer_atomic_dec a5, off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x30,0xe1,0x00,0x05,0x82,0x80]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_dec a5, off, s[8:11], 0 offset:4095
+
+// GFX90A: buffer_atomic_dec a5, off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x30,0xe1,0x00,0x05,0x82,0xc1]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_dec a5, off, s[8:11], -1 offset:4095
+
+// GFX90A: buffer_atomic_dec a5, v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x30,0xe1,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_dec a5, v0, s[8:11], s3 idxen offset:4095
+
+// GFX90A: buffer_atomic_dec a5, v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x30,0xe1,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_dec a5, v0, s[8:11], s3 offen offset:4095
+
+// GFX90A: buffer_atomic_dec a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x30,0xe1,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_dec a5, off, s[8:11], s3
+
+// GFX90A: buffer_atomic_dec a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x30,0xe1,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_dec a5, off, s[8:11], s3
+
+// GFX90A: buffer_atomic_dec a5, off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x30,0xe1,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_dec a5, off, s[8:11], s3 offset:7
+
+// GFX90A: buffer_atomic_dec a5, off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x30,0xe1,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_dec a5, off, s[8:11], s3 offset:4095 glc
+
+// GFX90A: buffer_atomic_dec a5, off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x32,0xe1,0x00,0x05,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_dec a5, off, s[8:11], s3 offset:4095 slc
+
+// GFX90A: buffer_atomic_swap_x2 a[6:7], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x80,0xe1,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_swap_x2 a[6:7], off, s[8:11], s3 offset:4095
+
+// GFX90A: buffer_atomic_swap_x2 a[254:255], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x80,0xe1,0x00,0xfe,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_swap_x2 a[254:255], off, s[8:11], s3 offset:4095
+
+// GFX90A: buffer_atomic_swap_x2 a[6:7], off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x80,0xe1,0x00,0x06,0x83,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_swap_x2 a[6:7], off, s[12:15], s3 offset:4095
+
+// GFX90A: buffer_atomic_swap_x2 a[6:7], off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x80,0xe1,0x00,0x06,0x98,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_swap_x2 a[6:7], off, s[96:99], s3 offset:4095
+
+// GFX90A: buffer_atomic_swap_x2 a[6:7], off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x80,0xe1,0x00,0x06,0x82,0x65]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_swap_x2 a[6:7], off, s[8:11], s101 offset:4095
+
+// GFX90A: buffer_atomic_swap_x2 a[6:7], off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x80,0xe1,0x00,0x06,0x82,0x7c]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_swap_x2 a[6:7], off, s[8:11], m0 offset:4095
+
+// GFX90A: buffer_atomic_swap_x2 a[6:7], off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x80,0xe1,0x00,0x06,0x82,0x80]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_swap_x2 a[6:7], off, s[8:11], 0 offset:4095
+
+// GFX90A: buffer_atomic_swap_x2 a[6:7], off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x80,0xe1,0x00,0x06,0x82,0xc1]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_swap_x2 a[6:7], off, s[8:11], -1 offset:4095
+
+// GFX90A: buffer_atomic_swap_x2 a[6:7], v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x80,0xe1,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_swap_x2 a[6:7], v0, s[8:11], s3 idxen offset:4095
+
+// GFX90A: buffer_atomic_swap_x2 a[6:7], v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x80,0xe1,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_swap_x2 a[6:7], v0, s[8:11], s3 offen offset:4095
+
+// GFX90A: buffer_atomic_swap_x2 a[6:7], off, s[8:11], s3 ; encoding: [0x00,0x00,0x80,0xe1,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_swap_x2 a[6:7], off, s[8:11], s3
+
+// GFX90A: buffer_atomic_swap_x2 a[6:7], off, s[8:11], s3 ; encoding: [0x00,0x00,0x80,0xe1,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_swap_x2 a[6:7], off, s[8:11], s3
+
+// GFX90A: buffer_atomic_swap_x2 a[6:7], off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x80,0xe1,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_swap_x2 a[6:7], off, s[8:11], s3 offset:7
+
+// GFX90A: buffer_atomic_swap_x2 a[6:7], off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x80,0xe1,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_swap_x2 a[6:7], off, s[8:11], s3 offset:4095 glc
+
+// GFX90A: buffer_atomic_swap_x2 a[6:7], off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x82,0xe1,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_swap_x2 a[6:7], off, s[8:11], s3 offset:4095 slc
+
+// GFX90A: buffer_atomic_cmpswap_x2 a[6:9], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x84,0xe1,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_cmpswap_x2 a[6:9], off, s[8:11], s3 offset:4095
+
+// GFX90A: buffer_atomic_cmpswap_x2 a[252:255], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x84,0xe1,0x00,0xfc,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_cmpswap_x2 a[252:255], off, s[8:11], s3 offset:4095
+
+// GFX90A: buffer_atomic_cmpswap_x2 a[6:9], off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x84,0xe1,0x00,0x06,0x83,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_cmpswap_x2 a[6:9], off, s[12:15], s3 offset:4095
+
+// GFX90A: buffer_atomic_cmpswap_x2 a[6:9], off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x84,0xe1,0x00,0x06,0x98,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_cmpswap_x2 a[6:9], off, s[96:99], s3 offset:4095
+
+// GFX90A: buffer_atomic_cmpswap_x2 a[6:9], off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x84,0xe1,0x00,0x06,0x82,0x65]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_cmpswap_x2 a[6:9], off, s[8:11], s101 offset:4095
+
+// GFX90A: buffer_atomic_cmpswap_x2 a[6:9], off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x84,0xe1,0x00,0x06,0x82,0x7c]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_cmpswap_x2 a[6:9], off, s[8:11], m0 offset:4095
+
+// GFX90A: buffer_atomic_cmpswap_x2 a[6:9], off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x84,0xe1,0x00,0x06,0x82,0x80]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_cmpswap_x2 a[6:9], off, s[8:11], 0 offset:4095
+
+// GFX90A: buffer_atomic_cmpswap_x2 a[6:9], off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x84,0xe1,0x00,0x06,0x82,0xc1]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_cmpswap_x2 a[6:9], off, s[8:11], -1 offset:4095
+
+// GFX90A: buffer_atomic_cmpswap_x2 a[6:9], v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x84,0xe1,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_cmpswap_x2 a[6:9], v0, s[8:11], s3 idxen offset:4095
+
+// GFX90A: buffer_atomic_cmpswap_x2 a[6:9], v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x84,0xe1,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_cmpswap_x2 a[6:9], v0, s[8:11], s3 offen offset:4095
+
+// GFX90A: buffer_atomic_cmpswap_x2 a[6:9], off, s[8:11], s3 ; encoding: [0x00,0x00,0x84,0xe1,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_cmpswap_x2 a[6:9], off, s[8:11], s3
+
+// GFX90A: buffer_atomic_cmpswap_x2 a[6:9], off, s[8:11], s3 ; encoding: [0x00,0x00,0x84,0xe1,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_cmpswap_x2 a[6:9], off, s[8:11], s3
+
+// GFX90A: buffer_atomic_cmpswap_x2 a[6:9], off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x84,0xe1,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_cmpswap_x2 a[6:9], off, s[8:11], s3 offset:7
+
+// GFX90A: buffer_atomic_cmpswap_x2 a[6:9], off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x84,0xe1,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_cmpswap_x2 a[6:9], off, s[8:11], s3 offset:4095 glc
+
+// GFX90A: buffer_atomic_cmpswap_x2 a[6:9], off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x86,0xe1,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_cmpswap_x2 a[6:9], off, s[8:11], s3 offset:4095 slc
+
+// GFX90A: buffer_atomic_add_x2 a[6:7], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x88,0xe1,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_add_x2 a[6:7], off, s[8:11], s3 offset:4095
+
+// GFX90A: buffer_atomic_add_x2 a[254:255], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x88,0xe1,0x00,0xfe,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_add_x2 a[254:255], off, s[8:11], s3 offset:4095
+
+// GFX90A: buffer_atomic_add_x2 a[6:7], off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x88,0xe1,0x00,0x06,0x83,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_add_x2 a[6:7], off, s[12:15], s3 offset:4095
+
+// GFX90A: buffer_atomic_add_x2 a[6:7], off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x88,0xe1,0x00,0x06,0x98,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_add_x2 a[6:7], off, s[96:99], s3 offset:4095
+
+// GFX90A: buffer_atomic_add_x2 a[6:7], off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x88,0xe1,0x00,0x06,0x82,0x65]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_add_x2 a[6:7], off, s[8:11], s101 offset:4095
+
+// GFX90A: buffer_atomic_add_x2 a[6:7], off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x88,0xe1,0x00,0x06,0x82,0x7c]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_add_x2 a[6:7], off, s[8:11], m0 offset:4095
+
+// GFX90A: buffer_atomic_add_x2 a[6:7], off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x88,0xe1,0x00,0x06,0x82,0x80]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_add_x2 a[6:7], off, s[8:11], 0 offset:4095
+
+// GFX90A: buffer_atomic_add_x2 a[6:7], off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x88,0xe1,0x00,0x06,0x82,0xc1]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_add_x2 a[6:7], off, s[8:11], -1 offset:4095
+
+// GFX90A: buffer_atomic_add_x2 a[6:7], v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x88,0xe1,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_add_x2 a[6:7], v0, s[8:11], s3 idxen offset:4095
+
+// GFX90A: buffer_atomic_add_x2 a[6:7], v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x88,0xe1,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_add_x2 a[6:7], v0, s[8:11], s3 offen offset:4095
+
+// GFX90A: buffer_atomic_add_x2 a[6:7], off, s[8:11], s3 ; encoding: [0x00,0x00,0x88,0xe1,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_add_x2 a[6:7], off, s[8:11], s3
+
+// GFX90A: buffer_atomic_add_x2 a[6:7], off, s[8:11], s3 ; encoding: [0x00,0x00,0x88,0xe1,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_add_x2 a[6:7], off, s[8:11], s3
+
+// GFX90A: buffer_atomic_add_x2 a[6:7], off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x88,0xe1,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_add_x2 a[6:7], off, s[8:11], s3 offset:7
+
+// GFX90A: buffer_atomic_add_x2 a[6:7], off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x88,0xe1,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_add_x2 a[6:7], off, s[8:11], s3 offset:4095 glc
+
+// GFX90A: buffer_atomic_add_x2 a[6:7], off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x8a,0xe1,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_add_x2 a[6:7], off, s[8:11], s3 offset:4095 slc
+
+// GFX90A: buffer_atomic_sub_x2 a[6:7], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x8c,0xe1,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_sub_x2 a[6:7], off, s[8:11], s3 offset:4095
+
+// GFX90A: buffer_atomic_sub_x2 a[254:255], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x8c,0xe1,0x00,0xfe,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_sub_x2 a[254:255], off, s[8:11], s3 offset:4095
+
+// GFX90A: buffer_atomic_sub_x2 a[6:7], off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x8c,0xe1,0x00,0x06,0x83,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_sub_x2 a[6:7], off, s[12:15], s3 offset:4095
+
+// GFX90A: buffer_atomic_sub_x2 a[6:7], off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x8c,0xe1,0x00,0x06,0x98,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_sub_x2 a[6:7], off, s[96:99], s3 offset:4095
+
+// GFX90A: buffer_atomic_sub_x2 a[6:7], off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x8c,0xe1,0x00,0x06,0x82,0x65]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_sub_x2 a[6:7], off, s[8:11], s101 offset:4095
+
+// GFX90A: buffer_atomic_sub_x2 a[6:7], off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x8c,0xe1,0x00,0x06,0x82,0x7c]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_sub_x2 a[6:7], off, s[8:11], m0 offset:4095
+
+// GFX90A: buffer_atomic_sub_x2 a[6:7], off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x8c,0xe1,0x00,0x06,0x82,0x80]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_sub_x2 a[6:7], off, s[8:11], 0 offset:4095
+
+// GFX90A: buffer_atomic_sub_x2 a[6:7], off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x8c,0xe1,0x00,0x06,0x82,0xc1]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_sub_x2 a[6:7], off, s[8:11], -1 offset:4095
+
+// GFX90A: buffer_atomic_sub_x2 a[6:7], v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x8c,0xe1,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_sub_x2 a[6:7], v0, s[8:11], s3 idxen offset:4095
+
+// GFX90A: buffer_atomic_sub_x2 a[6:7], v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x8c,0xe1,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_sub_x2 a[6:7], v0, s[8:11], s3 offen offset:4095
+
+// GFX90A: buffer_atomic_sub_x2 a[6:7], off, s[8:11], s3 ; encoding: [0x00,0x00,0x8c,0xe1,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_sub_x2 a[6:7], off, s[8:11], s3
+
+// GFX90A: buffer_atomic_sub_x2 a[6:7], off, s[8:11], s3 ; encoding: [0x00,0x00,0x8c,0xe1,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_sub_x2 a[6:7], off, s[8:11], s3
+
+// GFX90A: buffer_atomic_sub_x2 a[6:7], off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x8c,0xe1,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_sub_x2 a[6:7], off, s[8:11], s3 offset:7
+
+// GFX90A: buffer_atomic_sub_x2 a[6:7], off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x8c,0xe1,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_sub_x2 a[6:7], off, s[8:11], s3 offset:4095 glc
+
+// GFX90A: buffer_atomic_sub_x2 a[6:7], off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x8e,0xe1,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_sub_x2 a[6:7], off, s[8:11], s3 offset:4095 slc
+
+// GFX90A: buffer_atomic_smin_x2 a[6:7], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x90,0xe1,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_smin_x2 a[6:7], off, s[8:11], s3 offset:4095
+
+// GFX90A: buffer_atomic_smin_x2 a[254:255], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x90,0xe1,0x00,0xfe,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_smin_x2 a[254:255], off, s[8:11], s3 offset:4095
+
+// GFX90A: buffer_atomic_smin_x2 a[6:7], off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x90,0xe1,0x00,0x06,0x83,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_smin_x2 a[6:7], off, s[12:15], s3 offset:4095
+
+// GFX90A: buffer_atomic_smin_x2 a[6:7], off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x90,0xe1,0x00,0x06,0x98,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_smin_x2 a[6:7], off, s[96:99], s3 offset:4095
+
+// GFX90A: buffer_atomic_smin_x2 a[6:7], off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x90,0xe1,0x00,0x06,0x82,0x65]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_smin_x2 a[6:7], off, s[8:11], s101 offset:4095
+
+// GFX90A: buffer_atomic_smin_x2 a[6:7], off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x90,0xe1,0x00,0x06,0x82,0x7c]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_smin_x2 a[6:7], off, s[8:11], m0 offset:4095
+
+// GFX90A: buffer_atomic_smin_x2 a[6:7], off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x90,0xe1,0x00,0x06,0x82,0x80]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_smin_x2 a[6:7], off, s[8:11], 0 offset:4095
+
+// GFX90A: buffer_atomic_smin_x2 a[6:7], off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x90,0xe1,0x00,0x06,0x82,0xc1]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_smin_x2 a[6:7], off, s[8:11], -1 offset:4095
+
+// GFX90A: buffer_atomic_smin_x2 a[6:7], v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x90,0xe1,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_smin_x2 a[6:7], v0, s[8:11], s3 idxen offset:4095
+
+// GFX90A: buffer_atomic_smin_x2 a[6:7], v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x90,0xe1,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_smin_x2 a[6:7], v0, s[8:11], s3 offen offset:4095
+
+// GFX90A: buffer_atomic_smin_x2 a[6:7], off, s[8:11], s3 ; encoding: [0x00,0x00,0x90,0xe1,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_smin_x2 a[6:7], off, s[8:11], s3
+
+// GFX90A: buffer_atomic_smin_x2 a[6:7], off, s[8:11], s3 ; encoding: [0x00,0x00,0x90,0xe1,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_smin_x2 a[6:7], off, s[8:11], s3
+
+// GFX90A: buffer_atomic_smin_x2 a[6:7], off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x90,0xe1,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_smin_x2 a[6:7], off, s[8:11], s3 offset:7
+
+// GFX90A: buffer_atomic_smin_x2 a[6:7], off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x90,0xe1,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_smin_x2 a[6:7], off, s[8:11], s3 offset:4095 glc
+
+// GFX90A: buffer_atomic_smin_x2 a[6:7], off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x92,0xe1,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_smin_x2 a[6:7], off, s[8:11], s3 offset:4095 slc
+
+// GFX90A: buffer_atomic_umin_x2 a[6:7], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x94,0xe1,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_umin_x2 a[6:7], off, s[8:11], s3 offset:4095
+
+// GFX90A: buffer_atomic_umin_x2 a[254:255], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x94,0xe1,0x00,0xfe,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_umin_x2 a[254:255], off, s[8:11], s3 offset:4095
+
+// GFX90A: buffer_atomic_umin_x2 a[6:7], off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x94,0xe1,0x00,0x06,0x83,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_umin_x2 a[6:7], off, s[12:15], s3 offset:4095
+
+// GFX90A: buffer_atomic_umin_x2 a[6:7], off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x94,0xe1,0x00,0x06,0x98,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_umin_x2 a[6:7], off, s[96:99], s3 offset:4095
+
+// GFX90A: buffer_atomic_umin_x2 a[6:7], off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x94,0xe1,0x00,0x06,0x82,0x65]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_umin_x2 a[6:7], off, s[8:11], s101 offset:4095
+
+// GFX90A: buffer_atomic_umin_x2 a[6:7], off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x94,0xe1,0x00,0x06,0x82,0x7c]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_umin_x2 a[6:7], off, s[8:11], m0 offset:4095
+
+// GFX90A: buffer_atomic_umin_x2 a[6:7], off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x94,0xe1,0x00,0x06,0x82,0x80]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_umin_x2 a[6:7], off, s[8:11], 0 offset:4095
+
+// GFX90A: buffer_atomic_umin_x2 a[6:7], off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x94,0xe1,0x00,0x06,0x82,0xc1]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_umin_x2 a[6:7], off, s[8:11], -1 offset:4095
+
+// GFX90A: buffer_atomic_umin_x2 a[6:7], v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x94,0xe1,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_umin_x2 a[6:7], v0, s[8:11], s3 idxen offset:4095
+
+// GFX90A: buffer_atomic_umin_x2 a[6:7], v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x94,0xe1,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_umin_x2 a[6:7], v0, s[8:11], s3 offen offset:4095
+
+// GFX90A: buffer_atomic_umin_x2 a[6:7], off, s[8:11], s3 ; encoding: [0x00,0x00,0x94,0xe1,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_umin_x2 a[6:7], off, s[8:11], s3
+
+// GFX90A: buffer_atomic_umin_x2 a[6:7], off, s[8:11], s3 ; encoding: [0x00,0x00,0x94,0xe1,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_umin_x2 a[6:7], off, s[8:11], s3
+
+// GFX90A: buffer_atomic_umin_x2 a[6:7], off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x94,0xe1,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_umin_x2 a[6:7], off, s[8:11], s3 offset:7
+
+// GFX90A: buffer_atomic_umin_x2 a[6:7], off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x94,0xe1,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_umin_x2 a[6:7], off, s[8:11], s3 offset:4095 glc
+
+// GFX90A: buffer_atomic_umin_x2 a[6:7], off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x96,0xe1,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_umin_x2 a[6:7], off, s[8:11], s3 offset:4095 slc
+
+// GFX90A: buffer_atomic_smax_x2 a[6:7], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x98,0xe1,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_smax_x2 a[6:7], off, s[8:11], s3 offset:4095
+
+// GFX90A: buffer_atomic_smax_x2 a[254:255], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x98,0xe1,0x00,0xfe,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_smax_x2 a[254:255], off, s[8:11], s3 offset:4095
+
+// GFX90A: buffer_atomic_smax_x2 a[6:7], off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x98,0xe1,0x00,0x06,0x83,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_smax_x2 a[6:7], off, s[12:15], s3 offset:4095
+
+// GFX90A: buffer_atomic_smax_x2 a[6:7], off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x98,0xe1,0x00,0x06,0x98,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_smax_x2 a[6:7], off, s[96:99], s3 offset:4095
+
+// GFX90A: buffer_atomic_smax_x2 a[6:7], off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x98,0xe1,0x00,0x06,0x82,0x65]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_smax_x2 a[6:7], off, s[8:11], s101 offset:4095
+
+// GFX90A: buffer_atomic_smax_x2 a[6:7], off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x98,0xe1,0x00,0x06,0x82,0x7c]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_smax_x2 a[6:7], off, s[8:11], m0 offset:4095
+
+// GFX90A: buffer_atomic_smax_x2 a[6:7], off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x98,0xe1,0x00,0x06,0x82,0x80]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_smax_x2 a[6:7], off, s[8:11], 0 offset:4095
+
+// GFX90A: buffer_atomic_smax_x2 a[6:7], off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x98,0xe1,0x00,0x06,0x82,0xc1]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_smax_x2 a[6:7], off, s[8:11], -1 offset:4095
+
+// GFX90A: buffer_atomic_smax_x2 a[6:7], v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x98,0xe1,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_smax_x2 a[6:7], v0, s[8:11], s3 idxen offset:4095
+
+// GFX90A: buffer_atomic_smax_x2 a[6:7], v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x98,0xe1,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_smax_x2 a[6:7], v0, s[8:11], s3 offen offset:4095
+
+// GFX90A: buffer_atomic_smax_x2 a[6:7], off, s[8:11], s3 ; encoding: [0x00,0x00,0x98,0xe1,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_smax_x2 a[6:7], off, s[8:11], s3
+
+// GFX90A: buffer_atomic_smax_x2 a[6:7], off, s[8:11], s3 ; encoding: [0x00,0x00,0x98,0xe1,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_smax_x2 a[6:7], off, s[8:11], s3
+
+// GFX90A: buffer_atomic_smax_x2 a[6:7], off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x98,0xe1,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_smax_x2 a[6:7], off, s[8:11], s3 offset:7
+
+// GFX90A: buffer_atomic_smax_x2 a[6:7], off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x98,0xe1,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_smax_x2 a[6:7], off, s[8:11], s3 offset:4095 glc
+
+// GFX90A: buffer_atomic_smax_x2 a[6:7], off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x9a,0xe1,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_smax_x2 a[6:7], off, s[8:11], s3 offset:4095 slc
+
+// GFX90A: buffer_atomic_umax_x2 a[6:7], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x9c,0xe1,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_umax_x2 a[6:7], off, s[8:11], s3 offset:4095
+
+// GFX90A: buffer_atomic_umax_x2 a[254:255], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x9c,0xe1,0x00,0xfe,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_umax_x2 a[254:255], off, s[8:11], s3 offset:4095
+
+// GFX90A: buffer_atomic_umax_x2 a[6:7], off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x9c,0xe1,0x00,0x06,0x83,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_umax_x2 a[6:7], off, s[12:15], s3 offset:4095
+
+// GFX90A: buffer_atomic_umax_x2 a[6:7], off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x9c,0xe1,0x00,0x06,0x98,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_umax_x2 a[6:7], off, s[96:99], s3 offset:4095
+
+// GFX90A: buffer_atomic_umax_x2 a[6:7], off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x9c,0xe1,0x00,0x06,0x82,0x65]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_umax_x2 a[6:7], off, s[8:11], s101 offset:4095
+
+// GFX90A: buffer_atomic_umax_x2 a[6:7], off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x9c,0xe1,0x00,0x06,0x82,0x7c]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_umax_x2 a[6:7], off, s[8:11], m0 offset:4095
+
+// GFX90A: buffer_atomic_umax_x2 a[6:7], off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x9c,0xe1,0x00,0x06,0x82,0x80]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_umax_x2 a[6:7], off, s[8:11], 0 offset:4095
+
+// GFX90A: buffer_atomic_umax_x2 a[6:7], off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x9c,0xe1,0x00,0x06,0x82,0xc1]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_umax_x2 a[6:7], off, s[8:11], -1 offset:4095
+
+// GFX90A: buffer_atomic_umax_x2 a[6:7], v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x9c,0xe1,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_umax_x2 a[6:7], v0, s[8:11], s3 idxen offset:4095
+
+// GFX90A: buffer_atomic_umax_x2 a[6:7], v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x9c,0xe1,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_umax_x2 a[6:7], v0, s[8:11], s3 offen offset:4095
+
+// GFX90A: buffer_atomic_umax_x2 a[6:7], off, s[8:11], s3 ; encoding: [0x00,0x00,0x9c,0xe1,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_umax_x2 a[6:7], off, s[8:11], s3
+
+// GFX90A: buffer_atomic_umax_x2 a[6:7], off, s[8:11], s3 ; encoding: [0x00,0x00,0x9c,0xe1,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_umax_x2 a[6:7], off, s[8:11], s3
+
+// GFX90A: buffer_atomic_umax_x2 a[6:7], off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x9c,0xe1,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_umax_x2 a[6:7], off, s[8:11], s3 offset:7
+
+// GFX90A: buffer_atomic_umax_x2 a[6:7], off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x9c,0xe1,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_umax_x2 a[6:7], off, s[8:11], s3 offset:4095 glc
+
+// GFX90A: buffer_atomic_umax_x2 a[6:7], off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x9e,0xe1,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_umax_x2 a[6:7], off, s[8:11], s3 offset:4095 slc
+
+// GFX90A: buffer_atomic_and_x2 a[6:7], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0xa0,0xe1,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_and_x2 a[6:7], off, s[8:11], s3 offset:4095
+
+// GFX90A: buffer_atomic_and_x2 a[254:255], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0xa0,0xe1,0x00,0xfe,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_and_x2 a[254:255], off, s[8:11], s3 offset:4095
+
+// GFX90A: buffer_atomic_and_x2 a[6:7], off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0xa0,0xe1,0x00,0x06,0x83,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_and_x2 a[6:7], off, s[12:15], s3 offset:4095
+
+// GFX90A: buffer_atomic_and_x2 a[6:7], off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0xa0,0xe1,0x00,0x06,0x98,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_and_x2 a[6:7], off, s[96:99], s3 offset:4095
+
+// GFX90A: buffer_atomic_and_x2 a[6:7], off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0xa0,0xe1,0x00,0x06,0x82,0x65]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_and_x2 a[6:7], off, s[8:11], s101 offset:4095
+
+// GFX90A: buffer_atomic_and_x2 a[6:7], off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0xa0,0xe1,0x00,0x06,0x82,0x7c]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_and_x2 a[6:7], off, s[8:11], m0 offset:4095
+
+// GFX90A: buffer_atomic_and_x2 a[6:7], off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0xa0,0xe1,0x00,0x06,0x82,0x80]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_and_x2 a[6:7], off, s[8:11], 0 offset:4095
+
+// GFX90A: buffer_atomic_and_x2 a[6:7], off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0xa0,0xe1,0x00,0x06,0x82,0xc1]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_and_x2 a[6:7], off, s[8:11], -1 offset:4095
+
+// GFX90A: buffer_atomic_and_x2 a[6:7], v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0xa0,0xe1,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_and_x2 a[6:7], v0, s[8:11], s3 idxen offset:4095
+
+// GFX90A: buffer_atomic_and_x2 a[6:7], v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0xa0,0xe1,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_and_x2 a[6:7], v0, s[8:11], s3 offen offset:4095
+
+// GFX90A: buffer_atomic_and_x2 a[6:7], off, s[8:11], s3 ; encoding: [0x00,0x00,0xa0,0xe1,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_and_x2 a[6:7], off, s[8:11], s3
+
+// GFX90A: buffer_atomic_and_x2 a[6:7], off, s[8:11], s3 ; encoding: [0x00,0x00,0xa0,0xe1,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_and_x2 a[6:7], off, s[8:11], s3
+
+// GFX90A: buffer_atomic_and_x2 a[6:7], off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0xa0,0xe1,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_and_x2 a[6:7], off, s[8:11], s3 offset:7
+
+// GFX90A: buffer_atomic_and_x2 a[6:7], off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0xa0,0xe1,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_and_x2 a[6:7], off, s[8:11], s3 offset:4095 glc
+
+// GFX90A: buffer_atomic_and_x2 a[6:7], off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0xa2,0xe1,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_and_x2 a[6:7], off, s[8:11], s3 offset:4095 slc
+
+// GFX90A: buffer_atomic_or_x2 a[6:7], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0xa4,0xe1,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_or_x2 a[6:7], off, s[8:11], s3 offset:4095
+
+// GFX90A: buffer_atomic_or_x2 a[254:255], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0xa4,0xe1,0x00,0xfe,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_or_x2 a[254:255], off, s[8:11], s3 offset:4095
+
+// GFX90A: buffer_atomic_or_x2 a[6:7], off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0xa4,0xe1,0x00,0x06,0x83,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_or_x2 a[6:7], off, s[12:15], s3 offset:4095
+
+// GFX90A: buffer_atomic_or_x2 a[6:7], off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0xa4,0xe1,0x00,0x06,0x98,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_or_x2 a[6:7], off, s[96:99], s3 offset:4095
+
+// GFX90A: buffer_atomic_or_x2 a[6:7], off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0xa4,0xe1,0x00,0x06,0x82,0x65]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_or_x2 a[6:7], off, s[8:11], s101 offset:4095
+
+// GFX90A: buffer_atomic_or_x2 a[6:7], off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0xa4,0xe1,0x00,0x06,0x82,0x7c]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_or_x2 a[6:7], off, s[8:11], m0 offset:4095
+
+// GFX90A: buffer_atomic_or_x2 a[6:7], off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0xa4,0xe1,0x00,0x06,0x82,0x80]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_or_x2 a[6:7], off, s[8:11], 0 offset:4095
+
+// GFX90A: buffer_atomic_or_x2 a[6:7], off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0xa4,0xe1,0x00,0x06,0x82,0xc1]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_or_x2 a[6:7], off, s[8:11], -1 offset:4095
+
+// GFX90A: buffer_atomic_or_x2 a[6:7], v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0xa4,0xe1,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_or_x2 a[6:7], v0, s[8:11], s3 idxen offset:4095
+
+// GFX90A: buffer_atomic_or_x2 a[6:7], v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0xa4,0xe1,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_or_x2 a[6:7], v0, s[8:11], s3 offen offset:4095
+
+// GFX90A: buffer_atomic_or_x2 a[6:7], off, s[8:11], s3 ; encoding: [0x00,0x00,0xa4,0xe1,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_or_x2 a[6:7], off, s[8:11], s3
+
+// GFX90A: buffer_atomic_or_x2 a[6:7], off, s[8:11], s3 ; encoding: [0x00,0x00,0xa4,0xe1,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_or_x2 a[6:7], off, s[8:11], s3
+
+// GFX90A: buffer_atomic_or_x2 a[6:7], off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0xa4,0xe1,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_or_x2 a[6:7], off, s[8:11], s3 offset:7
+
+// GFX90A: buffer_atomic_or_x2 a[6:7], off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0xa4,0xe1,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_or_x2 a[6:7], off, s[8:11], s3 offset:4095 glc
+
+// GFX90A: buffer_atomic_or_x2 a[6:7], off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0xa6,0xe1,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_or_x2 a[6:7], off, s[8:11], s3 offset:4095 slc
+
+// GFX90A: buffer_atomic_xor_x2 a[6:7], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0xa8,0xe1,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_xor_x2 a[6:7], off, s[8:11], s3 offset:4095
+
+// GFX90A: buffer_atomic_xor_x2 a[254:255], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0xa8,0xe1,0x00,0xfe,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_xor_x2 a[254:255], off, s[8:11], s3 offset:4095
+
+// GFX90A: buffer_atomic_xor_x2 a[6:7], off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0xa8,0xe1,0x00,0x06,0x83,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_xor_x2 a[6:7], off, s[12:15], s3 offset:4095
+
+// GFX90A: buffer_atomic_xor_x2 a[6:7], off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0xa8,0xe1,0x00,0x06,0x98,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_xor_x2 a[6:7], off, s[96:99], s3 offset:4095
+
+// GFX90A: buffer_atomic_xor_x2 a[6:7], off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0xa8,0xe1,0x00,0x06,0x82,0x65]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_xor_x2 a[6:7], off, s[8:11], s101 offset:4095
+
+// GFX90A: buffer_atomic_xor_x2 a[6:7], off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0xa8,0xe1,0x00,0x06,0x82,0x7c]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_xor_x2 a[6:7], off, s[8:11], m0 offset:4095
+
+// GFX90A: buffer_atomic_xor_x2 a[6:7], off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0xa8,0xe1,0x00,0x06,0x82,0x80]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_xor_x2 a[6:7], off, s[8:11], 0 offset:4095
+
+// GFX90A: buffer_atomic_xor_x2 a[6:7], off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0xa8,0xe1,0x00,0x06,0x82,0xc1]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_xor_x2 a[6:7], off, s[8:11], -1 offset:4095
+
+// GFX90A: buffer_atomic_xor_x2 a[6:7], v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0xa8,0xe1,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_xor_x2 a[6:7], v0, s[8:11], s3 idxen offset:4095
+
+// GFX90A: buffer_atomic_xor_x2 a[6:7], v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0xa8,0xe1,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_xor_x2 a[6:7], v0, s[8:11], s3 offen offset:4095
+
+// GFX90A: buffer_atomic_xor_x2 a[6:7], off, s[8:11], s3 ; encoding: [0x00,0x00,0xa8,0xe1,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_xor_x2 a[6:7], off, s[8:11], s3
+
+// GFX90A: buffer_atomic_xor_x2 a[6:7], off, s[8:11], s3 ; encoding: [0x00,0x00,0xa8,0xe1,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_xor_x2 a[6:7], off, s[8:11], s3
+
+// GFX90A: buffer_atomic_xor_x2 a[6:7], off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0xa8,0xe1,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_xor_x2 a[6:7], off, s[8:11], s3 offset:7
+
+// GFX90A: buffer_atomic_xor_x2 a[6:7], off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0xa8,0xe1,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_xor_x2 a[6:7], off, s[8:11], s3 offset:4095 glc
+
+// GFX90A: buffer_atomic_xor_x2 a[6:7], off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0xaa,0xe1,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_xor_x2 a[6:7], off, s[8:11], s3 offset:4095 slc
+
+// GFX90A: buffer_atomic_inc_x2 a[6:7], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0xac,0xe1,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_inc_x2 a[6:7], off, s[8:11], s3 offset:4095
+
+// GFX90A: buffer_atomic_inc_x2 a[254:255], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0xac,0xe1,0x00,0xfe,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_inc_x2 a[254:255], off, s[8:11], s3 offset:4095
+
+// GFX90A: buffer_atomic_inc_x2 a[6:7], off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0xac,0xe1,0x00,0x06,0x83,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_inc_x2 a[6:7], off, s[12:15], s3 offset:4095
+
+// GFX90A: buffer_atomic_inc_x2 a[6:7], off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0xac,0xe1,0x00,0x06,0x98,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_inc_x2 a[6:7], off, s[96:99], s3 offset:4095
+
+// GFX90A: buffer_atomic_inc_x2 a[6:7], off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0xac,0xe1,0x00,0x06,0x82,0x65]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_inc_x2 a[6:7], off, s[8:11], s101 offset:4095
+
+// GFX90A: buffer_atomic_inc_x2 a[6:7], off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0xac,0xe1,0x00,0x06,0x82,0x7c]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_inc_x2 a[6:7], off, s[8:11], m0 offset:4095
+
+// GFX90A: buffer_atomic_inc_x2 a[6:7], off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0xac,0xe1,0x00,0x06,0x82,0x80]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_inc_x2 a[6:7], off, s[8:11], 0 offset:4095
+
+// GFX90A: buffer_atomic_inc_x2 a[6:7], off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0xac,0xe1,0x00,0x06,0x82,0xc1]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_inc_x2 a[6:7], off, s[8:11], -1 offset:4095
+
+// GFX90A: buffer_atomic_inc_x2 a[6:7], v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0xac,0xe1,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_inc_x2 a[6:7], v0, s[8:11], s3 idxen offset:4095
+
+// GFX90A: buffer_atomic_inc_x2 a[6:7], v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0xac,0xe1,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_inc_x2 a[6:7], v0, s[8:11], s3 offen offset:4095
+
+// GFX90A: buffer_atomic_inc_x2 a[6:7], off, s[8:11], s3 ; encoding: [0x00,0x00,0xac,0xe1,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_inc_x2 a[6:7], off, s[8:11], s3
+
+// GFX90A: buffer_atomic_inc_x2 a[6:7], off, s[8:11], s3 ; encoding: [0x00,0x00,0xac,0xe1,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_inc_x2 a[6:7], off, s[8:11], s3
+
+// GFX90A: buffer_atomic_inc_x2 a[6:7], off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0xac,0xe1,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_inc_x2 a[6:7], off, s[8:11], s3 offset:7
+
+// GFX90A: buffer_atomic_inc_x2 a[6:7], off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0xac,0xe1,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_inc_x2 a[6:7], off, s[8:11], s3 offset:4095 glc
+
+// GFX90A: buffer_atomic_inc_x2 a[6:7], off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0xae,0xe1,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_inc_x2 a[6:7], off, s[8:11], s3 offset:4095 slc
+
+// GFX90A: buffer_atomic_dec_x2 a[6:7], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0xb0,0xe1,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_dec_x2 a[6:7], off, s[8:11], s3 offset:4095
+
+// GFX90A: buffer_atomic_dec_x2 a[254:255], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0xb0,0xe1,0x00,0xfe,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_dec_x2 a[254:255], off, s[8:11], s3 offset:4095
+
+// GFX90A: buffer_atomic_dec_x2 a[6:7], off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0xb0,0xe1,0x00,0x06,0x83,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_dec_x2 a[6:7], off, s[12:15], s3 offset:4095
+
+// GFX90A: buffer_atomic_dec_x2 a[6:7], off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0xb0,0xe1,0x00,0x06,0x98,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_dec_x2 a[6:7], off, s[96:99], s3 offset:4095
+
+// GFX90A: buffer_atomic_dec_x2 a[6:7], off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0xb0,0xe1,0x00,0x06,0x82,0x65]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_dec_x2 a[6:7], off, s[8:11], s101 offset:4095
+
+// GFX90A: buffer_atomic_dec_x2 a[6:7], off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0xb0,0xe1,0x00,0x06,0x82,0x7c]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_dec_x2 a[6:7], off, s[8:11], m0 offset:4095
+
+// GFX90A: buffer_atomic_dec_x2 a[6:7], off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0xb0,0xe1,0x00,0x06,0x82,0x80]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_dec_x2 a[6:7], off, s[8:11], 0 offset:4095
+
+// GFX90A: buffer_atomic_dec_x2 a[6:7], off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0xb0,0xe1,0x00,0x06,0x82,0xc1]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_dec_x2 a[6:7], off, s[8:11], -1 offset:4095
+
+// GFX90A: buffer_atomic_dec_x2 a[6:7], v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0xb0,0xe1,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_dec_x2 a[6:7], v0, s[8:11], s3 idxen offset:4095
+
+// GFX90A: buffer_atomic_dec_x2 a[6:7], v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0xb0,0xe1,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_dec_x2 a[6:7], v0, s[8:11], s3 offen offset:4095
+
+// GFX90A: buffer_atomic_dec_x2 a[6:7], off, s[8:11], s3 ; encoding: [0x00,0x00,0xb0,0xe1,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_dec_x2 a[6:7], off, s[8:11], s3
+
+// GFX90A: buffer_atomic_dec_x2 a[6:7], off, s[8:11], s3 ; encoding: [0x00,0x00,0xb0,0xe1,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_dec_x2 a[6:7], off, s[8:11], s3
+
+// GFX90A: buffer_atomic_dec_x2 a[6:7], off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0xb0,0xe1,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_dec_x2 a[6:7], off, s[8:11], s3 offset:7
+
+// GFX90A: buffer_atomic_dec_x2 a[6:7], off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0xb0,0xe1,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_dec_x2 a[6:7], off, s[8:11], s3 offset:4095 glc
+
+// GFX90A: buffer_atomic_dec_x2 a[6:7], off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0xb2,0xe1,0x00,0x06,0x82,0x03]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+buffer_atomic_dec_x2 a[6:7], off, s[8:11], s3 offset:4095 slc
+
+// GFX90A: tbuffer_load_format_x a1, off, s[4:7], s1 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_USCALED] ; encoding: [0x00,0x00,0x78,0xe9,0x00,0x01,0x81,0x01]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+tbuffer_load_format_x a1, off, s[4:7],  dfmt:15, nfmt:2, s1
+
+// GFX90A: tbuffer_load_format_xy a[2:3], off, s[4:7], s1 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_USCALED] ; encoding: [0x00,0x80,0x78,0xe9,0x00,0x02,0x81,0x01]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+tbuffer_load_format_xy a[2:3], off, s[4:7],  dfmt:15, nfmt:2, s1
+
+// GFX90A: tbuffer_load_format_xyz a[2:4], off, s[4:7], s1 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_USCALED] ; encoding: [0x00,0x00,0x79,0xe9,0x00,0x02,0x81,0x01]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+tbuffer_load_format_xyz a[2:4], off, s[4:7],  dfmt:15, nfmt:2, s1
+
+// GFX90A: tbuffer_load_format_xyzw a[2:5], off, s[4:7], s1 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_USCALED] ; encoding: [0x00,0x80,0x79,0xe9,0x00,0x02,0x81,0x01]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+tbuffer_load_format_xyzw a[2:5], off, s[4:7],  dfmt:15, nfmt:2, s1
+
+// GFX90A: tbuffer_store_format_x a1, off, s[4:7], s1 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_USCALED] ; encoding: [0x00,0x00,0x7a,0xe9,0x00,0x01,0x81,0x01]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+tbuffer_store_format_x a1, off, s[4:7],  dfmt:15, nfmt:2, s1
+
+// GFX90A: tbuffer_store_format_xy a[2:3], off, s[4:7], s1 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_USCALED] ; encoding: [0x00,0x80,0x7a,0xe9,0x00,0x02,0x81,0x01]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+tbuffer_store_format_xy a[2:3], off, s[4:7],  dfmt:15, nfmt:2, s1
+
+// GFX90A: tbuffer_store_format_xyzw a[2:5], off, s[4:7], s1 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_USCALED] ; encoding: [0x00,0x80,0x7b,0xe9,0x00,0x02,0x81,0x01]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+tbuffer_store_format_xyzw a[2:5], off, s[4:7],  dfmt:15, nfmt:2, s1
+
+// GFX90A: tbuffer_store_format_xyzw a[2:5], off, ttmp[4:7], ttmp1 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_USCALED] ; encoding: [0x00,0x80,0x7b,0xe9,0x00,0x02,0x9c,0x6d]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+tbuffer_store_format_xyzw a[2:5], off, ttmp[4:7],  dfmt:15, nfmt:2, ttmp1
+
+// GFX90A: tbuffer_store_format_xyzw a[2:5], off, ttmp[4:7], ttmp1 format:[BUF_DATA_FORMAT_RESERVED_15] ; encoding: [0x00,0x80,0x7b,0xe8,0x00,0x02,0x9c,0x6d]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+tbuffer_store_format_xyzw a[2:5], off, ttmp[4:7],  dfmt:15, nfmt:0, ttmp1
+
+// GFX90A: tbuffer_store_format_xyzw a[2:5], off, ttmp[4:7], ttmp1 format:[BUF_DATA_FORMAT_INVALID,BUF_NUM_FORMAT_USCALED] ; encoding: [0x00,0x80,0x03,0xe9,0x00,0x02,0x9c,0x6d]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+tbuffer_store_format_xyzw a[2:5], off, ttmp[4:7],  dfmt:0, nfmt:2, ttmp1
+
+// GFX90A: tbuffer_store_format_xyzw a[2:5], off, ttmp[4:7], ttmp1 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_USCALED] ; encoding: [0x00,0x80,0x7b,0xe9,0x00,0x02,0x9c,0x6d]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+tbuffer_store_format_xyzw a[2:5], off, ttmp[4:7],  dfmt:15, nfmt:2, ttmp1
+
+// GFX90A: ds_add_u32 v1, a2 offset:65535  ; encoding: [0xff,0xff,0x00,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_add_u32 v1, a2 offset:65535
+
+// GFX90A: ds_add_u32 v255, a2 offset:65535 ; encoding: [0xff,0xff,0x00,0xda,0xff,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_add_u32 v255, a2 offset:65535
+
+// GFX90A: ds_add_u32 v1, a255 offset:65535 ; encoding: [0xff,0xff,0x00,0xda,0x01,0xff,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_add_u32 v1, a255 offset:65535
+
+// GFX90A: ds_add_u32 v1, a2               ; encoding: [0x00,0x00,0x00,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_add_u32 v1, a2
+
+// GFX90A: ds_add_u32 v1, a2               ; encoding: [0x00,0x00,0x00,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_add_u32 v1, a2
+
+// GFX90A: ds_add_u32 v1, a2 offset:4      ; encoding: [0x04,0x00,0x00,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_add_u32 v1, a2 offset:4
+
+// GFX90A: ds_add_u32 v1, a2 offset:65535 gds ; encoding: [0xff,0xff,0x01,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_add_u32 v1, a2 offset:65535 gds
+
+// GFX90A: ds_sub_u32 v1, a2 offset:65535  ; encoding: [0xff,0xff,0x02,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_sub_u32 v1, a2 offset:65535
+
+// GFX90A: ds_sub_u32 v255, a2 offset:65535 ; encoding: [0xff,0xff,0x02,0xda,0xff,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_sub_u32 v255, a2 offset:65535
+
+// GFX90A: ds_sub_u32 v1, a255 offset:65535 ; encoding: [0xff,0xff,0x02,0xda,0x01,0xff,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_sub_u32 v1, a255 offset:65535
+
+// GFX90A: ds_sub_u32 v1, a2               ; encoding: [0x00,0x00,0x02,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_sub_u32 v1, a2
+
+// GFX90A: ds_sub_u32 v1, a2               ; encoding: [0x00,0x00,0x02,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_sub_u32 v1, a2
+
+// GFX90A: ds_sub_u32 v1, a2 offset:4      ; encoding: [0x04,0x00,0x02,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_sub_u32 v1, a2 offset:4
+
+// GFX90A: ds_sub_u32 v1, a2 offset:65535 gds ; encoding: [0xff,0xff,0x03,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_sub_u32 v1, a2 offset:65535 gds
+
+// GFX90A: ds_rsub_u32 v1, a2 offset:65535 ; encoding: [0xff,0xff,0x04,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_rsub_u32 v1, a2 offset:65535
+
+// GFX90A: ds_rsub_u32 v255, a2 offset:65535 ; encoding: [0xff,0xff,0x04,0xda,0xff,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_rsub_u32 v255, a2 offset:65535
+
+// GFX90A: ds_rsub_u32 v1, a255 offset:65535 ; encoding: [0xff,0xff,0x04,0xda,0x01,0xff,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_rsub_u32 v1, a255 offset:65535
+
+// GFX90A: ds_rsub_u32 v1, a2              ; encoding: [0x00,0x00,0x04,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_rsub_u32 v1, a2
+
+// GFX90A: ds_rsub_u32 v1, a2              ; encoding: [0x00,0x00,0x04,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_rsub_u32 v1, a2
+
+// GFX90A: ds_rsub_u32 v1, a2 offset:4     ; encoding: [0x04,0x00,0x04,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_rsub_u32 v1, a2 offset:4
+
+// GFX90A: ds_rsub_u32 v1, a2 offset:65535 gds ; encoding: [0xff,0xff,0x05,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_rsub_u32 v1, a2 offset:65535 gds
+
+// GFX90A: ds_inc_u32 v1, a2 offset:65535  ; encoding: [0xff,0xff,0x06,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_inc_u32 v1, a2 offset:65535
+
+// GFX90A: ds_inc_u32 v255, a2 offset:65535 ; encoding: [0xff,0xff,0x06,0xda,0xff,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_inc_u32 v255, a2 offset:65535
+
+// GFX90A: ds_inc_u32 v1, a255 offset:65535 ; encoding: [0xff,0xff,0x06,0xda,0x01,0xff,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_inc_u32 v1, a255 offset:65535
+
+// GFX90A: ds_inc_u32 v1, a2               ; encoding: [0x00,0x00,0x06,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_inc_u32 v1, a2
+
+// GFX90A: ds_inc_u32 v1, a2               ; encoding: [0x00,0x00,0x06,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_inc_u32 v1, a2
+
+// GFX90A: ds_inc_u32 v1, a2 offset:4      ; encoding: [0x04,0x00,0x06,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_inc_u32 v1, a2 offset:4
+
+// GFX90A: ds_inc_u32 v1, a2 offset:65535 gds ; encoding: [0xff,0xff,0x07,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_inc_u32 v1, a2 offset:65535 gds
+
+// GFX90A: ds_dec_u32 v1, a2 offset:65535  ; encoding: [0xff,0xff,0x08,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_dec_u32 v1, a2 offset:65535
+
+// GFX90A: ds_dec_u32 v255, a2 offset:65535 ; encoding: [0xff,0xff,0x08,0xda,0xff,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_dec_u32 v255, a2 offset:65535
+
+// GFX90A: ds_dec_u32 v1, a255 offset:65535 ; encoding: [0xff,0xff,0x08,0xda,0x01,0xff,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_dec_u32 v1, a255 offset:65535
+
+// GFX90A: ds_dec_u32 v1, a2               ; encoding: [0x00,0x00,0x08,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_dec_u32 v1, a2
+
+// GFX90A: ds_dec_u32 v1, a2               ; encoding: [0x00,0x00,0x08,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_dec_u32 v1, a2
+
+// GFX90A: ds_dec_u32 v1, a2 offset:4      ; encoding: [0x04,0x00,0x08,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_dec_u32 v1, a2 offset:4
+
+// GFX90A: ds_dec_u32 v1, a2 offset:65535 gds ; encoding: [0xff,0xff,0x09,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_dec_u32 v1, a2 offset:65535 gds
+
+// GFX90A: ds_min_i32 v1, a2 offset:65535  ; encoding: [0xff,0xff,0x0a,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_min_i32 v1, a2 offset:65535
+
+// GFX90A: ds_min_i32 v255, a2 offset:65535 ; encoding: [0xff,0xff,0x0a,0xda,0xff,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_min_i32 v255, a2 offset:65535
+
+// GFX90A: ds_min_i32 v1, a255 offset:65535 ; encoding: [0xff,0xff,0x0a,0xda,0x01,0xff,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_min_i32 v1, a255 offset:65535
+
+// GFX90A: ds_min_i32 v1, a2               ; encoding: [0x00,0x00,0x0a,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_min_i32 v1, a2
+
+// GFX90A: ds_min_i32 v1, a2               ; encoding: [0x00,0x00,0x0a,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_min_i32 v1, a2
+
+// GFX90A: ds_min_i32 v1, a2 offset:4      ; encoding: [0x04,0x00,0x0a,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_min_i32 v1, a2 offset:4
+
+// GFX90A: ds_min_i32 v1, a2 offset:65535 gds ; encoding: [0xff,0xff,0x0b,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_min_i32 v1, a2 offset:65535 gds
+
+// GFX90A: ds_max_i32 v1, a2 offset:65535  ; encoding: [0xff,0xff,0x0c,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_max_i32 v1, a2 offset:65535
+
+// GFX90A: ds_max_i32 v255, a2 offset:65535 ; encoding: [0xff,0xff,0x0c,0xda,0xff,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_max_i32 v255, a2 offset:65535
+
+// GFX90A: ds_max_i32 v1, a255 offset:65535 ; encoding: [0xff,0xff,0x0c,0xda,0x01,0xff,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_max_i32 v1, a255 offset:65535
+
+// GFX90A: ds_max_i32 v1, a2               ; encoding: [0x00,0x00,0x0c,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_max_i32 v1, a2
+
+// GFX90A: ds_max_i32 v1, a2               ; encoding: [0x00,0x00,0x0c,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_max_i32 v1, a2
+
+// GFX90A: ds_max_i32 v1, a2 offset:4      ; encoding: [0x04,0x00,0x0c,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_max_i32 v1, a2 offset:4
+
+// GFX90A: ds_max_i32 v1, a2 offset:65535 gds ; encoding: [0xff,0xff,0x0d,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_max_i32 v1, a2 offset:65535 gds
+
+// GFX90A: ds_min_u32 v1, a2 offset:65535  ; encoding: [0xff,0xff,0x0e,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_min_u32 v1, a2 offset:65535
+
+// GFX90A: ds_min_u32 v255, a2 offset:65535 ; encoding: [0xff,0xff,0x0e,0xda,0xff,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_min_u32 v255, a2 offset:65535
+
+// GFX90A: ds_min_u32 v1, a255 offset:65535 ; encoding: [0xff,0xff,0x0e,0xda,0x01,0xff,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_min_u32 v1, a255 offset:65535
+
+// GFX90A: ds_min_u32 v1, a2               ; encoding: [0x00,0x00,0x0e,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_min_u32 v1, a2
+
+// GFX90A: ds_min_u32 v1, a2               ; encoding: [0x00,0x00,0x0e,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_min_u32 v1, a2
+
+// GFX90A: ds_min_u32 v1, a2 offset:4      ; encoding: [0x04,0x00,0x0e,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_min_u32 v1, a2 offset:4
+
+// GFX90A: ds_min_u32 v1, a2 offset:65535 gds ; encoding: [0xff,0xff,0x0f,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_min_u32 v1, a2 offset:65535 gds
+
+// GFX90A: ds_max_u32 v1, a2 offset:65535  ; encoding: [0xff,0xff,0x10,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_max_u32 v1, a2 offset:65535
+
+// GFX90A: ds_max_u32 v255, a2 offset:65535 ; encoding: [0xff,0xff,0x10,0xda,0xff,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_max_u32 v255, a2 offset:65535
+
+// GFX90A: ds_max_u32 v1, a255 offset:65535 ; encoding: [0xff,0xff,0x10,0xda,0x01,0xff,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_max_u32 v1, a255 offset:65535
+
+// GFX90A: ds_max_u32 v1, a2               ; encoding: [0x00,0x00,0x10,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_max_u32 v1, a2
+
+// GFX90A: ds_max_u32 v1, a2               ; encoding: [0x00,0x00,0x10,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_max_u32 v1, a2
+
+// GFX90A: ds_max_u32 v1, a2 offset:4      ; encoding: [0x04,0x00,0x10,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_max_u32 v1, a2 offset:4
+
+// GFX90A: ds_max_u32 v1, a2 offset:65535 gds ; encoding: [0xff,0xff,0x11,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_max_u32 v1, a2 offset:65535 gds
+
+// GFX90A: ds_and_b32 v1, a2 offset:65535  ; encoding: [0xff,0xff,0x12,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_and_b32 v1, a2 offset:65535
+
+// GFX90A: ds_and_b32 v255, a2 offset:65535 ; encoding: [0xff,0xff,0x12,0xda,0xff,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_and_b32 v255, a2 offset:65535
+
+// GFX90A: ds_and_b32 v1, a255 offset:65535 ; encoding: [0xff,0xff,0x12,0xda,0x01,0xff,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_and_b32 v1, a255 offset:65535
+
+// GFX90A: ds_and_b32 v1, a2               ; encoding: [0x00,0x00,0x12,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_and_b32 v1, a2
+
+// GFX90A: ds_and_b32 v1, a2               ; encoding: [0x00,0x00,0x12,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_and_b32 v1, a2
+
+// GFX90A: ds_and_b32 v1, a2 offset:4      ; encoding: [0x04,0x00,0x12,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_and_b32 v1, a2 offset:4
+
+// GFX90A: ds_and_b32 v1, a2 offset:65535 gds ; encoding: [0xff,0xff,0x13,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_and_b32 v1, a2 offset:65535 gds
+
+// GFX90A: ds_or_b32 v1, a2 offset:65535   ; encoding: [0xff,0xff,0x14,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_or_b32 v1, a2 offset:65535
+
+// GFX90A: ds_or_b32 v255, a2 offset:65535 ; encoding: [0xff,0xff,0x14,0xda,0xff,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_or_b32 v255, a2 offset:65535
+
+// GFX90A: ds_or_b32 v1, a255 offset:65535 ; encoding: [0xff,0xff,0x14,0xda,0x01,0xff,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_or_b32 v1, a255 offset:65535
+
+// GFX90A: ds_or_b32 v1, a2                ; encoding: [0x00,0x00,0x14,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_or_b32 v1, a2
+
+// GFX90A: ds_or_b32 v1, a2                ; encoding: [0x00,0x00,0x14,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_or_b32 v1, a2
+
+// GFX90A: ds_or_b32 v1, a2 offset:4       ; encoding: [0x04,0x00,0x14,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_or_b32 v1, a2 offset:4
+
+// GFX90A: ds_or_b32 v1, a2 offset:65535 gds ; encoding: [0xff,0xff,0x15,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_or_b32 v1, a2 offset:65535 gds
+
+// GFX90A: ds_xor_b32 v1, a2 offset:65535  ; encoding: [0xff,0xff,0x16,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_xor_b32 v1, a2 offset:65535
+
+// GFX90A: ds_xor_b32 v255, a2 offset:65535 ; encoding: [0xff,0xff,0x16,0xda,0xff,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_xor_b32 v255, a2 offset:65535
+
+// GFX90A: ds_xor_b32 v1, a255 offset:65535 ; encoding: [0xff,0xff,0x16,0xda,0x01,0xff,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_xor_b32 v1, a255 offset:65535
+
+// GFX90A: ds_xor_b32 v1, a2               ; encoding: [0x00,0x00,0x16,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_xor_b32 v1, a2
+
+// GFX90A: ds_xor_b32 v1, a2               ; encoding: [0x00,0x00,0x16,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_xor_b32 v1, a2
+
+// GFX90A: ds_xor_b32 v1, a2 offset:4      ; encoding: [0x04,0x00,0x16,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_xor_b32 v1, a2 offset:4
+
+// GFX90A: ds_xor_b32 v1, a2 offset:65535 gds ; encoding: [0xff,0xff,0x17,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_xor_b32 v1, a2 offset:65535 gds
+
+// GFX90A: ds_mskor_b32 v1, a2, a3 offset:65535 ; encoding: [0xff,0xff,0x18,0xda,0x01,0x02,0x03,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_mskor_b32 v1, a2, a3 offset:65535
+
+// GFX90A: ds_mskor_b32 v255, a2, a3 offset:65535 ; encoding: [0xff,0xff,0x18,0xda,0xff,0x02,0x03,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_mskor_b32 v255, a2, a3 offset:65535
+
+// GFX90A: ds_mskor_b32 v1, a255, a3 offset:65535 ; encoding: [0xff,0xff,0x18,0xda,0x01,0xff,0x03,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_mskor_b32 v1, a255, a3 offset:65535
+
+// GFX90A: ds_mskor_b32 v1, a2, a255 offset:65535 ; encoding: [0xff,0xff,0x18,0xda,0x01,0x02,0xff,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_mskor_b32 v1, a2, a255 offset:65535
+
+// GFX90A: ds_mskor_b32 v1, a2, a3         ; encoding: [0x00,0x00,0x18,0xda,0x01,0x02,0x03,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_mskor_b32 v1, a2, a3
+
+// GFX90A: ds_mskor_b32 v1, a2, a3         ; encoding: [0x00,0x00,0x18,0xda,0x01,0x02,0x03,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_mskor_b32 v1, a2, a3
+
+// GFX90A: ds_mskor_b32 v1, a2, a3 offset:4 ; encoding: [0x04,0x00,0x18,0xda,0x01,0x02,0x03,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_mskor_b32 v1, a2, a3 offset:4
+
+// GFX90A: ds_mskor_b32 v1, a2, a3 offset:65535 gds ; encoding: [0xff,0xff,0x19,0xda,0x01,0x02,0x03,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_mskor_b32 v1, a2, a3 offset:65535 gds
+
+// GFX90A: ds_write_b32 v1, a2 offset:65535 ; encoding: [0xff,0xff,0x1a,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_write_b32 v1, a2 offset:65535
+
+// GFX90A: ds_write_b32 v255, a2 offset:65535 ; encoding: [0xff,0xff,0x1a,0xda,0xff,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_write_b32 v255, a2 offset:65535
+
+// GFX90A: ds_write_b32 v1, a255 offset:65535 ; encoding: [0xff,0xff,0x1a,0xda,0x01,0xff,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_write_b32 v1, a255 offset:65535
+
+// GFX90A: ds_write_b32 v1, a2             ; encoding: [0x00,0x00,0x1a,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_write_b32 v1, a2
+
+// GFX90A: ds_write_b32 v1, a2             ; encoding: [0x00,0x00,0x1a,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_write_b32 v1, a2
+
+// GFX90A: ds_write_b32 v1, a2 offset:4    ; encoding: [0x04,0x00,0x1a,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_write_b32 v1, a2 offset:4
+
+// GFX90A: ds_write_b32 v1, a2 offset:65535 gds ; encoding: [0xff,0xff,0x1b,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_write_b32 v1, a2 offset:65535 gds
+
+// GFX90A: ds_write2_b32 v1, a2, a3 offset0:127 offset1:255 ; encoding: [0x7f,0xff,0x1c,0xda,0x01,0x02,0x03,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_write2_b32 v1, a2, a3 offset0:127 offset1:255
+
+// GFX90A: ds_write2_b32 v255, a2, a3 offset0:127 offset1:255 ; encoding: [0x7f,0xff,0x1c,0xda,0xff,0x02,0x03,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_write2_b32 v255, a2, a3 offset0:127 offset1:255
+
+// GFX90A: ds_write2_b32 v1, a255, a3 offset0:127 offset1:255 ; encoding: [0x7f,0xff,0x1c,0xda,0x01,0xff,0x03,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_write2_b32 v1, a255, a3 offset0:127 offset1:255
+
+// GFX90A: ds_write2_b32 v1, a2, a255 offset0:127 offset1:255 ; encoding: [0x7f,0xff,0x1c,0xda,0x01,0x02,0xff,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_write2_b32 v1, a2, a255 offset0:127 offset1:255
+
+// GFX90A: ds_write2_b32 v1, a2, a3 offset1:255 ; encoding: [0x00,0xff,0x1c,0xda,0x01,0x02,0x03,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_write2_b32 v1, a2, a3 offset1:255
+
+// GFX90A: ds_write2_b32 v1, a2, a3 offset1:255 ; encoding: [0x00,0xff,0x1c,0xda,0x01,0x02,0x03,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_write2_b32 v1, a2, a3 offset1:255
+
+// GFX90A: ds_write2_b32 v1, a2, a3 offset0:16 offset1:255 ; encoding: [0x10,0xff,0x1c,0xda,0x01,0x02,0x03,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_write2_b32 v1, a2, a3 offset0:16 offset1:255
+
+// GFX90A: ds_write2_b32 v1, a2, a3 offset0:127 ; encoding: [0x7f,0x00,0x1c,0xda,0x01,0x02,0x03,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_write2_b32 v1, a2, a3 offset0:127
+
+// GFX90A: ds_write2_b32 v1, a2, a3 offset0:127 ; encoding: [0x7f,0x00,0x1c,0xda,0x01,0x02,0x03,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_write2_b32 v1, a2, a3 offset0:127
+
+// GFX90A: ds_write2_b32 v1, a2, a3 offset0:127 offset1:1 ; encoding: [0x7f,0x01,0x1c,0xda,0x01,0x02,0x03,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_write2_b32 v1, a2, a3 offset0:127 offset1:1
+
+// GFX90A: ds_write2_b32 v1, a2, a3 offset0:127 offset1:255 gds ; encoding: [0x7f,0xff,0x1d,0xda,0x01,0x02,0x03,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_write2_b32 v1, a2, a3 offset0:127 offset1:255 gds
+
+// GFX90A: ds_write2st64_b32 v1, a2, a3 offset0:127 offset1:255 ; encoding: [0x7f,0xff,0x1e,0xda,0x01,0x02,0x03,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_write2st64_b32 v1, a2, a3 offset0:127 offset1:255
+
+// GFX90A: ds_write2st64_b32 v255, a2, a3 offset0:127 offset1:255 ; encoding: [0x7f,0xff,0x1e,0xda,0xff,0x02,0x03,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_write2st64_b32 v255, a2, a3 offset0:127 offset1:255
+
+// GFX90A: ds_write2st64_b32 v1, a255, a3 offset0:127 offset1:255 ; encoding: [0x7f,0xff,0x1e,0xda,0x01,0xff,0x03,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_write2st64_b32 v1, a255, a3 offset0:127 offset1:255
+
+// GFX90A: ds_write2st64_b32 v1, a2, a255 offset0:127 offset1:255 ; encoding: [0x7f,0xff,0x1e,0xda,0x01,0x02,0xff,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_write2st64_b32 v1, a2, a255 offset0:127 offset1:255
+
+// GFX90A: ds_write2st64_b32 v1, a2, a3 offset1:255 ; encoding: [0x00,0xff,0x1e,0xda,0x01,0x02,0x03,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_write2st64_b32 v1, a2, a3 offset1:255
+
+// GFX90A: ds_write2st64_b32 v1, a2, a3 offset1:255 ; encoding: [0x00,0xff,0x1e,0xda,0x01,0x02,0x03,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_write2st64_b32 v1, a2, a3 offset1:255
+
+// GFX90A: ds_write2st64_b32 v1, a2, a3 offset0:16 offset1:255 ; encoding: [0x10,0xff,0x1e,0xda,0x01,0x02,0x03,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_write2st64_b32 v1, a2, a3 offset0:16 offset1:255
+
+// GFX90A: ds_write2st64_b32 v1, a2, a3 offset0:127 ; encoding: [0x7f,0x00,0x1e,0xda,0x01,0x02,0x03,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_write2st64_b32 v1, a2, a3 offset0:127
+
+// GFX90A: ds_write2st64_b32 v1, a2, a3 offset0:127 ; encoding: [0x7f,0x00,0x1e,0xda,0x01,0x02,0x03,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_write2st64_b32 v1, a2, a3 offset0:127
+
+// GFX90A: ds_write2st64_b32 v1, a2, a3 offset0:127 offset1:1 ; encoding: [0x7f,0x01,0x1e,0xda,0x01,0x02,0x03,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_write2st64_b32 v1, a2, a3 offset0:127 offset1:1
+
+// GFX90A: ds_write2st64_b32 v1, a2, a3 offset0:127 offset1:255 gds ; encoding: [0x7f,0xff,0x1f,0xda,0x01,0x02,0x03,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_write2st64_b32 v1, a2, a3 offset0:127 offset1:255 gds
+
+// GFX90A: ds_cmpst_b32 v1, a2, a3 offset:65535 ; encoding: [0xff,0xff,0x20,0xda,0x01,0x02,0x03,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_cmpst_b32 v1, a2, a3 offset:65535
+
+// GFX90A: ds_cmpst_b32 v255, a2, a3 offset:65535 ; encoding: [0xff,0xff,0x20,0xda,0xff,0x02,0x03,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_cmpst_b32 v255, a2, a3 offset:65535
+
+// GFX90A: ds_cmpst_b32 v1, a255, a3 offset:65535 ; encoding: [0xff,0xff,0x20,0xda,0x01,0xff,0x03,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_cmpst_b32 v1, a255, a3 offset:65535
+
+// GFX90A: ds_cmpst_b32 v1, a2, a255 offset:65535 ; encoding: [0xff,0xff,0x20,0xda,0x01,0x02,0xff,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_cmpst_b32 v1, a2, a255 offset:65535
+
+// GFX90A: ds_cmpst_b32 v1, a2, a3         ; encoding: [0x00,0x00,0x20,0xda,0x01,0x02,0x03,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_cmpst_b32 v1, a2, a3
+
+// GFX90A: ds_cmpst_b32 v1, a2, a3         ; encoding: [0x00,0x00,0x20,0xda,0x01,0x02,0x03,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_cmpst_b32 v1, a2, a3
+
+// GFX90A: ds_cmpst_b32 v1, a2, a3 offset:4 ; encoding: [0x04,0x00,0x20,0xda,0x01,0x02,0x03,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_cmpst_b32 v1, a2, a3 offset:4
+
+// GFX90A: ds_cmpst_b32 v1, a2, a3 offset:65535 gds ; encoding: [0xff,0xff,0x21,0xda,0x01,0x02,0x03,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_cmpst_b32 v1, a2, a3 offset:65535 gds
+
+// GFX90A: ds_cmpst_f32 v1, a2, a3 offset:65535 ; encoding: [0xff,0xff,0x22,0xda,0x01,0x02,0x03,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_cmpst_f32 v1, a2, a3 offset:65535
+
+// GFX90A: ds_cmpst_f32 v255, a2, a3 offset:65535 ; encoding: [0xff,0xff,0x22,0xda,0xff,0x02,0x03,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_cmpst_f32 v255, a2, a3 offset:65535
+
+// GFX90A: ds_cmpst_f32 v1, a255, a3 offset:65535 ; encoding: [0xff,0xff,0x22,0xda,0x01,0xff,0x03,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_cmpst_f32 v1, a255, a3 offset:65535
+
+// GFX90A: ds_cmpst_f32 v1, a2, a255 offset:65535 ; encoding: [0xff,0xff,0x22,0xda,0x01,0x02,0xff,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_cmpst_f32 v1, a2, a255 offset:65535
+
+// GFX90A: ds_cmpst_f32 v1, a2, a3         ; encoding: [0x00,0x00,0x22,0xda,0x01,0x02,0x03,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_cmpst_f32 v1, a2, a3
+
+// GFX90A: ds_cmpst_f32 v1, a2, a3         ; encoding: [0x00,0x00,0x22,0xda,0x01,0x02,0x03,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_cmpst_f32 v1, a2, a3
+
+// GFX90A: ds_cmpst_f32 v1, a2, a3 offset:4 ; encoding: [0x04,0x00,0x22,0xda,0x01,0x02,0x03,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_cmpst_f32 v1, a2, a3 offset:4
+
+// GFX90A: ds_cmpst_f32 v1, a2, a3 offset:65535 gds ; encoding: [0xff,0xff,0x23,0xda,0x01,0x02,0x03,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_cmpst_f32 v1, a2, a3 offset:65535 gds
+
+// GFX90A: ds_min_f32 v1, a2 offset:65535  ; encoding: [0xff,0xff,0x24,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_min_f32 v1, a2 offset:65535
+
+// GFX90A: ds_min_f32 v255, a2 offset:65535 ; encoding: [0xff,0xff,0x24,0xda,0xff,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_min_f32 v255, a2 offset:65535
+
+// GFX90A: ds_min_f32 v1, a255 offset:65535 ; encoding: [0xff,0xff,0x24,0xda,0x01,0xff,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_min_f32 v1, a255 offset:65535
+
+// GFX90A: ds_min_f32 v1, a2               ; encoding: [0x00,0x00,0x24,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_min_f32 v1, a2
+
+// GFX90A: ds_min_f32 v1, a2               ; encoding: [0x00,0x00,0x24,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_min_f32 v1, a2
+
+// GFX90A: ds_min_f32 v1, a2 offset:4      ; encoding: [0x04,0x00,0x24,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_min_f32 v1, a2 offset:4
+
+// GFX90A: ds_min_f32 v1, a2 offset:65535 gds ; encoding: [0xff,0xff,0x25,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_min_f32 v1, a2 offset:65535 gds
+
+// GFX90A: ds_max_f32 v1, a2 offset:65535  ; encoding: [0xff,0xff,0x26,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_max_f32 v1, a2 offset:65535
+
+// GFX90A: ds_max_f32 v255, a2 offset:65535 ; encoding: [0xff,0xff,0x26,0xda,0xff,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_max_f32 v255, a2 offset:65535
+
+// GFX90A: ds_max_f32 v1, a255 offset:65535 ; encoding: [0xff,0xff,0x26,0xda,0x01,0xff,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_max_f32 v1, a255 offset:65535
+
+// GFX90A: ds_max_f32 v1, a2               ; encoding: [0x00,0x00,0x26,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_max_f32 v1, a2
+
+// GFX90A: ds_max_f32 v1, a2               ; encoding: [0x00,0x00,0x26,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_max_f32 v1, a2
+
+// GFX90A: ds_max_f32 v1, a2 offset:4      ; encoding: [0x04,0x00,0x26,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_max_f32 v1, a2 offset:4
+
+// GFX90A: ds_max_f32 v1, a2 offset:65535 gds ; encoding: [0xff,0xff,0x27,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_max_f32 v1, a2 offset:65535 gds
+
+// GFX90A: ds_add_f32 v1, a2 offset:65535  ; encoding: [0xff,0xff,0x2a,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_add_f32 v1, a2 offset:65535
+
+// GFX90A: ds_add_f32 v255, a2 offset:65535 ; encoding: [0xff,0xff,0x2a,0xda,0xff,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_add_f32 v255, a2 offset:65535
+
+// GFX90A: ds_add_f32 v1, a255 offset:65535 ; encoding: [0xff,0xff,0x2a,0xda,0x01,0xff,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_add_f32 v1, a255 offset:65535
+
+// GFX90A: ds_add_f32 v1, a2               ; encoding: [0x00,0x00,0x2a,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_add_f32 v1, a2
+
+// GFX90A: ds_add_f32 v1, a2               ; encoding: [0x00,0x00,0x2a,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_add_f32 v1, a2
+
+// GFX90A: ds_add_f32 v1, a2 offset:4      ; encoding: [0x04,0x00,0x2a,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_add_f32 v1, a2 offset:4
+
+// GFX90A: ds_add_f32 v1, a2 offset:65535 gds ; encoding: [0xff,0xff,0x2b,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_add_f32 v1, a2 offset:65535 gds
+
+// GFX90A: ds_write_b8 v1, a2 offset:65535 ; encoding: [0xff,0xff,0x3c,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_write_b8 v1, a2 offset:65535
+
+// GFX90A: ds_write_b8 v255, a2 offset:65535 ; encoding: [0xff,0xff,0x3c,0xda,0xff,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_write_b8 v255, a2 offset:65535
+
+// GFX90A: ds_write_b8 v1, a255 offset:65535 ; encoding: [0xff,0xff,0x3c,0xda,0x01,0xff,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_write_b8 v1, a255 offset:65535
+
+// GFX90A: ds_write_b8 v1, a2              ; encoding: [0x00,0x00,0x3c,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_write_b8 v1, a2
+
+// GFX90A: ds_write_b8 v1, a2              ; encoding: [0x00,0x00,0x3c,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_write_b8 v1, a2
+
+// GFX90A: ds_write_b8 v1, a2 offset:4     ; encoding: [0x04,0x00,0x3c,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_write_b8 v1, a2 offset:4
+
+// GFX90A: ds_write_b8 v1, a2 offset:65535 gds ; encoding: [0xff,0xff,0x3d,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_write_b8 v1, a2 offset:65535 gds
+
+// GFX90A: ds_write_b16 v1, a2 offset:65535 ; encoding: [0xff,0xff,0x3e,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_write_b16 v1, a2 offset:65535
+
+// GFX90A: ds_write_b16 v255, a2 offset:65535 ; encoding: [0xff,0xff,0x3e,0xda,0xff,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_write_b16 v255, a2 offset:65535
+
+// GFX90A: ds_write_b16 v1, a255 offset:65535 ; encoding: [0xff,0xff,0x3e,0xda,0x01,0xff,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_write_b16 v1, a255 offset:65535
+
+// GFX90A: ds_write_b16 v1, a2             ; encoding: [0x00,0x00,0x3e,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_write_b16 v1, a2
+
+// GFX90A: ds_write_b16 v1, a2             ; encoding: [0x00,0x00,0x3e,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_write_b16 v1, a2
+
+// GFX90A: ds_write_b16 v1, a2 offset:4    ; encoding: [0x04,0x00,0x3e,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_write_b16 v1, a2 offset:4
+
+// GFX90A: ds_write_b16 v1, a2 offset:65535 gds ; encoding: [0xff,0xff,0x3f,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_write_b16 v1, a2 offset:65535 gds
+
+// GFX90A: ds_add_rtn_u32 a5, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x40,0xda,0x01,0x02,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_add_rtn_u32 a5, v1, a2 offset:65535
+
+// GFX90A: ds_add_rtn_u32 a255, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x40,0xda,0x01,0x02,0x00,0xff]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_add_rtn_u32 a255, v1, a2 offset:65535
+
+// GFX90A: ds_add_rtn_u32 a5, v255, a2 offset:65535 ; encoding: [0xff,0xff,0x40,0xda,0xff,0x02,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_add_rtn_u32 a5, v255, a2 offset:65535
+
+// GFX90A: ds_add_rtn_u32 a5, v1, a255 offset:65535 ; encoding: [0xff,0xff,0x40,0xda,0x01,0xff,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_add_rtn_u32 a5, v1, a255 offset:65535
+
+// GFX90A: ds_add_rtn_u32 a5, v1, a2       ; encoding: [0x00,0x00,0x40,0xda,0x01,0x02,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_add_rtn_u32 a5, v1, a2
+
+// GFX90A: ds_add_rtn_u32 a5, v1, a2       ; encoding: [0x00,0x00,0x40,0xda,0x01,0x02,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_add_rtn_u32 a5, v1, a2
+
+// GFX90A: ds_add_rtn_u32 a5, v1, a2 offset:4 ; encoding: [0x04,0x00,0x40,0xda,0x01,0x02,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_add_rtn_u32 a5, v1, a2 offset:4
+
+// GFX90A: ds_add_rtn_u32 a5, v1, a2 offset:65535 gds ; encoding: [0xff,0xff,0x41,0xda,0x01,0x02,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_add_rtn_u32 a5, v1, a2 offset:65535 gds
+
+// GFX90A: ds_sub_rtn_u32 a5, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x42,0xda,0x01,0x02,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_sub_rtn_u32 a5, v1, a2 offset:65535
+
+// GFX90A: ds_sub_rtn_u32 a255, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x42,0xda,0x01,0x02,0x00,0xff]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_sub_rtn_u32 a255, v1, a2 offset:65535
+
+// GFX90A: ds_sub_rtn_u32 a5, v255, a2 offset:65535 ; encoding: [0xff,0xff,0x42,0xda,0xff,0x02,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_sub_rtn_u32 a5, v255, a2 offset:65535
+
+// GFX90A: ds_sub_rtn_u32 a5, v1, a255 offset:65535 ; encoding: [0xff,0xff,0x42,0xda,0x01,0xff,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_sub_rtn_u32 a5, v1, a255 offset:65535
+
+// GFX90A: ds_sub_rtn_u32 a5, v1, a2       ; encoding: [0x00,0x00,0x42,0xda,0x01,0x02,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_sub_rtn_u32 a5, v1, a2
+
+// GFX90A: ds_sub_rtn_u32 a5, v1, a2       ; encoding: [0x00,0x00,0x42,0xda,0x01,0x02,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_sub_rtn_u32 a5, v1, a2
+
+// GFX90A: ds_sub_rtn_u32 a5, v1, a2 offset:4 ; encoding: [0x04,0x00,0x42,0xda,0x01,0x02,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_sub_rtn_u32 a5, v1, a2 offset:4
+
+// GFX90A: ds_sub_rtn_u32 a5, v1, a2 offset:65535 gds ; encoding: [0xff,0xff,0x43,0xda,0x01,0x02,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_sub_rtn_u32 a5, v1, a2 offset:65535 gds
+
+// GFX90A: ds_rsub_rtn_u32 a5, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x44,0xda,0x01,0x02,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_rsub_rtn_u32 a5, v1, a2 offset:65535
+
+// GFX90A: ds_rsub_rtn_u32 a255, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x44,0xda,0x01,0x02,0x00,0xff]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_rsub_rtn_u32 a255, v1, a2 offset:65535
+
+// GFX90A: ds_rsub_rtn_u32 a5, v255, a2 offset:65535 ; encoding: [0xff,0xff,0x44,0xda,0xff,0x02,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_rsub_rtn_u32 a5, v255, a2 offset:65535
+
+// GFX90A: ds_rsub_rtn_u32 a5, v1, a255 offset:65535 ; encoding: [0xff,0xff,0x44,0xda,0x01,0xff,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_rsub_rtn_u32 a5, v1, a255 offset:65535
+
+// GFX90A: ds_rsub_rtn_u32 a5, v1, a2      ; encoding: [0x00,0x00,0x44,0xda,0x01,0x02,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_rsub_rtn_u32 a5, v1, a2
+
+// GFX90A: ds_rsub_rtn_u32 a5, v1, a2      ; encoding: [0x00,0x00,0x44,0xda,0x01,0x02,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_rsub_rtn_u32 a5, v1, a2
+
+// GFX90A: ds_rsub_rtn_u32 a5, v1, a2 offset:4 ; encoding: [0x04,0x00,0x44,0xda,0x01,0x02,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_rsub_rtn_u32 a5, v1, a2 offset:4
+
+// GFX90A: ds_rsub_rtn_u32 a5, v1, a2 offset:65535 gds ; encoding: [0xff,0xff,0x45,0xda,0x01,0x02,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_rsub_rtn_u32 a5, v1, a2 offset:65535 gds
+
+// GFX90A: ds_inc_rtn_u32 a5, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x46,0xda,0x01,0x02,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_inc_rtn_u32 a5, v1, a2 offset:65535
+
+// GFX90A: ds_inc_rtn_u32 a255, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x46,0xda,0x01,0x02,0x00,0xff]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_inc_rtn_u32 a255, v1, a2 offset:65535
+
+// GFX90A: ds_inc_rtn_u32 a5, v255, a2 offset:65535 ; encoding: [0xff,0xff,0x46,0xda,0xff,0x02,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_inc_rtn_u32 a5, v255, a2 offset:65535
+
+// GFX90A: ds_inc_rtn_u32 a5, v1, a255 offset:65535 ; encoding: [0xff,0xff,0x46,0xda,0x01,0xff,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_inc_rtn_u32 a5, v1, a255 offset:65535
+
+// GFX90A: ds_inc_rtn_u32 a5, v1, a2       ; encoding: [0x00,0x00,0x46,0xda,0x01,0x02,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_inc_rtn_u32 a5, v1, a2
+
+// GFX90A: ds_inc_rtn_u32 a5, v1, a2       ; encoding: [0x00,0x00,0x46,0xda,0x01,0x02,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_inc_rtn_u32 a5, v1, a2
+
+// GFX90A: ds_inc_rtn_u32 a5, v1, a2 offset:4 ; encoding: [0x04,0x00,0x46,0xda,0x01,0x02,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_inc_rtn_u32 a5, v1, a2 offset:4
+
+// GFX90A: ds_inc_rtn_u32 a5, v1, a2 offset:65535 gds ; encoding: [0xff,0xff,0x47,0xda,0x01,0x02,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_inc_rtn_u32 a5, v1, a2 offset:65535 gds
+
+// GFX90A: ds_dec_rtn_u32 a5, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x48,0xda,0x01,0x02,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_dec_rtn_u32 a5, v1, a2 offset:65535
+
+// GFX90A: ds_dec_rtn_u32 a255, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x48,0xda,0x01,0x02,0x00,0xff]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_dec_rtn_u32 a255, v1, a2 offset:65535
+
+// GFX90A: ds_dec_rtn_u32 a5, v255, a2 offset:65535 ; encoding: [0xff,0xff,0x48,0xda,0xff,0x02,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_dec_rtn_u32 a5, v255, a2 offset:65535
+
+// GFX90A: ds_dec_rtn_u32 a5, v1, a255 offset:65535 ; encoding: [0xff,0xff,0x48,0xda,0x01,0xff,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_dec_rtn_u32 a5, v1, a255 offset:65535
+
+// GFX90A: ds_dec_rtn_u32 a5, v1, a2       ; encoding: [0x00,0x00,0x48,0xda,0x01,0x02,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_dec_rtn_u32 a5, v1, a2
+
+// GFX90A: ds_dec_rtn_u32 a5, v1, a2       ; encoding: [0x00,0x00,0x48,0xda,0x01,0x02,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_dec_rtn_u32 a5, v1, a2
+
+// GFX90A: ds_dec_rtn_u32 a5, v1, a2 offset:4 ; encoding: [0x04,0x00,0x48,0xda,0x01,0x02,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_dec_rtn_u32 a5, v1, a2 offset:4
+
+// GFX90A: ds_dec_rtn_u32 a5, v1, a2 offset:65535 gds ; encoding: [0xff,0xff,0x49,0xda,0x01,0x02,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_dec_rtn_u32 a5, v1, a2 offset:65535 gds
+
+// GFX90A: ds_min_rtn_i32 a5, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x4a,0xda,0x01,0x02,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_min_rtn_i32 a5, v1, a2 offset:65535
+
+// GFX90A: ds_min_rtn_i32 a255, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x4a,0xda,0x01,0x02,0x00,0xff]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_min_rtn_i32 a255, v1, a2 offset:65535
+
+// GFX90A: ds_min_rtn_i32 a5, v255, a2 offset:65535 ; encoding: [0xff,0xff,0x4a,0xda,0xff,0x02,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_min_rtn_i32 a5, v255, a2 offset:65535
+
+// GFX90A: ds_min_rtn_i32 a5, v1, a255 offset:65535 ; encoding: [0xff,0xff,0x4a,0xda,0x01,0xff,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_min_rtn_i32 a5, v1, a255 offset:65535
+
+// GFX90A: ds_min_rtn_i32 a5, v1, a2       ; encoding: [0x00,0x00,0x4a,0xda,0x01,0x02,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_min_rtn_i32 a5, v1, a2
+
+// GFX90A: ds_min_rtn_i32 a5, v1, a2       ; encoding: [0x00,0x00,0x4a,0xda,0x01,0x02,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_min_rtn_i32 a5, v1, a2
+
+// GFX90A: ds_min_rtn_i32 a5, v1, a2 offset:4 ; encoding: [0x04,0x00,0x4a,0xda,0x01,0x02,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_min_rtn_i32 a5, v1, a2 offset:4
+
+// GFX90A: ds_min_rtn_i32 a5, v1, a2 offset:65535 gds ; encoding: [0xff,0xff,0x4b,0xda,0x01,0x02,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_min_rtn_i32 a5, v1, a2 offset:65535 gds
+
+// GFX90A: ds_max_rtn_i32 a5, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x4c,0xda,0x01,0x02,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_max_rtn_i32 a5, v1, a2 offset:65535
+
+// GFX90A: ds_max_rtn_i32 a255, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x4c,0xda,0x01,0x02,0x00,0xff]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_max_rtn_i32 a255, v1, a2 offset:65535
+
+// GFX90A: ds_max_rtn_i32 a5, v255, a2 offset:65535 ; encoding: [0xff,0xff,0x4c,0xda,0xff,0x02,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_max_rtn_i32 a5, v255, a2 offset:65535
+
+// GFX90A: ds_max_rtn_i32 a5, v1, a255 offset:65535 ; encoding: [0xff,0xff,0x4c,0xda,0x01,0xff,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_max_rtn_i32 a5, v1, a255 offset:65535
+
+// GFX90A: ds_max_rtn_i32 a5, v1, a2       ; encoding: [0x00,0x00,0x4c,0xda,0x01,0x02,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_max_rtn_i32 a5, v1, a2
+
+// GFX90A: ds_max_rtn_i32 a5, v1, a2       ; encoding: [0x00,0x00,0x4c,0xda,0x01,0x02,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_max_rtn_i32 a5, v1, a2
+
+// GFX90A: ds_max_rtn_i32 a5, v1, a2 offset:4 ; encoding: [0x04,0x00,0x4c,0xda,0x01,0x02,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_max_rtn_i32 a5, v1, a2 offset:4
+
+// GFX90A: ds_max_rtn_i32 a5, v1, a2 offset:65535 gds ; encoding: [0xff,0xff,0x4d,0xda,0x01,0x02,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_max_rtn_i32 a5, v1, a2 offset:65535 gds
+
+// GFX90A: ds_min_rtn_u32 a5, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x4e,0xda,0x01,0x02,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_min_rtn_u32 a5, v1, a2 offset:65535
+
+// GFX90A: ds_min_rtn_u32 a255, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x4e,0xda,0x01,0x02,0x00,0xff]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_min_rtn_u32 a255, v1, a2 offset:65535
+
+// GFX90A: ds_min_rtn_u32 a5, v255, a2 offset:65535 ; encoding: [0xff,0xff,0x4e,0xda,0xff,0x02,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_min_rtn_u32 a5, v255, a2 offset:65535
+
+// GFX90A: ds_min_rtn_u32 a5, v1, a255 offset:65535 ; encoding: [0xff,0xff,0x4e,0xda,0x01,0xff,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_min_rtn_u32 a5, v1, a255 offset:65535
+
+// GFX90A: ds_min_rtn_u32 a5, v1, a2       ; encoding: [0x00,0x00,0x4e,0xda,0x01,0x02,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_min_rtn_u32 a5, v1, a2
+
+// GFX90A: ds_min_rtn_u32 a5, v1, a2       ; encoding: [0x00,0x00,0x4e,0xda,0x01,0x02,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_min_rtn_u32 a5, v1, a2
+
+// GFX90A: ds_min_rtn_u32 a5, v1, a2 offset:4 ; encoding: [0x04,0x00,0x4e,0xda,0x01,0x02,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_min_rtn_u32 a5, v1, a2 offset:4
+
+// GFX90A: ds_min_rtn_u32 a5, v1, a2 offset:65535 gds ; encoding: [0xff,0xff,0x4f,0xda,0x01,0x02,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_min_rtn_u32 a5, v1, a2 offset:65535 gds
+
+// GFX90A: ds_max_rtn_u32 a5, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x50,0xda,0x01,0x02,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_max_rtn_u32 a5, v1, a2 offset:65535
+
+// GFX90A: ds_max_rtn_u32 a255, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x50,0xda,0x01,0x02,0x00,0xff]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_max_rtn_u32 a255, v1, a2 offset:65535
+
+// GFX90A: ds_max_rtn_u32 a5, v255, a2 offset:65535 ; encoding: [0xff,0xff,0x50,0xda,0xff,0x02,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_max_rtn_u32 a5, v255, a2 offset:65535
+
+// GFX90A: ds_max_rtn_u32 a5, v1, a255 offset:65535 ; encoding: [0xff,0xff,0x50,0xda,0x01,0xff,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_max_rtn_u32 a5, v1, a255 offset:65535
+
+// GFX90A: ds_max_rtn_u32 a5, v1, a2       ; encoding: [0x00,0x00,0x50,0xda,0x01,0x02,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_max_rtn_u32 a5, v1, a2
+
+// GFX90A: ds_max_rtn_u32 a5, v1, a2       ; encoding: [0x00,0x00,0x50,0xda,0x01,0x02,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_max_rtn_u32 a5, v1, a2
+
+// GFX90A: ds_max_rtn_u32 a5, v1, a2 offset:4 ; encoding: [0x04,0x00,0x50,0xda,0x01,0x02,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_max_rtn_u32 a5, v1, a2 offset:4
+
+// GFX90A: ds_max_rtn_u32 a5, v1, a2 offset:65535 gds ; encoding: [0xff,0xff,0x51,0xda,0x01,0x02,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_max_rtn_u32 a5, v1, a2 offset:65535 gds
+
+// GFX90A: ds_and_rtn_b32 a5, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x52,0xda,0x01,0x02,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_and_rtn_b32 a5, v1, a2 offset:65535
+
+// GFX90A: ds_and_rtn_b32 a255, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x52,0xda,0x01,0x02,0x00,0xff]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_and_rtn_b32 a255, v1, a2 offset:65535
+
+// GFX90A: ds_and_rtn_b32 a5, v255, a2 offset:65535 ; encoding: [0xff,0xff,0x52,0xda,0xff,0x02,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_and_rtn_b32 a5, v255, a2 offset:65535
+
+// GFX90A: ds_and_rtn_b32 a5, v1, a255 offset:65535 ; encoding: [0xff,0xff,0x52,0xda,0x01,0xff,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_and_rtn_b32 a5, v1, a255 offset:65535
+
+// GFX90A: ds_and_rtn_b32 a5, v1, a2       ; encoding: [0x00,0x00,0x52,0xda,0x01,0x02,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_and_rtn_b32 a5, v1, a2
+
+// GFX90A: ds_and_rtn_b32 a5, v1, a2       ; encoding: [0x00,0x00,0x52,0xda,0x01,0x02,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_and_rtn_b32 a5, v1, a2
+
+// GFX90A: ds_and_rtn_b32 a5, v1, a2 offset:4 ; encoding: [0x04,0x00,0x52,0xda,0x01,0x02,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_and_rtn_b32 a5, v1, a2 offset:4
+
+// GFX90A: ds_and_rtn_b32 a5, v1, a2 offset:65535 gds ; encoding: [0xff,0xff,0x53,0xda,0x01,0x02,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_and_rtn_b32 a5, v1, a2 offset:65535 gds
+
+// GFX90A: ds_or_rtn_b32 a5, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x54,0xda,0x01,0x02,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_or_rtn_b32 a5, v1, a2 offset:65535
+
+// GFX90A: ds_or_rtn_b32 a255, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x54,0xda,0x01,0x02,0x00,0xff]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_or_rtn_b32 a255, v1, a2 offset:65535
+
+// GFX90A: ds_or_rtn_b32 a5, v255, a2 offset:65535 ; encoding: [0xff,0xff,0x54,0xda,0xff,0x02,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_or_rtn_b32 a5, v255, a2 offset:65535
+
+// GFX90A: ds_or_rtn_b32 a5, v1, a255 offset:65535 ; encoding: [0xff,0xff,0x54,0xda,0x01,0xff,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_or_rtn_b32 a5, v1, a255 offset:65535
+
+// GFX90A: ds_or_rtn_b32 a5, v1, a2        ; encoding: [0x00,0x00,0x54,0xda,0x01,0x02,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_or_rtn_b32 a5, v1, a2
+
+// GFX90A: ds_or_rtn_b32 a5, v1, a2        ; encoding: [0x00,0x00,0x54,0xda,0x01,0x02,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_or_rtn_b32 a5, v1, a2
+
+// GFX90A: ds_or_rtn_b32 a5, v1, a2 offset:4 ; encoding: [0x04,0x00,0x54,0xda,0x01,0x02,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_or_rtn_b32 a5, v1, a2 offset:4
+
+// GFX90A: ds_or_rtn_b32 a5, v1, a2 offset:65535 gds ; encoding: [0xff,0xff,0x55,0xda,0x01,0x02,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_or_rtn_b32 a5, v1, a2 offset:65535 gds
+
+// GFX90A: ds_xor_rtn_b32 a5, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x56,0xda,0x01,0x02,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_xor_rtn_b32 a5, v1, a2 offset:65535
+
+// GFX90A: ds_xor_rtn_b32 a255, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x56,0xda,0x01,0x02,0x00,0xff]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_xor_rtn_b32 a255, v1, a2 offset:65535
+
+// GFX90A: ds_xor_rtn_b32 a5, v255, a2 offset:65535 ; encoding: [0xff,0xff,0x56,0xda,0xff,0x02,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_xor_rtn_b32 a5, v255, a2 offset:65535
+
+// GFX90A: ds_xor_rtn_b32 a5, v1, a255 offset:65535 ; encoding: [0xff,0xff,0x56,0xda,0x01,0xff,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_xor_rtn_b32 a5, v1, a255 offset:65535
+
+// GFX90A: ds_xor_rtn_b32 a5, v1, a2       ; encoding: [0x00,0x00,0x56,0xda,0x01,0x02,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_xor_rtn_b32 a5, v1, a2
+
+// GFX90A: ds_xor_rtn_b32 a5, v1, a2       ; encoding: [0x00,0x00,0x56,0xda,0x01,0x02,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_xor_rtn_b32 a5, v1, a2
+
+// GFX90A: ds_xor_rtn_b32 a5, v1, a2 offset:4 ; encoding: [0x04,0x00,0x56,0xda,0x01,0x02,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_xor_rtn_b32 a5, v1, a2 offset:4
+
+// GFX90A: ds_xor_rtn_b32 a5, v1, a2 offset:65535 gds ; encoding: [0xff,0xff,0x57,0xda,0x01,0x02,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_xor_rtn_b32 a5, v1, a2 offset:65535 gds
+
+// GFX90A: ds_mskor_rtn_b32 a5, v1, a2, a5 offset:65535 ; encoding: [0xff,0xff,0x58,0xda,0x01,0x02,0x05,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_mskor_rtn_b32 a5, v1, a2, a5 offset:65535
+
+// GFX90A: ds_mskor_rtn_b32 a255, v1, a2, a5 offset:65535 ; encoding: [0xff,0xff,0x58,0xda,0x01,0x02,0x05,0xff]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_mskor_rtn_b32 a255, v1, a2, a5 offset:65535
+
+// GFX90A: ds_mskor_rtn_b32 a5, v255, a2, a5 offset:65535 ; encoding: [0xff,0xff,0x58,0xda,0xff,0x02,0x05,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_mskor_rtn_b32 a5, v255, a2, a5 offset:65535
+
+// GFX90A: ds_mskor_rtn_b32 a5, v1, a255, a3 offset:65535 ; encoding: [0xff,0xff,0x58,0xda,0x01,0xff,0x03,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_mskor_rtn_b32 a5, v1, a255, a3 offset:65535
+
+// GFX90A: ds_mskor_rtn_b32 a5, v1, a2, a5 offset:65535 ; encoding: [0xff,0xff,0x58,0xda,0x01,0x02,0x05,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_mskor_rtn_b32 a5, v1, a2, a5 offset:65535
+
+// GFX90A: ds_mskor_rtn_b32 a5, v1, a2, a5 ; encoding: [0x00,0x00,0x58,0xda,0x01,0x02,0x05,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_mskor_rtn_b32 a5, v1, a2, a5
+
+// GFX90A: ds_mskor_rtn_b32 a5, v1, a2, a5 ; encoding: [0x00,0x00,0x58,0xda,0x01,0x02,0x05,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_mskor_rtn_b32 a5, v1, a2, a5
+
+// GFX90A: ds_mskor_rtn_b32 a5, v1, a2, a5 offset:4 ; encoding: [0x04,0x00,0x58,0xda,0x01,0x02,0x05,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_mskor_rtn_b32 a5, v1, a2, a5 offset:4
+
+// GFX90A: ds_mskor_rtn_b32 a5, v1, a2, a5 offset:65535 gds ; encoding: [0xff,0xff,0x59,0xda,0x01,0x02,0x05,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_mskor_rtn_b32 a5, v1, a2, a5 offset:65535 gds
+
+// GFX90A: ds_wrxchg_rtn_b32 a5, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x5a,0xda,0x01,0x02,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_wrxchg_rtn_b32 a5, v1, a2 offset:65535
+
+// GFX90A: ds_wrxchg_rtn_b32 a255, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x5a,0xda,0x01,0x02,0x00,0xff]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_wrxchg_rtn_b32 a255, v1, a2 offset:65535
+
+// GFX90A: ds_wrxchg_rtn_b32 a5, v255, a2 offset:65535 ; encoding: [0xff,0xff,0x5a,0xda,0xff,0x02,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_wrxchg_rtn_b32 a5, v255, a2 offset:65535
+
+// GFX90A: ds_wrxchg_rtn_b32 a5, v1, a255 offset:65535 ; encoding: [0xff,0xff,0x5a,0xda,0x01,0xff,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_wrxchg_rtn_b32 a5, v1, a255 offset:65535
+
+// GFX90A: ds_wrxchg_rtn_b32 a5, v1, a2    ; encoding: [0x00,0x00,0x5a,0xda,0x01,0x02,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_wrxchg_rtn_b32 a5, v1, a2
+
+// GFX90A: ds_wrxchg_rtn_b32 a5, v1, a2    ; encoding: [0x00,0x00,0x5a,0xda,0x01,0x02,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_wrxchg_rtn_b32 a5, v1, a2
+
+// GFX90A: ds_wrxchg_rtn_b32 a5, v1, a2 offset:4 ; encoding: [0x04,0x00,0x5a,0xda,0x01,0x02,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_wrxchg_rtn_b32 a5, v1, a2 offset:4
+
+// GFX90A: ds_wrxchg_rtn_b32 a5, v1, a2 offset:65535 gds ; encoding: [0xff,0xff,0x5b,0xda,0x01,0x02,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_wrxchg_rtn_b32 a5, v1, a2 offset:65535 gds
+
+// GFX90A: ds_wrxchg2_rtn_b32 a[6:7], v1, a2, a3 offset0:127 offset1:255 ; encoding: [0x7f,0xff,0x5c,0xda,0x01,0x02,0x03,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_wrxchg2_rtn_b32 a[6:7], v1, a2, a3 offset0:127 offset1:255
+
+// GFX90A: ds_wrxchg2_rtn_b32 a[254:255], v1, a2, a3 offset0:127 offset1:255 ; encoding: [0x7f,0xff,0x5c,0xda,0x01,0x02,0x03,0xfe]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_wrxchg2_rtn_b32 a[254:255], v1, a2, a3 offset0:127 offset1:255
+
+// GFX90A: ds_wrxchg2_rtn_b32 a[6:7], v255, a2, a3 offset0:127 offset1:255 ; encoding: [0x7f,0xff,0x5c,0xda,0xff,0x02,0x03,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_wrxchg2_rtn_b32 a[6:7], v255, a2, a3 offset0:127 offset1:255
+
+// GFX90A: ds_wrxchg2_rtn_b32 a[6:7], v1, a255, a3 offset0:127 offset1:255 ; encoding: [0x7f,0xff,0x5c,0xda,0x01,0xff,0x03,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_wrxchg2_rtn_b32 a[6:7], v1, a255, a3 offset0:127 offset1:255
+
+// GFX90A: ds_wrxchg2_rtn_b32 a[6:7], v1, a2, a255 offset0:127 offset1:255 ; encoding: [0x7f,0xff,0x5c,0xda,0x01,0x02,0xff,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_wrxchg2_rtn_b32 a[6:7], v1, a2, a255 offset0:127 offset1:255
+
+// GFX90A: ds_wrxchg2_rtn_b32 a[6:7], v1, a2, a3 offset1:255 ; encoding: [0x00,0xff,0x5c,0xda,0x01,0x02,0x03,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_wrxchg2_rtn_b32 a[6:7], v1, a2, a3 offset1:255
+
+// GFX90A: ds_wrxchg2_rtn_b32 a[6:7], v1, a2, a3 offset1:255 ; encoding: [0x00,0xff,0x5c,0xda,0x01,0x02,0x03,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_wrxchg2_rtn_b32 a[6:7], v1, a2, a3 offset1:255
+
+// GFX90A: ds_wrxchg2_rtn_b32 a[6:7], v1, a2, a3 offset0:16 offset1:255 ; encoding: [0x10,0xff,0x5c,0xda,0x01,0x02,0x03,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_wrxchg2_rtn_b32 a[6:7], v1, a2, a3 offset0:16 offset1:255
+
+// GFX90A: ds_wrxchg2_rtn_b32 a[6:7], v1, a2, a3 offset0:127 ; encoding: [0x7f,0x00,0x5c,0xda,0x01,0x02,0x03,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_wrxchg2_rtn_b32 a[6:7], v1, a2, a3 offset0:127
+
+// GFX90A: ds_wrxchg2_rtn_b32 a[6:7], v1, a2, a3 offset0:127 ; encoding: [0x7f,0x00,0x5c,0xda,0x01,0x02,0x03,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_wrxchg2_rtn_b32 a[6:7], v1, a2, a3 offset0:127
+
+// GFX90A: ds_wrxchg2_rtn_b32 a[6:7], v1, a2, a3 offset0:127 offset1:1 ; encoding: [0x7f,0x01,0x5c,0xda,0x01,0x02,0x03,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_wrxchg2_rtn_b32 a[6:7], v1, a2, a3 offset0:127 offset1:1
+
+// GFX90A: ds_wrxchg2_rtn_b32 a[6:7], v1, a2, a3 offset0:127 offset1:255 gds ; encoding: [0x7f,0xff,0x5d,0xda,0x01,0x02,0x03,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_wrxchg2_rtn_b32 a[6:7], v1, a2, a3 offset0:127 offset1:255 gds
+
+// GFX90A: ds_wrxchg2st64_rtn_b32 a[6:7], v1, a2, a3 offset0:127 offset1:255 ; encoding: [0x7f,0xff,0x5e,0xda,0x01,0x02,0x03,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_wrxchg2st64_rtn_b32 a[6:7], v1, a2, a3 offset0:127 offset1:255
+
+// GFX90A: ds_wrxchg2st64_rtn_b32 a[254:255], v1, a2, a3 offset0:127 offset1:255 ; encoding: [0x7f,0xff,0x5e,0xda,0x01,0x02,0x03,0xfe]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_wrxchg2st64_rtn_b32 a[254:255], v1, a2, a3 offset0:127 offset1:255
+
+// GFX90A: ds_wrxchg2st64_rtn_b32 a[6:7], v255, a2, a3 offset0:127 offset1:255 ; encoding: [0x7f,0xff,0x5e,0xda,0xff,0x02,0x03,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_wrxchg2st64_rtn_b32 a[6:7], v255, a2, a3 offset0:127 offset1:255
+
+// GFX90A: ds_wrxchg2st64_rtn_b32 a[6:7], v1, a255, a3 offset0:127 offset1:255 ; encoding: [0x7f,0xff,0x5e,0xda,0x01,0xff,0x03,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_wrxchg2st64_rtn_b32 a[6:7], v1, a255, a3 offset0:127 offset1:255
+
+// GFX90A: ds_wrxchg2st64_rtn_b32 a[6:7], v1, a2, a255 offset0:127 offset1:255 ; encoding: [0x7f,0xff,0x5e,0xda,0x01,0x02,0xff,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_wrxchg2st64_rtn_b32 a[6:7], v1, a2, a255 offset0:127 offset1:255
+
+// GFX90A: ds_wrxchg2st64_rtn_b32 a[6:7], v1, a2, a3 offset1:255 ; encoding: [0x00,0xff,0x5e,0xda,0x01,0x02,0x03,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_wrxchg2st64_rtn_b32 a[6:7], v1, a2, a3 offset1:255
+
+// GFX90A: ds_wrxchg2st64_rtn_b32 a[6:7], v1, a2, a3 offset1:255 ; encoding: [0x00,0xff,0x5e,0xda,0x01,0x02,0x03,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_wrxchg2st64_rtn_b32 a[6:7], v1, a2, a3 offset1:255
+
+// GFX90A: ds_wrxchg2st64_rtn_b32 a[6:7], v1, a2, a3 offset0:16 offset1:255 ; encoding: [0x10,0xff,0x5e,0xda,0x01,0x02,0x03,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_wrxchg2st64_rtn_b32 a[6:7], v1, a2, a3 offset0:16 offset1:255
+
+// GFX90A: ds_wrxchg2st64_rtn_b32 a[6:7], v1, a2, a3 offset0:127 ; encoding: [0x7f,0x00,0x5e,0xda,0x01,0x02,0x03,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_wrxchg2st64_rtn_b32 a[6:7], v1, a2, a3 offset0:127
+
+// GFX90A: ds_wrxchg2st64_rtn_b32 a[6:7], v1, a2, a3 offset0:127 ; encoding: [0x7f,0x00,0x5e,0xda,0x01,0x02,0x03,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_wrxchg2st64_rtn_b32 a[6:7], v1, a2, a3 offset0:127
+
+// GFX90A: ds_wrxchg2st64_rtn_b32 a[6:7], v1, a2, a3 offset0:127 offset1:1 ; encoding: [0x7f,0x01,0x5e,0xda,0x01,0x02,0x03,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_wrxchg2st64_rtn_b32 a[6:7], v1, a2, a3 offset0:127 offset1:1
+
+// GFX90A: ds_wrxchg2st64_rtn_b32 a[6:7], v1, a2, a3 offset0:127 offset1:255 gds ; encoding: [0x7f,0xff,0x5f,0xda,0x01,0x02,0x03,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_wrxchg2st64_rtn_b32 a[6:7], v1, a2, a3 offset0:127 offset1:255 gds
+
+// GFX90A: ds_cmpst_rtn_b32 a5, v1, a2, a3 offset:65535 ; encoding: [0xff,0xff,0x60,0xda,0x01,0x02,0x03,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_cmpst_rtn_b32 a5, v1, a2, a3 offset:65535
+
+// GFX90A: ds_cmpst_rtn_b32 a255, v1, a2, a3 offset:65535 ; encoding: [0xff,0xff,0x60,0xda,0x01,0x02,0x03,0xff]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_cmpst_rtn_b32 a255, v1, a2, a3 offset:65535
+
+// GFX90A: ds_cmpst_rtn_b32 a5, v255, a2, a3 offset:65535 ; encoding: [0xff,0xff,0x60,0xda,0xff,0x02,0x03,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_cmpst_rtn_b32 a5, v255, a2, a3 offset:65535
+
+// GFX90A: ds_cmpst_rtn_b32 a5, v1, a255, a3 offset:65535 ; encoding: [0xff,0xff,0x60,0xda,0x01,0xff,0x03,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_cmpst_rtn_b32 a5, v1, a255, a3 offset:65535
+
+// GFX90A: ds_cmpst_rtn_b32 a5, v1, a2, a255 offset:65535 ; encoding: [0xff,0xff,0x60,0xda,0x01,0x02,0xff,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_cmpst_rtn_b32 a5, v1, a2, a255 offset:65535
+
+// GFX90A: ds_cmpst_rtn_b32 a5, v1, a2, a3 ; encoding: [0x00,0x00,0x60,0xda,0x01,0x02,0x03,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_cmpst_rtn_b32 a5, v1, a2, a3
+
+// GFX90A: ds_cmpst_rtn_b32 a5, v1, a2, a3 ; encoding: [0x00,0x00,0x60,0xda,0x01,0x02,0x03,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_cmpst_rtn_b32 a5, v1, a2, a3
+
+// GFX90A: ds_cmpst_rtn_b32 a5, v1, a2, a3 offset:4 ; encoding: [0x04,0x00,0x60,0xda,0x01,0x02,0x03,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_cmpst_rtn_b32 a5, v1, a2, a3 offset:4
+
+// GFX90A: ds_cmpst_rtn_b32 a5, v1, a2, a3 offset:65535 gds ; encoding: [0xff,0xff,0x61,0xda,0x01,0x02,0x03,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_cmpst_rtn_b32 a5, v1, a2, a3 offset:65535 gds
+
+// GFX90A: ds_cmpst_rtn_f32 a5, v1, a2, a3 offset:65535 ; encoding: [0xff,0xff,0x62,0xda,0x01,0x02,0x03,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_cmpst_rtn_f32 a5, v1, a2, a3 offset:65535
+
+// GFX90A: ds_cmpst_rtn_f32 a255, v1, a2, a3 offset:65535 ; encoding: [0xff,0xff,0x62,0xda,0x01,0x02,0x03,0xff]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_cmpst_rtn_f32 a255, v1, a2, a3 offset:65535
+
+// GFX90A: ds_cmpst_rtn_f32 a5, v255, a2, a3 offset:65535 ; encoding: [0xff,0xff,0x62,0xda,0xff,0x02,0x03,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_cmpst_rtn_f32 a5, v255, a2, a3 offset:65535
+
+// GFX90A: ds_cmpst_rtn_f32 a5, v1, a255, a3 offset:65535 ; encoding: [0xff,0xff,0x62,0xda,0x01,0xff,0x03,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_cmpst_rtn_f32 a5, v1, a255, a3 offset:65535
+
+// GFX90A: ds_cmpst_rtn_f32 a5, v1, a2, a255 offset:65535 ; encoding: [0xff,0xff,0x62,0xda,0x01,0x02,0xff,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_cmpst_rtn_f32 a5, v1, a2, a255 offset:65535
+
+// GFX90A: ds_cmpst_rtn_f32 a5, v1, a2, a3 ; encoding: [0x00,0x00,0x62,0xda,0x01,0x02,0x03,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_cmpst_rtn_f32 a5, v1, a2, a3
+
+// GFX90A: ds_cmpst_rtn_f32 a5, v1, a2, a3 ; encoding: [0x00,0x00,0x62,0xda,0x01,0x02,0x03,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_cmpst_rtn_f32 a5, v1, a2, a3
+
+// GFX90A: ds_cmpst_rtn_f32 a5, v1, a2, a3 offset:4 ; encoding: [0x04,0x00,0x62,0xda,0x01,0x02,0x03,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_cmpst_rtn_f32 a5, v1, a2, a3 offset:4
+
+// GFX90A: ds_cmpst_rtn_f32 a5, v1, a2, a3 offset:65535 gds ; encoding: [0xff,0xff,0x63,0xda,0x01,0x02,0x03,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_cmpst_rtn_f32 a5, v1, a2, a3 offset:65535 gds
+
+// GFX90A: ds_min_rtn_f32 a5, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x64,0xda,0x01,0x02,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_min_rtn_f32 a5, v1, a2 offset:65535
+
+// GFX90A: ds_min_rtn_f32 a255, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x64,0xda,0x01,0x02,0x00,0xff]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_min_rtn_f32 a255, v1, a2 offset:65535
+
+// GFX90A: ds_min_rtn_f32 a5, v255, a2 offset:65535 ; encoding: [0xff,0xff,0x64,0xda,0xff,0x02,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_min_rtn_f32 a5, v255, a2 offset:65535
+
+// GFX90A: ds_min_rtn_f32 a5, v1, a255 offset:65535 ; encoding: [0xff,0xff,0x64,0xda,0x01,0xff,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_min_rtn_f32 a5, v1, a255 offset:65535
+
+// GFX90A: ds_min_rtn_f32 a5, v1, a2       ; encoding: [0x00,0x00,0x64,0xda,0x01,0x02,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_min_rtn_f32 a5, v1, a2
+
+// GFX90A: ds_min_rtn_f32 a5, v1, a2       ; encoding: [0x00,0x00,0x64,0xda,0x01,0x02,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_min_rtn_f32 a5, v1, a2
+
+// GFX90A: ds_min_rtn_f32 a5, v1, a2 offset:4 ; encoding: [0x04,0x00,0x64,0xda,0x01,0x02,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_min_rtn_f32 a5, v1, a2 offset:4
+
+// GFX90A: ds_min_rtn_f32 a5, v1, a2 offset:65535 gds ; encoding: [0xff,0xff,0x65,0xda,0x01,0x02,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_min_rtn_f32 a5, v1, a2 offset:65535 gds
+
+// GFX90A: ds_max_rtn_f32 a5, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x66,0xda,0x01,0x02,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_max_rtn_f32 a5, v1, a2 offset:65535
+
+// GFX90A: ds_max_rtn_f32 a255, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x66,0xda,0x01,0x02,0x00,0xff]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_max_rtn_f32 a255, v1, a2 offset:65535
+
+// GFX90A: ds_max_rtn_f32 a5, v255, a2 offset:65535 ; encoding: [0xff,0xff,0x66,0xda,0xff,0x02,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_max_rtn_f32 a5, v255, a2 offset:65535
+
+// GFX90A: ds_max_rtn_f32 a5, v1, a255 offset:65535 ; encoding: [0xff,0xff,0x66,0xda,0x01,0xff,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_max_rtn_f32 a5, v1, a255 offset:65535
+
+// GFX90A: ds_max_rtn_f32 a5, v1, a2       ; encoding: [0x00,0x00,0x66,0xda,0x01,0x02,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_max_rtn_f32 a5, v1, a2
+
+// GFX90A: ds_max_rtn_f32 a5, v1, a2       ; encoding: [0x00,0x00,0x66,0xda,0x01,0x02,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_max_rtn_f32 a5, v1, a2
+
+// GFX90A: ds_max_rtn_f32 a5, v1, a2 offset:4 ; encoding: [0x04,0x00,0x66,0xda,0x01,0x02,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_max_rtn_f32 a5, v1, a2 offset:4
+
+// GFX90A: ds_max_rtn_f32 a5, v1, a2 offset:65535 gds ; encoding: [0xff,0xff,0x67,0xda,0x01,0x02,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_max_rtn_f32 a5, v1, a2 offset:65535 gds
+
+// GFX90A: ds_wrap_rtn_b32 a5, v1, a2, a5 offset:65535 ; encoding: [0xff,0xff,0x68,0xda,0x01,0x02,0x05,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_wrap_rtn_b32 a5, v1, a2, a5 offset:65535
+
+// GFX90A: ds_wrap_rtn_b32 a255, v1, a2, a5 offset:65535 ; encoding: [0xff,0xff,0x68,0xda,0x01,0x02,0x05,0xff]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_wrap_rtn_b32 a255, v1, a2, a5 offset:65535
+
+// GFX90A: ds_wrap_rtn_b32 a5, v255, a2, a5 offset:65535 ; encoding: [0xff,0xff,0x68,0xda,0xff,0x02,0x05,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_wrap_rtn_b32 a5, v255, a2, a5 offset:65535
+
+// GFX90A: ds_wrap_rtn_b32 a5, v1, a255, a3 offset:65535 ; encoding: [0xff,0xff,0x68,0xda,0x01,0xff,0x03,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_wrap_rtn_b32 a5, v1, a255, a3 offset:65535
+
+// GFX90A: ds_wrap_rtn_b32 a5, v1, a2, a5 offset:65535 ; encoding: [0xff,0xff,0x68,0xda,0x01,0x02,0x05,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_wrap_rtn_b32 a5, v1, a2, a5 offset:65535
+
+// GFX90A: ds_wrap_rtn_b32 a5, v1, a2, a5  ; encoding: [0x00,0x00,0x68,0xda,0x01,0x02,0x05,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_wrap_rtn_b32 a5, v1, a2, a5
+
+// GFX90A: ds_wrap_rtn_b32 a5, v1, a2, a5  ; encoding: [0x00,0x00,0x68,0xda,0x01,0x02,0x05,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_wrap_rtn_b32 a5, v1, a2, a5
+
+// GFX90A: ds_wrap_rtn_b32 a5, v1, a2, a5 offset:4 ; encoding: [0x04,0x00,0x68,0xda,0x01,0x02,0x05,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_wrap_rtn_b32 a5, v1, a2, a5 offset:4
+
+// GFX90A: ds_wrap_rtn_b32 a5, v1, a2, a5 offset:65535 gds ; encoding: [0xff,0xff,0x69,0xda,0x01,0x02,0x05,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_wrap_rtn_b32 a5, v1, a2, a5 offset:65535 gds
+
+// GFX90A: ds_add_rtn_f32 a5, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x6a,0xda,0x01,0x02,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_add_rtn_f32 a5, v1, a2 offset:65535
+
+// GFX90A: ds_add_rtn_f32 a255, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x6a,0xda,0x01,0x02,0x00,0xff]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_add_rtn_f32 a255, v1, a2 offset:65535
+
+// GFX90A: ds_add_rtn_f32 a5, v255, a2 offset:65535 ; encoding: [0xff,0xff,0x6a,0xda,0xff,0x02,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_add_rtn_f32 a5, v255, a2 offset:65535
+
+// GFX90A: ds_add_rtn_f32 a5, v1, a255 offset:65535 ; encoding: [0xff,0xff,0x6a,0xda,0x01,0xff,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_add_rtn_f32 a5, v1, a255 offset:65535
+
+// GFX90A: ds_add_rtn_f32 a5, v1, a2       ; encoding: [0x00,0x00,0x6a,0xda,0x01,0x02,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_add_rtn_f32 a5, v1, a2
+
+// GFX90A: ds_add_rtn_f32 a5, v1, a2       ; encoding: [0x00,0x00,0x6a,0xda,0x01,0x02,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_add_rtn_f32 a5, v1, a2
+
+// GFX90A: ds_add_rtn_f32 a5, v1, a2 offset:4 ; encoding: [0x04,0x00,0x6a,0xda,0x01,0x02,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_add_rtn_f32 a5, v1, a2 offset:4
+
+// GFX90A: ds_add_rtn_f32 a5, v1, a2 offset:65535 gds ; encoding: [0xff,0xff,0x6b,0xda,0x01,0x02,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_add_rtn_f32 a5, v1, a2 offset:65535 gds
+
+// GFX90A: ds_read_b32 a5, v1 offset:65535 ; encoding: [0xff,0xff,0x6c,0xda,0x01,0x00,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_read_b32 a5, v1 offset:65535
+
+// GFX90A: ds_read_b32 a255, v1 offset:65535 ; encoding: [0xff,0xff,0x6c,0xda,0x01,0x00,0x00,0xff]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_read_b32 a255, v1 offset:65535
+
+// GFX90A: ds_read_b32 a5, v255 offset:65535 ; encoding: [0xff,0xff,0x6c,0xda,0xff,0x00,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_read_b32 a5, v255 offset:65535
+
+// GFX90A: ds_read_b32 a5, v1              ; encoding: [0x00,0x00,0x6c,0xda,0x01,0x00,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_read_b32 a5, v1
+
+// GFX90A: ds_read_b32 a5, v1              ; encoding: [0x00,0x00,0x6c,0xda,0x01,0x00,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_read_b32 a5, v1
+
+// GFX90A: ds_read_b32 a5, v1 offset:4     ; encoding: [0x04,0x00,0x6c,0xda,0x01,0x00,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_read_b32 a5, v1 offset:4
+
+// GFX90A: ds_read_b32 a5, v1 offset:65535 gds ; encoding: [0xff,0xff,0x6d,0xda,0x01,0x00,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_read_b32 a5, v1 offset:65535 gds
+
+// GFX90A: ds_read2_b32 a[6:7], v1 offset0:127 offset1:255 ; encoding: [0x7f,0xff,0x6e,0xda,0x01,0x00,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_read2_b32 a[6:7], v1 offset0:127 offset1:255
+
+// GFX90A: ds_read2_b32 a[254:255], v1 offset0:127 offset1:255 ; encoding: [0x7f,0xff,0x6e,0xda,0x01,0x00,0x00,0xfe]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_read2_b32 a[254:255], v1 offset0:127 offset1:255
+
+// GFX90A: ds_read2_b32 a[6:7], v255 offset0:127 offset1:255 ; encoding: [0x7f,0xff,0x6e,0xda,0xff,0x00,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_read2_b32 a[6:7], v255 offset0:127 offset1:255
+
+// GFX90A: ds_read2_b32 a[6:7], v1 offset1:255 ; encoding: [0x00,0xff,0x6e,0xda,0x01,0x00,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_read2_b32 a[6:7], v1 offset1:255
+
+// GFX90A: ds_read2_b32 a[6:7], v1 offset1:255 ; encoding: [0x00,0xff,0x6e,0xda,0x01,0x00,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_read2_b32 a[6:7], v1 offset1:255
+
+// GFX90A: ds_read2_b32 a[6:7], v1 offset0:16 offset1:255 ; encoding: [0x10,0xff,0x6e,0xda,0x01,0x00,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_read2_b32 a[6:7], v1 offset0:16 offset1:255
+
+// GFX90A: ds_read2_b32 a[6:7], v1 offset0:127 ; encoding: [0x7f,0x00,0x6e,0xda,0x01,0x00,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_read2_b32 a[6:7], v1 offset0:127
+
+// GFX90A: ds_read2_b32 a[6:7], v1 offset0:127 ; encoding: [0x7f,0x00,0x6e,0xda,0x01,0x00,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_read2_b32 a[6:7], v1 offset0:127
+
+// GFX90A: ds_read2_b32 a[6:7], v1 offset0:127 offset1:1 ; encoding: [0x7f,0x01,0x6e,0xda,0x01,0x00,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_read2_b32 a[6:7], v1 offset0:127 offset1:1
+
+// GFX90A: ds_read2_b32 a[6:7], v1 offset0:127 offset1:255 gds ; encoding: [0x7f,0xff,0x6f,0xda,0x01,0x00,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_read2_b32 a[6:7], v1 offset0:127 offset1:255 gds
+
+// GFX90A: ds_read2st64_b32 a[6:7], v1 offset0:127 offset1:255 ; encoding: [0x7f,0xff,0x70,0xda,0x01,0x00,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_read2st64_b32 a[6:7], v1 offset0:127 offset1:255
+
+// GFX90A: ds_read2st64_b32 a[254:255], v1 offset0:127 offset1:255 ; encoding: [0x7f,0xff,0x70,0xda,0x01,0x00,0x00,0xfe]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_read2st64_b32 a[254:255], v1 offset0:127 offset1:255
+
+// GFX90A: ds_read2st64_b32 a[6:7], v255 offset0:127 offset1:255 ; encoding: [0x7f,0xff,0x70,0xda,0xff,0x00,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_read2st64_b32 a[6:7], v255 offset0:127 offset1:255
+
+// GFX90A: ds_read2st64_b32 a[6:7], v1 offset1:255 ; encoding: [0x00,0xff,0x70,0xda,0x01,0x00,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_read2st64_b32 a[6:7], v1 offset1:255
+
+// GFX90A: ds_read2st64_b32 a[6:7], v1 offset1:255 ; encoding: [0x00,0xff,0x70,0xda,0x01,0x00,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_read2st64_b32 a[6:7], v1 offset1:255
+
+// GFX90A: ds_read2st64_b32 a[6:7], v1 offset0:16 offset1:255 ; encoding: [0x10,0xff,0x70,0xda,0x01,0x00,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_read2st64_b32 a[6:7], v1 offset0:16 offset1:255
+
+// GFX90A: ds_read2st64_b32 a[6:7], v1 offset0:127 ; encoding: [0x7f,0x00,0x70,0xda,0x01,0x00,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_read2st64_b32 a[6:7], v1 offset0:127
+
+// GFX90A: ds_read2st64_b32 a[6:7], v1 offset0:127 ; encoding: [0x7f,0x00,0x70,0xda,0x01,0x00,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_read2st64_b32 a[6:7], v1 offset0:127
+
+// GFX90A: ds_read2st64_b32 a[6:7], v1 offset0:127 offset1:1 ; encoding: [0x7f,0x01,0x70,0xda,0x01,0x00,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_read2st64_b32 a[6:7], v1 offset0:127 offset1:1
+
+// GFX90A: ds_read2st64_b32 a[6:7], v1 offset0:127 offset1:255 gds ; encoding: [0x7f,0xff,0x71,0xda,0x01,0x00,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_read2st64_b32 a[6:7], v1 offset0:127 offset1:255 gds
+
+// GFX90A: ds_read_i8 a5, v1 offset:65535  ; encoding: [0xff,0xff,0x72,0xda,0x01,0x00,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_read_i8 a5, v1 offset:65535
+
+// GFX90A: ds_read_i8 a255, v1 offset:65535 ; encoding: [0xff,0xff,0x72,0xda,0x01,0x00,0x00,0xff]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_read_i8 a255, v1 offset:65535
+
+// GFX90A: ds_read_i8 a5, v255 offset:65535 ; encoding: [0xff,0xff,0x72,0xda,0xff,0x00,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_read_i8 a5, v255 offset:65535
+
+// GFX90A: ds_read_i8 a5, v1               ; encoding: [0x00,0x00,0x72,0xda,0x01,0x00,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_read_i8 a5, v1
+
+// GFX90A: ds_read_i8 a5, v1               ; encoding: [0x00,0x00,0x72,0xda,0x01,0x00,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_read_i8 a5, v1
+
+// GFX90A: ds_read_i8 a5, v1 offset:4      ; encoding: [0x04,0x00,0x72,0xda,0x01,0x00,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_read_i8 a5, v1 offset:4
+
+// GFX90A: ds_read_i8 a5, v1 offset:65535 gds ; encoding: [0xff,0xff,0x73,0xda,0x01,0x00,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_read_i8 a5, v1 offset:65535 gds
+
+// GFX90A: ds_read_u8 a5, v1 offset:65535  ; encoding: [0xff,0xff,0x74,0xda,0x01,0x00,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_read_u8 a5, v1 offset:65535
+
+// GFX90A: ds_read_u8 a255, v1 offset:65535 ; encoding: [0xff,0xff,0x74,0xda,0x01,0x00,0x00,0xff]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_read_u8 a255, v1 offset:65535
+
+// GFX90A: ds_read_u8 a5, v255 offset:65535 ; encoding: [0xff,0xff,0x74,0xda,0xff,0x00,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_read_u8 a5, v255 offset:65535
+
+// GFX90A: ds_read_u8 a5, v1               ; encoding: [0x00,0x00,0x74,0xda,0x01,0x00,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_read_u8 a5, v1
+
+// GFX90A: ds_read_u8 a5, v1               ; encoding: [0x00,0x00,0x74,0xda,0x01,0x00,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_read_u8 a5, v1
+
+// GFX90A: ds_read_u8 a5, v1 offset:4      ; encoding: [0x04,0x00,0x74,0xda,0x01,0x00,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_read_u8 a5, v1 offset:4
+
+// GFX90A: ds_read_u8 a5, v1 offset:65535 gds ; encoding: [0xff,0xff,0x75,0xda,0x01,0x00,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_read_u8 a5, v1 offset:65535 gds
+
+// GFX90A: ds_read_i16 a5, v1 offset:65535 ; encoding: [0xff,0xff,0x76,0xda,0x01,0x00,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_read_i16 a5, v1 offset:65535
+
+// GFX90A: ds_read_i16 a255, v1 offset:65535 ; encoding: [0xff,0xff,0x76,0xda,0x01,0x00,0x00,0xff]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_read_i16 a255, v1 offset:65535
+
+// GFX90A: ds_read_i16 a5, v255 offset:65535 ; encoding: [0xff,0xff,0x76,0xda,0xff,0x00,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_read_i16 a5, v255 offset:65535
+
+// GFX90A: ds_read_i16 a5, v1              ; encoding: [0x00,0x00,0x76,0xda,0x01,0x00,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_read_i16 a5, v1
+
+// GFX90A: ds_read_i16 a5, v1              ; encoding: [0x00,0x00,0x76,0xda,0x01,0x00,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_read_i16 a5, v1
+
+// GFX90A: ds_read_i16 a5, v1 offset:4     ; encoding: [0x04,0x00,0x76,0xda,0x01,0x00,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_read_i16 a5, v1 offset:4
+
+// GFX90A: ds_read_i16 a5, v1 offset:65535 gds ; encoding: [0xff,0xff,0x77,0xda,0x01,0x00,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_read_i16 a5, v1 offset:65535 gds
+
+// GFX90A: ds_read_u16 a5, v1 offset:65535 ; encoding: [0xff,0xff,0x78,0xda,0x01,0x00,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_read_u16 a5, v1 offset:65535
+
+// GFX90A: ds_read_u16 a255, v1 offset:65535 ; encoding: [0xff,0xff,0x78,0xda,0x01,0x00,0x00,0xff]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_read_u16 a255, v1 offset:65535
+
+// GFX90A: ds_read_u16 a5, v255 offset:65535 ; encoding: [0xff,0xff,0x78,0xda,0xff,0x00,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_read_u16 a5, v255 offset:65535
+
+// GFX90A: ds_read_u16 a5, v1              ; encoding: [0x00,0x00,0x78,0xda,0x01,0x00,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_read_u16 a5, v1
+
+// GFX90A: ds_read_u16 a5, v1              ; encoding: [0x00,0x00,0x78,0xda,0x01,0x00,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_read_u16 a5, v1
+
+// GFX90A: ds_read_u16 a5, v1 offset:4     ; encoding: [0x04,0x00,0x78,0xda,0x01,0x00,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_read_u16 a5, v1 offset:4
+
+// GFX90A: ds_read_u16 a5, v1 offset:65535 gds ; encoding: [0xff,0xff,0x79,0xda,0x01,0x00,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_read_u16 a5, v1 offset:65535 gds
+
+// GFX90A: ds_swizzle_b32 a5, v1 offset:65535 ; encoding: [0xff,0xff,0x7a,0xda,0x01,0x00,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_swizzle_b32 a5, v1 offset:65535
+
+// GFX90A: ds_swizzle_b32 a255, v1 offset:65535 ; encoding: [0xff,0xff,0x7a,0xda,0x01,0x00,0x00,0xff]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_swizzle_b32 a255, v1 offset:65535
+
+// GFX90A: ds_swizzle_b32 a5, v255 offset:65535 ; encoding: [0xff,0xff,0x7a,0xda,0xff,0x00,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_swizzle_b32 a5, v255 offset:65535
+
+// GFX90A: ds_swizzle_b32 a5, v1           ; encoding: [0x00,0x00,0x7a,0xda,0x01,0x00,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_swizzle_b32 a5, v1
+
+// GFX90A: ds_swizzle_b32 a5, v1           ; encoding: [0x00,0x00,0x7a,0xda,0x01,0x00,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_swizzle_b32 a5, v1
+
+// GFX90A: ds_swizzle_b32 a5, v1 offset:swizzle(BITMASK_PERM,"00p00") ; encoding: [0x04,0x00,0x7a,0xda,0x01,0x00,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_swizzle_b32 a5, v1 offset:swizzle(BITMASK_PERM,"00p00")
+
+// GFX90A: ds_swizzle_b32 a5, v1 offset:65535 gds ; encoding: [0xff,0xff,0x7b,0xda,0x01,0x00,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_swizzle_b32 a5, v1 offset:65535 gds
+
+// GFX90A: ds_permute_b32 a5, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x7c,0xda,0x01,0x02,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_permute_b32 a5, v1, a2 offset:65535
+
+// GFX90A: ds_permute_b32 a255, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x7c,0xda,0x01,0x02,0x00,0xff]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_permute_b32 a255, v1, a2 offset:65535
+
+// GFX90A: ds_permute_b32 a5, v255, a2 offset:65535 ; encoding: [0xff,0xff,0x7c,0xda,0xff,0x02,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_permute_b32 a5, v255, a2 offset:65535
+
+// GFX90A: ds_permute_b32 a5, v1, a255 offset:65535 ; encoding: [0xff,0xff,0x7c,0xda,0x01,0xff,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_permute_b32 a5, v1, a255 offset:65535
+
+// GFX90A: ds_permute_b32 a5, v1, a2       ; encoding: [0x00,0x00,0x7c,0xda,0x01,0x02,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_permute_b32 a5, v1, a2
+
+// GFX90A: ds_permute_b32 a5, v1, a2       ; encoding: [0x00,0x00,0x7c,0xda,0x01,0x02,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_permute_b32 a5, v1, a2
+
+// GFX90A: ds_permute_b32 a5, v1, a2 offset:4 ; encoding: [0x04,0x00,0x7c,0xda,0x01,0x02,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_permute_b32 a5, v1, a2 offset:4
+
+// GFX90A: ds_bpermute_b32 a5, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x7e,0xda,0x01,0x02,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_bpermute_b32 a5, v1, a2 offset:65535
+
+// GFX90A: ds_bpermute_b32 a255, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x7e,0xda,0x01,0x02,0x00,0xff]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_bpermute_b32 a255, v1, a2 offset:65535
+
+// GFX90A: ds_bpermute_b32 a5, v255, a2 offset:65535 ; encoding: [0xff,0xff,0x7e,0xda,0xff,0x02,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_bpermute_b32 a5, v255, a2 offset:65535
+
+// GFX90A: ds_bpermute_b32 a5, v1, a255 offset:65535 ; encoding: [0xff,0xff,0x7e,0xda,0x01,0xff,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_bpermute_b32 a5, v1, a255 offset:65535
+
+// GFX90A: ds_bpermute_b32 a5, v1, a2      ; encoding: [0x00,0x00,0x7e,0xda,0x01,0x02,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_bpermute_b32 a5, v1, a2
+
+// GFX90A: ds_bpermute_b32 a5, v1, a2      ; encoding: [0x00,0x00,0x7e,0xda,0x01,0x02,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_bpermute_b32 a5, v1, a2
+
+// GFX90A: ds_bpermute_b32 a5, v1, a2 offset:4 ; encoding: [0x04,0x00,0x7e,0xda,0x01,0x02,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_bpermute_b32 a5, v1, a2 offset:4
+
+// GFX90A: ds_add_u64 v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0x80,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_add_u64 v1, a[2:3] offset:65535
+
+// GFX90A: ds_add_u64 v255, a[2:3] offset:65535 ; encoding: [0xff,0xff,0x80,0xda,0xff,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_add_u64 v255, a[2:3] offset:65535
+
+// GFX90A: ds_add_u64 v1, a[254:255] offset:65535 ; encoding: [0xff,0xff,0x80,0xda,0x01,0xfe,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_add_u64 v1, a[254:255] offset:65535
+
+// GFX90A: ds_add_u64 v1, a[2:3]           ; encoding: [0x00,0x00,0x80,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_add_u64 v1, a[2:3]
+
+// GFX90A: ds_add_u64 v1, a[2:3]           ; encoding: [0x00,0x00,0x80,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_add_u64 v1, a[2:3]
+
+// GFX90A: ds_add_u64 v1, a[2:3] offset:4  ; encoding: [0x04,0x00,0x80,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_add_u64 v1, a[2:3] offset:4
+
+// GFX90A: ds_add_u64 v1, a[2:3] offset:65535 gds ; encoding: [0xff,0xff,0x81,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_add_u64 v1, a[2:3] offset:65535 gds
+
+// GFX90A: ds_sub_u64 v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0x82,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_sub_u64 v1, a[2:3] offset:65535
+
+// GFX90A: ds_sub_u64 v255, a[2:3] offset:65535 ; encoding: [0xff,0xff,0x82,0xda,0xff,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_sub_u64 v255, a[2:3] offset:65535
+
+// GFX90A: ds_sub_u64 v1, a[254:255] offset:65535 ; encoding: [0xff,0xff,0x82,0xda,0x01,0xfe,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_sub_u64 v1, a[254:255] offset:65535
+
+// GFX90A: ds_sub_u64 v1, a[2:3]           ; encoding: [0x00,0x00,0x82,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_sub_u64 v1, a[2:3]
+
+// GFX90A: ds_sub_u64 v1, a[2:3]           ; encoding: [0x00,0x00,0x82,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_sub_u64 v1, a[2:3]
+
+// GFX90A: ds_sub_u64 v1, a[2:3] offset:4  ; encoding: [0x04,0x00,0x82,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_sub_u64 v1, a[2:3] offset:4
+
+// GFX90A: ds_sub_u64 v1, a[2:3] offset:65535 gds ; encoding: [0xff,0xff,0x83,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_sub_u64 v1, a[2:3] offset:65535 gds
+
+// GFX90A: ds_rsub_u64 v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0x84,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_rsub_u64 v1, a[2:3] offset:65535
+
+// GFX90A: ds_rsub_u64 v255, a[2:3] offset:65535 ; encoding: [0xff,0xff,0x84,0xda,0xff,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_rsub_u64 v255, a[2:3] offset:65535
+
+// GFX90A: ds_rsub_u64 v1, a[254:255] offset:65535 ; encoding: [0xff,0xff,0x84,0xda,0x01,0xfe,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_rsub_u64 v1, a[254:255] offset:65535
+
+// GFX90A: ds_rsub_u64 v1, a[2:3]          ; encoding: [0x00,0x00,0x84,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_rsub_u64 v1, a[2:3]
+
+// GFX90A: ds_rsub_u64 v1, a[2:3]          ; encoding: [0x00,0x00,0x84,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_rsub_u64 v1, a[2:3]
+
+// GFX90A: ds_rsub_u64 v1, a[2:3] offset:4 ; encoding: [0x04,0x00,0x84,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_rsub_u64 v1, a[2:3] offset:4
+
+// GFX90A: ds_rsub_u64 v1, a[2:3] offset:65535 gds ; encoding: [0xff,0xff,0x85,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_rsub_u64 v1, a[2:3] offset:65535 gds
+
+// GFX90A: ds_inc_u64 v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0x86,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_inc_u64 v1, a[2:3] offset:65535
+
+// GFX90A: ds_inc_u64 v255, a[2:3] offset:65535 ; encoding: [0xff,0xff,0x86,0xda,0xff,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_inc_u64 v255, a[2:3] offset:65535
+
+// GFX90A: ds_inc_u64 v1, a[254:255] offset:65535 ; encoding: [0xff,0xff,0x86,0xda,0x01,0xfe,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_inc_u64 v1, a[254:255] offset:65535
+
+// GFX90A: ds_inc_u64 v1, a[2:3]           ; encoding: [0x00,0x00,0x86,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_inc_u64 v1, a[2:3]
+
+// GFX90A: ds_inc_u64 v1, a[2:3]           ; encoding: [0x00,0x00,0x86,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_inc_u64 v1, a[2:3]
+
+// GFX90A: ds_inc_u64 v1, a[2:3] offset:4  ; encoding: [0x04,0x00,0x86,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_inc_u64 v1, a[2:3] offset:4
+
+// GFX90A: ds_inc_u64 v1, a[2:3] offset:65535 gds ; encoding: [0xff,0xff,0x87,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_inc_u64 v1, a[2:3] offset:65535 gds
+
+// GFX90A: ds_dec_u64 v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0x88,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_dec_u64 v1, a[2:3] offset:65535
+
+// GFX90A: ds_dec_u64 v255, a[2:3] offset:65535 ; encoding: [0xff,0xff,0x88,0xda,0xff,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_dec_u64 v255, a[2:3] offset:65535
+
+// GFX90A: ds_dec_u64 v1, a[254:255] offset:65535 ; encoding: [0xff,0xff,0x88,0xda,0x01,0xfe,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_dec_u64 v1, a[254:255] offset:65535
+
+// GFX90A: ds_dec_u64 v1, a[2:3]           ; encoding: [0x00,0x00,0x88,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_dec_u64 v1, a[2:3]
+
+// GFX90A: ds_dec_u64 v1, a[2:3]           ; encoding: [0x00,0x00,0x88,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_dec_u64 v1, a[2:3]
+
+// GFX90A: ds_dec_u64 v1, a[2:3] offset:4  ; encoding: [0x04,0x00,0x88,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_dec_u64 v1, a[2:3] offset:4
+
+// GFX90A: ds_dec_u64 v1, a[2:3] offset:65535 gds ; encoding: [0xff,0xff,0x89,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_dec_u64 v1, a[2:3] offset:65535 gds
+
+// GFX90A: ds_min_i64 v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0x8a,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_min_i64 v1, a[2:3] offset:65535
+
+// GFX90A: ds_min_i64 v255, a[2:3] offset:65535 ; encoding: [0xff,0xff,0x8a,0xda,0xff,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_min_i64 v255, a[2:3] offset:65535
+
+// GFX90A: ds_min_i64 v1, a[254:255] offset:65535 ; encoding: [0xff,0xff,0x8a,0xda,0x01,0xfe,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_min_i64 v1, a[254:255] offset:65535
+
+// GFX90A: ds_min_i64 v1, a[2:3]           ; encoding: [0x00,0x00,0x8a,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_min_i64 v1, a[2:3]
+
+// GFX90A: ds_min_i64 v1, a[2:3]           ; encoding: [0x00,0x00,0x8a,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_min_i64 v1, a[2:3]
+
+// GFX90A: ds_min_i64 v1, a[2:3] offset:4  ; encoding: [0x04,0x00,0x8a,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_min_i64 v1, a[2:3] offset:4
+
+// GFX90A: ds_min_i64 v1, a[2:3] offset:65535 gds ; encoding: [0xff,0xff,0x8b,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_min_i64 v1, a[2:3] offset:65535 gds
+
+// GFX90A: ds_max_i64 v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0x8c,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_max_i64 v1, a[2:3] offset:65535
+
+// GFX90A: ds_max_i64 v255, a[2:3] offset:65535 ; encoding: [0xff,0xff,0x8c,0xda,0xff,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_max_i64 v255, a[2:3] offset:65535
+
+// GFX90A: ds_max_i64 v1, a[254:255] offset:65535 ; encoding: [0xff,0xff,0x8c,0xda,0x01,0xfe,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_max_i64 v1, a[254:255] offset:65535
+
+// GFX90A: ds_max_i64 v1, a[2:3]           ; encoding: [0x00,0x00,0x8c,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_max_i64 v1, a[2:3]
+
+// GFX90A: ds_max_i64 v1, a[2:3]           ; encoding: [0x00,0x00,0x8c,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_max_i64 v1, a[2:3]
+
+// GFX90A: ds_max_i64 v1, a[2:3] offset:4  ; encoding: [0x04,0x00,0x8c,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_max_i64 v1, a[2:3] offset:4
+
+// GFX90A: ds_max_i64 v1, a[2:3] offset:65535 gds ; encoding: [0xff,0xff,0x8d,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_max_i64 v1, a[2:3] offset:65535 gds
+
+// GFX90A: ds_min_u64 v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0x8e,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_min_u64 v1, a[2:3] offset:65535
+
+// GFX90A: ds_min_u64 v255, a[2:3] offset:65535 ; encoding: [0xff,0xff,0x8e,0xda,0xff,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_min_u64 v255, a[2:3] offset:65535
+
+// GFX90A: ds_min_u64 v1, a[254:255] offset:65535 ; encoding: [0xff,0xff,0x8e,0xda,0x01,0xfe,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_min_u64 v1, a[254:255] offset:65535
+
+// GFX90A: ds_min_u64 v1, a[2:3]           ; encoding: [0x00,0x00,0x8e,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_min_u64 v1, a[2:3]
+
+// GFX90A: ds_min_u64 v1, a[2:3]           ; encoding: [0x00,0x00,0x8e,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_min_u64 v1, a[2:3]
+
+// GFX90A: ds_min_u64 v1, a[2:3] offset:4  ; encoding: [0x04,0x00,0x8e,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_min_u64 v1, a[2:3] offset:4
+
+// GFX90A: ds_min_u64 v1, a[2:3] offset:65535 gds ; encoding: [0xff,0xff,0x8f,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_min_u64 v1, a[2:3] offset:65535 gds
+
+// GFX90A: ds_max_u64 v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0x90,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_max_u64 v1, a[2:3] offset:65535
+
+// GFX90A: ds_max_u64 v255, a[2:3] offset:65535 ; encoding: [0xff,0xff,0x90,0xda,0xff,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_max_u64 v255, a[2:3] offset:65535
+
+// GFX90A: ds_max_u64 v1, a[254:255] offset:65535 ; encoding: [0xff,0xff,0x90,0xda,0x01,0xfe,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_max_u64 v1, a[254:255] offset:65535
+
+// GFX90A: ds_max_u64 v1, a[2:3]           ; encoding: [0x00,0x00,0x90,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_max_u64 v1, a[2:3]
+
+// GFX90A: ds_max_u64 v1, a[2:3]           ; encoding: [0x00,0x00,0x90,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_max_u64 v1, a[2:3]
+
+// GFX90A: ds_max_u64 v1, a[2:3] offset:4  ; encoding: [0x04,0x00,0x90,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_max_u64 v1, a[2:3] offset:4
+
+// GFX90A: ds_max_u64 v1, a[2:3] offset:65535 gds ; encoding: [0xff,0xff,0x91,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_max_u64 v1, a[2:3] offset:65535 gds
+
+// GFX90A: ds_and_b64 v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0x92,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_and_b64 v1, a[2:3] offset:65535
+
+// GFX90A: ds_and_b64 v255, a[2:3] offset:65535 ; encoding: [0xff,0xff,0x92,0xda,0xff,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_and_b64 v255, a[2:3] offset:65535
+
+// GFX90A: ds_and_b64 v1, a[254:255] offset:65535 ; encoding: [0xff,0xff,0x92,0xda,0x01,0xfe,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_and_b64 v1, a[254:255] offset:65535
+
+// GFX90A: ds_and_b64 v1, a[2:3]           ; encoding: [0x00,0x00,0x92,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_and_b64 v1, a[2:3]
+
+// GFX90A: ds_and_b64 v1, a[2:3]           ; encoding: [0x00,0x00,0x92,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_and_b64 v1, a[2:3]
+
+// GFX90A: ds_and_b64 v1, a[2:3] offset:4  ; encoding: [0x04,0x00,0x92,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_and_b64 v1, a[2:3] offset:4
+
+// GFX90A: ds_and_b64 v1, a[2:3] offset:65535 gds ; encoding: [0xff,0xff,0x93,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_and_b64 v1, a[2:3] offset:65535 gds
+
+// GFX90A: ds_or_b64 v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0x94,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_or_b64 v1, a[2:3] offset:65535
+
+// GFX90A: ds_or_b64 v255, a[2:3] offset:65535 ; encoding: [0xff,0xff,0x94,0xda,0xff,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_or_b64 v255, a[2:3] offset:65535
+
+// GFX90A: ds_or_b64 v1, a[254:255] offset:65535 ; encoding: [0xff,0xff,0x94,0xda,0x01,0xfe,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_or_b64 v1, a[254:255] offset:65535
+
+// GFX90A: ds_or_b64 v1, a[2:3]            ; encoding: [0x00,0x00,0x94,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_or_b64 v1, a[2:3]
+
+// GFX90A: ds_or_b64 v1, a[2:3]            ; encoding: [0x00,0x00,0x94,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_or_b64 v1, a[2:3]
+
+// GFX90A: ds_or_b64 v1, a[2:3] offset:4   ; encoding: [0x04,0x00,0x94,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_or_b64 v1, a[2:3] offset:4
+
+// GFX90A: ds_or_b64 v1, a[2:3] offset:65535 gds ; encoding: [0xff,0xff,0x95,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_or_b64 v1, a[2:3] offset:65535 gds
+
+// GFX90A: ds_xor_b64 v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0x96,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_xor_b64 v1, a[2:3] offset:65535
+
+// GFX90A: ds_xor_b64 v255, a[2:3] offset:65535 ; encoding: [0xff,0xff,0x96,0xda,0xff,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_xor_b64 v255, a[2:3] offset:65535
+
+// GFX90A: ds_xor_b64 v1, a[254:255] offset:65535 ; encoding: [0xff,0xff,0x96,0xda,0x01,0xfe,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_xor_b64 v1, a[254:255] offset:65535
+
+// GFX90A: ds_xor_b64 v1, a[2:3]           ; encoding: [0x00,0x00,0x96,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_xor_b64 v1, a[2:3]
+
+// GFX90A: ds_xor_b64 v1, a[2:3]           ; encoding: [0x00,0x00,0x96,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_xor_b64 v1, a[2:3]
+
+// GFX90A: ds_xor_b64 v1, a[2:3] offset:4  ; encoding: [0x04,0x00,0x96,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_xor_b64 v1, a[2:3] offset:4
+
+// GFX90A: ds_xor_b64 v1, a[2:3] offset:65535 gds ; encoding: [0xff,0xff,0x97,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_xor_b64 v1, a[2:3] offset:65535 gds
+
+// GFX90A: ds_mskor_b64 v1, a[2:3], a[4:5] offset:65535 ; encoding: [0xff,0xff,0x98,0xda,0x01,0x02,0x04,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_mskor_b64 v1, a[2:3], a[4:5] offset:65535
+
+// GFX90A: ds_mskor_b64 v255, a[2:3], a[4:5] offset:65535 ; encoding: [0xff,0xff,0x98,0xda,0xff,0x02,0x04,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_mskor_b64 v255, a[2:3], a[4:5] offset:65535
+
+// GFX90A: ds_mskor_b64 v1, a[254:255], a[4:5] offset:65535 ; encoding: [0xff,0xff,0x98,0xda,0x01,0xfe,0x04,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_mskor_b64 v1, a[254:255], a[4:5] offset:65535
+
+// GFX90A: ds_mskor_b64 v1, a[2:3], a[254:255] offset:65535 ; encoding: [0xff,0xff,0x98,0xda,0x01,0x02,0xfe,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_mskor_b64 v1, a[2:3], a[254:255] offset:65535
+
+// GFX90A: ds_mskor_b64 v1, a[2:3], a[4:5] ; encoding: [0x00,0x00,0x98,0xda,0x01,0x02,0x04,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_mskor_b64 v1, a[2:3], a[4:5]
+
+// GFX90A: ds_mskor_b64 v1, a[2:3], a[4:5] ; encoding: [0x00,0x00,0x98,0xda,0x01,0x02,0x04,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_mskor_b64 v1, a[2:3], a[4:5]
+
+// GFX90A: ds_mskor_b64 v1, a[2:3], a[4:5] offset:4 ; encoding: [0x04,0x00,0x98,0xda,0x01,0x02,0x04,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_mskor_b64 v1, a[2:3], a[4:5] offset:4
+
+// GFX90A: ds_mskor_b64 v1, a[2:3], a[4:5] offset:65535 gds ; encoding: [0xff,0xff,0x99,0xda,0x01,0x02,0x04,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_mskor_b64 v1, a[2:3], a[4:5] offset:65535 gds
+
+// GFX90A: ds_write_b64 v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0x9a,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_write_b64 v1, a[2:3] offset:65535
+
+// GFX90A: ds_write_b64 v255, a[2:3] offset:65535 ; encoding: [0xff,0xff,0x9a,0xda,0xff,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_write_b64 v255, a[2:3] offset:65535
+
+// GFX90A: ds_write_b64 v1, a[254:255] offset:65535 ; encoding: [0xff,0xff,0x9a,0xda,0x01,0xfe,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_write_b64 v1, a[254:255] offset:65535
+
+// GFX90A: ds_write_b64 v1, a[2:3]         ; encoding: [0x00,0x00,0x9a,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_write_b64 v1, a[2:3]
+
+// GFX90A: ds_write_b64 v1, a[2:3]         ; encoding: [0x00,0x00,0x9a,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_write_b64 v1, a[2:3]
+
+// GFX90A: ds_write_b64 v1, a[2:3] offset:4 ; encoding: [0x04,0x00,0x9a,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_write_b64 v1, a[2:3] offset:4
+
+// GFX90A: ds_write_b64 v1, a[2:3] offset:65535 gds ; encoding: [0xff,0xff,0x9b,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_write_b64 v1, a[2:3] offset:65535 gds
+
+// GFX90A: ds_write2_b64 v1, a[2:3], a[4:5] offset0:127 offset1:255 ; encoding: [0x7f,0xff,0x9c,0xda,0x01,0x02,0x04,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_write2_b64 v1, a[2:3], a[4:5] offset0:127 offset1:255
+
+// GFX90A: ds_write2_b64 v255, a[2:3], a[4:5] offset0:127 offset1:255 ; encoding: [0x7f,0xff,0x9c,0xda,0xff,0x02,0x04,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_write2_b64 v255, a[2:3], a[4:5] offset0:127 offset1:255
+
+// GFX90A: ds_write2_b64 v1, a[254:255], a[4:5] offset0:127 offset1:255 ; encoding: [0x7f,0xff,0x9c,0xda,0x01,0xfe,0x04,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_write2_b64 v1, a[254:255], a[4:5] offset0:127 offset1:255
+
+// GFX90A: ds_write2_b64 v1, a[2:3], a[254:255] offset0:127 offset1:255 ; encoding: [0x7f,0xff,0x9c,0xda,0x01,0x02,0xfe,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_write2_b64 v1, a[2:3], a[254:255] offset0:127 offset1:255
+
+// GFX90A: ds_write2_b64 v1, a[2:3], a[4:5] offset1:255 ; encoding: [0x00,0xff,0x9c,0xda,0x01,0x02,0x04,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_write2_b64 v1, a[2:3], a[4:5] offset1:255
+
+// GFX90A: ds_write2_b64 v1, a[2:3], a[4:5] offset1:255 ; encoding: [0x00,0xff,0x9c,0xda,0x01,0x02,0x04,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_write2_b64 v1, a[2:3], a[4:5] offset1:255
+
+// GFX90A: ds_write2_b64 v1, a[2:3], a[4:5] offset0:16 offset1:255 ; encoding: [0x10,0xff,0x9c,0xda,0x01,0x02,0x04,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_write2_b64 v1, a[2:3], a[4:5] offset0:16 offset1:255
+
+// GFX90A: ds_write2_b64 v1, a[2:3], a[4:5] offset0:127 ; encoding: [0x7f,0x00,0x9c,0xda,0x01,0x02,0x04,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_write2_b64 v1, a[2:3], a[4:5] offset0:127
+
+// GFX90A: ds_write2_b64 v1, a[2:3], a[4:5] offset0:127 ; encoding: [0x7f,0x00,0x9c,0xda,0x01,0x02,0x04,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_write2_b64 v1, a[2:3], a[4:5] offset0:127
+
+// GFX90A: ds_write2_b64 v1, a[2:3], a[4:5] offset0:127 offset1:1 ; encoding: [0x7f,0x01,0x9c,0xda,0x01,0x02,0x04,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_write2_b64 v1, a[2:3], a[4:5] offset0:127 offset1:1
+
+// GFX90A: ds_write2_b64 v1, a[2:3], a[4:5] offset0:127 offset1:255 gds ; encoding: [0x7f,0xff,0x9d,0xda,0x01,0x02,0x04,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_write2_b64 v1, a[2:3], a[4:5] offset0:127 offset1:255 gds
+
+// GFX90A: ds_write2st64_b64 v1, a[2:3], a[4:5] offset0:127 offset1:255 ; encoding: [0x7f,0xff,0x9e,0xda,0x01,0x02,0x04,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_write2st64_b64 v1, a[2:3], a[4:5] offset0:127 offset1:255
+
+// GFX90A: ds_write2st64_b64 v255, a[2:3], a[4:5] offset0:127 offset1:255 ; encoding: [0x7f,0xff,0x9e,0xda,0xff,0x02,0x04,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_write2st64_b64 v255, a[2:3], a[4:5] offset0:127 offset1:255
+
+// GFX90A: ds_write2st64_b64 v1, a[254:255], a[4:5] offset0:127 offset1:255 ; encoding: [0x7f,0xff,0x9e,0xda,0x01,0xfe,0x04,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_write2st64_b64 v1, a[254:255], a[4:5] offset0:127 offset1:255
+
+// GFX90A: ds_write2st64_b64 v1, a[2:3], a[254:255] offset0:127 offset1:255 ; encoding: [0x7f,0xff,0x9e,0xda,0x01,0x02,0xfe,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_write2st64_b64 v1, a[2:3], a[254:255] offset0:127 offset1:255
+
+// GFX90A: ds_write2st64_b64 v1, a[2:3], a[4:5] offset1:255 ; encoding: [0x00,0xff,0x9e,0xda,0x01,0x02,0x04,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_write2st64_b64 v1, a[2:3], a[4:5] offset1:255
+
+// GFX90A: ds_write2st64_b64 v1, a[2:3], a[4:5] offset1:255 ; encoding: [0x00,0xff,0x9e,0xda,0x01,0x02,0x04,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_write2st64_b64 v1, a[2:3], a[4:5] offset1:255
+
+// GFX90A: ds_write2st64_b64 v1, a[2:3], a[4:5] offset0:16 offset1:255 ; encoding: [0x10,0xff,0x9e,0xda,0x01,0x02,0x04,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_write2st64_b64 v1, a[2:3], a[4:5] offset0:16 offset1:255
+
+// GFX90A: ds_write2st64_b64 v1, a[2:3], a[4:5] offset0:127 ; encoding: [0x7f,0x00,0x9e,0xda,0x01,0x02,0x04,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_write2st64_b64 v1, a[2:3], a[4:5] offset0:127
+
+// GFX90A: ds_write2st64_b64 v1, a[2:3], a[4:5] offset0:127 ; encoding: [0x7f,0x00,0x9e,0xda,0x01,0x02,0x04,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_write2st64_b64 v1, a[2:3], a[4:5] offset0:127
+
+// GFX90A: ds_write2st64_b64 v1, a[2:3], a[4:5] offset0:127 offset1:1 ; encoding: [0x7f,0x01,0x9e,0xda,0x01,0x02,0x04,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_write2st64_b64 v1, a[2:3], a[4:5] offset0:127 offset1:1
+
+// GFX90A: ds_write2st64_b64 v1, a[2:3], a[4:5] offset0:127 offset1:255 gds ; encoding: [0x7f,0xff,0x9f,0xda,0x01,0x02,0x04,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_write2st64_b64 v1, a[2:3], a[4:5] offset0:127 offset1:255 gds
+
+// GFX90A: ds_cmpst_b64 v1, a[2:3], a[4:5] offset:65535 ; encoding: [0xff,0xff,0xa0,0xda,0x01,0x02,0x04,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_cmpst_b64 v1, a[2:3], a[4:5] offset:65535
+
+// GFX90A: ds_cmpst_b64 v255, a[2:3], a[4:5] offset:65535 ; encoding: [0xff,0xff,0xa0,0xda,0xff,0x02,0x04,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_cmpst_b64 v255, a[2:3], a[4:5] offset:65535
+
+// GFX90A: ds_cmpst_b64 v1, a[254:255], a[4:5] offset:65535 ; encoding: [0xff,0xff,0xa0,0xda,0x01,0xfe,0x04,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_cmpst_b64 v1, a[254:255], a[4:5] offset:65535
+
+// GFX90A: ds_cmpst_b64 v1, a[2:3], a[254:255] offset:65535 ; encoding: [0xff,0xff,0xa0,0xda,0x01,0x02,0xfe,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_cmpst_b64 v1, a[2:3], a[254:255] offset:65535
+
+// GFX90A: ds_cmpst_b64 v1, a[2:3], a[4:5] ; encoding: [0x00,0x00,0xa0,0xda,0x01,0x02,0x04,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_cmpst_b64 v1, a[2:3], a[4:5]
+
+// GFX90A: ds_cmpst_b64 v1, a[2:3], a[4:5] ; encoding: [0x00,0x00,0xa0,0xda,0x01,0x02,0x04,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_cmpst_b64 v1, a[2:3], a[4:5]
+
+// GFX90A: ds_cmpst_b64 v1, a[2:3], a[4:5] offset:4 ; encoding: [0x04,0x00,0xa0,0xda,0x01,0x02,0x04,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_cmpst_b64 v1, a[2:3], a[4:5] offset:4
+
+// GFX90A: ds_cmpst_b64 v1, a[2:3], a[4:5] offset:65535 gds ; encoding: [0xff,0xff,0xa1,0xda,0x01,0x02,0x04,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_cmpst_b64 v1, a[2:3], a[4:5] offset:65535 gds
+
+// GFX90A: ds_cmpst_f64 v1, a[2:3], a[4:5] offset:65535 ; encoding: [0xff,0xff,0xa2,0xda,0x01,0x02,0x04,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_cmpst_f64 v1, a[2:3], a[4:5] offset:65535
+
+// GFX90A: ds_cmpst_f64 v255, a[2:3], a[4:5] offset:65535 ; encoding: [0xff,0xff,0xa2,0xda,0xff,0x02,0x04,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_cmpst_f64 v255, a[2:3], a[4:5] offset:65535
+
+// GFX90A: ds_cmpst_f64 v1, a[254:255], a[4:5] offset:65535 ; encoding: [0xff,0xff,0xa2,0xda,0x01,0xfe,0x04,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_cmpst_f64 v1, a[254:255], a[4:5] offset:65535
+
+// GFX90A: ds_cmpst_f64 v1, a[2:3], a[254:255] offset:65535 ; encoding: [0xff,0xff,0xa2,0xda,0x01,0x02,0xfe,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_cmpst_f64 v1, a[2:3], a[254:255] offset:65535
+
+// GFX90A: ds_cmpst_f64 v1, a[2:3], a[4:5] ; encoding: [0x00,0x00,0xa2,0xda,0x01,0x02,0x04,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_cmpst_f64 v1, a[2:3], a[4:5]
+
+// GFX90A: ds_cmpst_f64 v1, a[2:3], a[4:5] ; encoding: [0x00,0x00,0xa2,0xda,0x01,0x02,0x04,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_cmpst_f64 v1, a[2:3], a[4:5]
+
+// GFX90A: ds_cmpst_f64 v1, a[2:3], a[4:5] offset:4 ; encoding: [0x04,0x00,0xa2,0xda,0x01,0x02,0x04,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_cmpst_f64 v1, a[2:3], a[4:5] offset:4
+
+// GFX90A: ds_cmpst_f64 v1, a[2:3], a[4:5] offset:65535 gds ; encoding: [0xff,0xff,0xa3,0xda,0x01,0x02,0x04,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_cmpst_f64 v1, a[2:3], a[4:5] offset:65535 gds
+
+// GFX90A: ds_min_f64 v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xa4,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_min_f64 v1, a[2:3] offset:65535
+
+// GFX90A: ds_min_f64 v255, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xa4,0xda,0xff,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_min_f64 v255, a[2:3] offset:65535
+
+// GFX90A: ds_min_f64 v1, a[254:255] offset:65535 ; encoding: [0xff,0xff,0xa4,0xda,0x01,0xfe,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_min_f64 v1, a[254:255] offset:65535
+
+// GFX90A: ds_min_f64 v1, a[2:3]           ; encoding: [0x00,0x00,0xa4,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_min_f64 v1, a[2:3]
+
+// GFX90A: ds_min_f64 v1, a[2:3]           ; encoding: [0x00,0x00,0xa4,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_min_f64 v1, a[2:3]
+
+// GFX90A: ds_min_f64 v1, a[2:3] offset:4  ; encoding: [0x04,0x00,0xa4,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_min_f64 v1, a[2:3] offset:4
+
+// GFX90A: ds_min_f64 v1, a[2:3] offset:65535 gds ; encoding: [0xff,0xff,0xa5,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_min_f64 v1, a[2:3] offset:65535 gds
+
+// GFX90A: ds_max_f64 v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xa6,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_max_f64 v1, a[2:3] offset:65535
+
+// GFX90A: ds_max_f64 v255, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xa6,0xda,0xff,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_max_f64 v255, a[2:3] offset:65535
+
+// GFX90A: ds_max_f64 v1, a[254:255] offset:65535 ; encoding: [0xff,0xff,0xa6,0xda,0x01,0xfe,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_max_f64 v1, a[254:255] offset:65535
+
+// GFX90A: ds_max_f64 v1, a[2:3]           ; encoding: [0x00,0x00,0xa6,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_max_f64 v1, a[2:3]
+
+// GFX90A: ds_max_f64 v1, a[2:3]           ; encoding: [0x00,0x00,0xa6,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_max_f64 v1, a[2:3]
+
+// GFX90A: ds_max_f64 v1, a[2:3] offset:4  ; encoding: [0x04,0x00,0xa6,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_max_f64 v1, a[2:3] offset:4
+
+// GFX90A: ds_max_f64 v1, a[2:3] offset:65535 gds ; encoding: [0xff,0xff,0xa7,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_max_f64 v1, a[2:3] offset:65535 gds
+
+// GFX90A: ds_write_b8_d16_hi v1, a2 offset:65535 ; encoding: [0xff,0xff,0xa8,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_write_b8_d16_hi v1, a2 offset:65535
+
+// GFX90A: ds_write_b8_d16_hi v255, a2 offset:65535 ; encoding: [0xff,0xff,0xa8,0xda,0xff,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_write_b8_d16_hi v255, a2 offset:65535
+
+// GFX90A: ds_write_b8_d16_hi v1, a255 offset:65535 ; encoding: [0xff,0xff,0xa8,0xda,0x01,0xff,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_write_b8_d16_hi v1, a255 offset:65535
+
+// GFX90A: ds_write_b8_d16_hi v1, a2       ; encoding: [0x00,0x00,0xa8,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_write_b8_d16_hi v1, a2
+
+// GFX90A: ds_write_b8_d16_hi v1, a2       ; encoding: [0x00,0x00,0xa8,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_write_b8_d16_hi v1, a2
+
+// GFX90A: ds_write_b8_d16_hi v1, a2 offset:4 ; encoding: [0x04,0x00,0xa8,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_write_b8_d16_hi v1, a2 offset:4
+
+// GFX90A: ds_write_b8_d16_hi v1, a2 offset:65535 gds ; encoding: [0xff,0xff,0xa9,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_write_b8_d16_hi v1, a2 offset:65535 gds
+
+// GFX90A: ds_write_b16_d16_hi v1, a2 offset:65535 ; encoding: [0xff,0xff,0xaa,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_write_b16_d16_hi v1, a2 offset:65535
+
+// GFX90A: ds_write_b16_d16_hi v255, a2 offset:65535 ; encoding: [0xff,0xff,0xaa,0xda,0xff,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_write_b16_d16_hi v255, a2 offset:65535
+
+// GFX90A: ds_write_b16_d16_hi v1, a255 offset:65535 ; encoding: [0xff,0xff,0xaa,0xda,0x01,0xff,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_write_b16_d16_hi v1, a255 offset:65535
+
+// GFX90A: ds_write_b16_d16_hi v1, a2      ; encoding: [0x00,0x00,0xaa,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_write_b16_d16_hi v1, a2
+
+// GFX90A: ds_write_b16_d16_hi v1, a2      ; encoding: [0x00,0x00,0xaa,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_write_b16_d16_hi v1, a2
+
+// GFX90A: ds_write_b16_d16_hi v1, a2 offset:4 ; encoding: [0x04,0x00,0xaa,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_write_b16_d16_hi v1, a2 offset:4
+
+// GFX90A: ds_write_b16_d16_hi v1, a2 offset:65535 gds ; encoding: [0xff,0xff,0xab,0xda,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_write_b16_d16_hi v1, a2 offset:65535 gds
+
+// GFX90A: ds_read_u8_d16 a5, v1 offset:65535 ; encoding: [0xff,0xff,0xac,0xda,0x01,0x00,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_read_u8_d16 a5, v1 offset:65535
+
+// GFX90A: ds_read_u8_d16 a255, v1 offset:65535 ; encoding: [0xff,0xff,0xac,0xda,0x01,0x00,0x00,0xff]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_read_u8_d16 a255, v1 offset:65535
+
+// GFX90A: ds_read_u8_d16 a5, v255 offset:65535 ; encoding: [0xff,0xff,0xac,0xda,0xff,0x00,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_read_u8_d16 a5, v255 offset:65535
+
+// GFX90A: ds_read_u8_d16 a5, v1           ; encoding: [0x00,0x00,0xac,0xda,0x01,0x00,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_read_u8_d16 a5, v1
+
+// GFX90A: ds_read_u8_d16 a5, v1           ; encoding: [0x00,0x00,0xac,0xda,0x01,0x00,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_read_u8_d16 a5, v1
+
+// GFX90A: ds_read_u8_d16 a5, v1 offset:4  ; encoding: [0x04,0x00,0xac,0xda,0x01,0x00,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_read_u8_d16 a5, v1 offset:4
+
+// GFX90A: ds_read_u8_d16 a5, v1 offset:65535 gds ; encoding: [0xff,0xff,0xad,0xda,0x01,0x00,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_read_u8_d16 a5, v1 offset:65535 gds
+
+// GFX90A: ds_read_u8_d16_hi a5, v1 offset:65535 ; encoding: [0xff,0xff,0xae,0xda,0x01,0x00,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_read_u8_d16_hi a5, v1 offset:65535
+
+// GFX90A: ds_read_u8_d16_hi a255, v1 offset:65535 ; encoding: [0xff,0xff,0xae,0xda,0x01,0x00,0x00,0xff]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_read_u8_d16_hi a255, v1 offset:65535
+
+// GFX90A: ds_read_u8_d16_hi a5, v255 offset:65535 ; encoding: [0xff,0xff,0xae,0xda,0xff,0x00,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_read_u8_d16_hi a5, v255 offset:65535
+
+// GFX90A: ds_read_u8_d16_hi a5, v1        ; encoding: [0x00,0x00,0xae,0xda,0x01,0x00,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_read_u8_d16_hi a5, v1
+
+// GFX90A: ds_read_u8_d16_hi a5, v1        ; encoding: [0x00,0x00,0xae,0xda,0x01,0x00,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_read_u8_d16_hi a5, v1
+
+// GFX90A: ds_read_u8_d16_hi a5, v1 offset:4 ; encoding: [0x04,0x00,0xae,0xda,0x01,0x00,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_read_u8_d16_hi a5, v1 offset:4
+
+// GFX90A: ds_read_u8_d16_hi a5, v1 offset:65535 gds ; encoding: [0xff,0xff,0xaf,0xda,0x01,0x00,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_read_u8_d16_hi a5, v1 offset:65535 gds
+
+// GFX90A: ds_read_i8_d16 a5, v1 offset:65535 ; encoding: [0xff,0xff,0xb0,0xda,0x01,0x00,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_read_i8_d16 a5, v1 offset:65535
+
+// GFX90A: ds_read_i8_d16 a255, v1 offset:65535 ; encoding: [0xff,0xff,0xb0,0xda,0x01,0x00,0x00,0xff]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_read_i8_d16 a255, v1 offset:65535
+
+// GFX90A: ds_read_i8_d16 a5, v255 offset:65535 ; encoding: [0xff,0xff,0xb0,0xda,0xff,0x00,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_read_i8_d16 a5, v255 offset:65535
+
+// GFX90A: ds_read_i8_d16 a5, v1           ; encoding: [0x00,0x00,0xb0,0xda,0x01,0x00,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_read_i8_d16 a5, v1
+
+// GFX90A: ds_read_i8_d16 a5, v1           ; encoding: [0x00,0x00,0xb0,0xda,0x01,0x00,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_read_i8_d16 a5, v1
+
+// GFX90A: ds_read_i8_d16 a5, v1 offset:4  ; encoding: [0x04,0x00,0xb0,0xda,0x01,0x00,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_read_i8_d16 a5, v1 offset:4
+
+// GFX90A: ds_read_i8_d16 a5, v1 offset:65535 gds ; encoding: [0xff,0xff,0xb1,0xda,0x01,0x00,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_read_i8_d16 a5, v1 offset:65535 gds
+
+// GFX90A: ds_read_i8_d16_hi a5, v1 offset:65535 ; encoding: [0xff,0xff,0xb2,0xda,0x01,0x00,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_read_i8_d16_hi a5, v1 offset:65535
+
+// GFX90A: ds_read_i8_d16_hi a255, v1 offset:65535 ; encoding: [0xff,0xff,0xb2,0xda,0x01,0x00,0x00,0xff]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_read_i8_d16_hi a255, v1 offset:65535
+
+// GFX90A: ds_read_i8_d16_hi a5, v255 offset:65535 ; encoding: [0xff,0xff,0xb2,0xda,0xff,0x00,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_read_i8_d16_hi a5, v255 offset:65535
+
+// GFX90A: ds_read_i8_d16_hi a5, v1        ; encoding: [0x00,0x00,0xb2,0xda,0x01,0x00,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_read_i8_d16_hi a5, v1
+
+// GFX90A: ds_read_i8_d16_hi a5, v1        ; encoding: [0x00,0x00,0xb2,0xda,0x01,0x00,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_read_i8_d16_hi a5, v1
+
+// GFX90A: ds_read_i8_d16_hi a5, v1 offset:4 ; encoding: [0x04,0x00,0xb2,0xda,0x01,0x00,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_read_i8_d16_hi a5, v1 offset:4
+
+// GFX90A: ds_read_i8_d16_hi a5, v1 offset:65535 gds ; encoding: [0xff,0xff,0xb3,0xda,0x01,0x00,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_read_i8_d16_hi a5, v1 offset:65535 gds
+
+// GFX90A: ds_read_u16_d16 a5, v1 offset:65535 ; encoding: [0xff,0xff,0xb4,0xda,0x01,0x00,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_read_u16_d16 a5, v1 offset:65535
+
+// GFX90A: ds_read_u16_d16 a255, v1 offset:65535 ; encoding: [0xff,0xff,0xb4,0xda,0x01,0x00,0x00,0xff]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_read_u16_d16 a255, v1 offset:65535
+
+// GFX90A: ds_read_u16_d16 a5, v255 offset:65535 ; encoding: [0xff,0xff,0xb4,0xda,0xff,0x00,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_read_u16_d16 a5, v255 offset:65535
+
+// GFX90A: ds_read_u16_d16 a5, v1          ; encoding: [0x00,0x00,0xb4,0xda,0x01,0x00,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_read_u16_d16 a5, v1
+
+// GFX90A: ds_read_u16_d16 a5, v1          ; encoding: [0x00,0x00,0xb4,0xda,0x01,0x00,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_read_u16_d16 a5, v1
+
+// GFX90A: ds_read_u16_d16 a5, v1 offset:4 ; encoding: [0x04,0x00,0xb4,0xda,0x01,0x00,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_read_u16_d16 a5, v1 offset:4
+
+// GFX90A: ds_read_u16_d16 a5, v1 offset:65535 gds ; encoding: [0xff,0xff,0xb5,0xda,0x01,0x00,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_read_u16_d16 a5, v1 offset:65535 gds
+
+// GFX90A: ds_read_u16_d16_hi a5, v1 offset:65535 ; encoding: [0xff,0xff,0xb6,0xda,0x01,0x00,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_read_u16_d16_hi a5, v1 offset:65535
+
+// GFX90A: ds_read_u16_d16_hi a255, v1 offset:65535 ; encoding: [0xff,0xff,0xb6,0xda,0x01,0x00,0x00,0xff]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_read_u16_d16_hi a255, v1 offset:65535
+
+// GFX90A: ds_read_u16_d16_hi a5, v255 offset:65535 ; encoding: [0xff,0xff,0xb6,0xda,0xff,0x00,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_read_u16_d16_hi a5, v255 offset:65535
+
+// GFX90A: ds_read_u16_d16_hi a5, v1       ; encoding: [0x00,0x00,0xb6,0xda,0x01,0x00,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_read_u16_d16_hi a5, v1
+
+// GFX90A: ds_read_u16_d16_hi a5, v1       ; encoding: [0x00,0x00,0xb6,0xda,0x01,0x00,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_read_u16_d16_hi a5, v1
+
+// GFX90A: ds_read_u16_d16_hi a5, v1 offset:4 ; encoding: [0x04,0x00,0xb6,0xda,0x01,0x00,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_read_u16_d16_hi a5, v1 offset:4
+
+// GFX90A: ds_read_u16_d16_hi a5, v1 offset:65535 gds ; encoding: [0xff,0xff,0xb7,0xda,0x01,0x00,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_read_u16_d16_hi a5, v1 offset:65535 gds
+
+// GFX90A: ds_add_rtn_u64 a[6:7], v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xc0,0xda,0x01,0x02,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_add_rtn_u64 a[6:7], v1, a[2:3] offset:65535
+
+// GFX90A: ds_add_rtn_u64 a[254:255], v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xc0,0xda,0x01,0x02,0x00,0xfe]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_add_rtn_u64 a[254:255], v1, a[2:3] offset:65535
+
+// GFX90A: ds_add_rtn_u64 a[6:7], v255, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xc0,0xda,0xff,0x02,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_add_rtn_u64 a[6:7], v255, a[2:3] offset:65535
+
+// GFX90A: ds_add_rtn_u64 a[6:7], v1, a[254:255] offset:65535 ; encoding: [0xff,0xff,0xc0,0xda,0x01,0xfe,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_add_rtn_u64 a[6:7], v1, a[254:255] offset:65535
+
+// GFX90A: ds_add_rtn_u64 a[6:7], v1, a[2:3] ; encoding: [0x00,0x00,0xc0,0xda,0x01,0x02,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_add_rtn_u64 a[6:7], v1, a[2:3]
+
+// GFX90A: ds_add_rtn_u64 a[6:7], v1, a[2:3] ; encoding: [0x00,0x00,0xc0,0xda,0x01,0x02,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_add_rtn_u64 a[6:7], v1, a[2:3]
+
+// GFX90A: ds_add_rtn_u64 a[6:7], v1, a[2:3] offset:4 ; encoding: [0x04,0x00,0xc0,0xda,0x01,0x02,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_add_rtn_u64 a[6:7], v1, a[2:3] offset:4
+
+// GFX90A: ds_add_rtn_u64 a[6:7], v1, a[2:3] offset:65535 gds ; encoding: [0xff,0xff,0xc1,0xda,0x01,0x02,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_add_rtn_u64 a[6:7], v1, a[2:3] offset:65535 gds
+
+// GFX90A: ds_sub_rtn_u64 a[6:7], v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xc2,0xda,0x01,0x02,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_sub_rtn_u64 a[6:7], v1, a[2:3] offset:65535
+
+// GFX90A: ds_sub_rtn_u64 a[254:255], v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xc2,0xda,0x01,0x02,0x00,0xfe]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_sub_rtn_u64 a[254:255], v1, a[2:3] offset:65535
+
+// GFX90A: ds_sub_rtn_u64 a[6:7], v255, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xc2,0xda,0xff,0x02,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_sub_rtn_u64 a[6:7], v255, a[2:3] offset:65535
+
+// GFX90A: ds_sub_rtn_u64 a[6:7], v1, a[254:255] offset:65535 ; encoding: [0xff,0xff,0xc2,0xda,0x01,0xfe,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_sub_rtn_u64 a[6:7], v1, a[254:255] offset:65535
+
+// GFX90A: ds_sub_rtn_u64 a[6:7], v1, a[2:3] ; encoding: [0x00,0x00,0xc2,0xda,0x01,0x02,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_sub_rtn_u64 a[6:7], v1, a[2:3]
+
+// GFX90A: ds_sub_rtn_u64 a[6:7], v1, a[2:3] ; encoding: [0x00,0x00,0xc2,0xda,0x01,0x02,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_sub_rtn_u64 a[6:7], v1, a[2:3]
+
+// GFX90A: ds_sub_rtn_u64 a[6:7], v1, a[2:3] offset:4 ; encoding: [0x04,0x00,0xc2,0xda,0x01,0x02,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_sub_rtn_u64 a[6:7], v1, a[2:3] offset:4
+
+// GFX90A: ds_sub_rtn_u64 a[6:7], v1, a[2:3] offset:65535 gds ; encoding: [0xff,0xff,0xc3,0xda,0x01,0x02,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_sub_rtn_u64 a[6:7], v1, a[2:3] offset:65535 gds
+
+// GFX90A: ds_rsub_rtn_u64 a[6:7], v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xc4,0xda,0x01,0x02,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_rsub_rtn_u64 a[6:7], v1, a[2:3] offset:65535
+
+// GFX90A: ds_rsub_rtn_u64 a[254:255], v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xc4,0xda,0x01,0x02,0x00,0xfe]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_rsub_rtn_u64 a[254:255], v1, a[2:3] offset:65535
+
+// GFX90A: ds_rsub_rtn_u64 a[6:7], v255, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xc4,0xda,0xff,0x02,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_rsub_rtn_u64 a[6:7], v255, a[2:3] offset:65535
+
+// GFX90A: ds_rsub_rtn_u64 a[6:7], v1, a[254:255] offset:65535 ; encoding: [0xff,0xff,0xc4,0xda,0x01,0xfe,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_rsub_rtn_u64 a[6:7], v1, a[254:255] offset:65535
+
+// GFX90A: ds_rsub_rtn_u64 a[6:7], v1, a[2:3] ; encoding: [0x00,0x00,0xc4,0xda,0x01,0x02,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_rsub_rtn_u64 a[6:7], v1, a[2:3]
+
+// GFX90A: ds_rsub_rtn_u64 a[6:7], v1, a[2:3] ; encoding: [0x00,0x00,0xc4,0xda,0x01,0x02,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_rsub_rtn_u64 a[6:7], v1, a[2:3]
+
+// GFX90A: ds_rsub_rtn_u64 a[6:7], v1, a[2:3] offset:4 ; encoding: [0x04,0x00,0xc4,0xda,0x01,0x02,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_rsub_rtn_u64 a[6:7], v1, a[2:3] offset:4
+
+// GFX90A: ds_rsub_rtn_u64 a[6:7], v1, a[2:3] offset:65535 gds ; encoding: [0xff,0xff,0xc5,0xda,0x01,0x02,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_rsub_rtn_u64 a[6:7], v1, a[2:3] offset:65535 gds
+
+// GFX90A: ds_inc_rtn_u64 a[6:7], v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xc6,0xda,0x01,0x02,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_inc_rtn_u64 a[6:7], v1, a[2:3] offset:65535
+
+// GFX90A: ds_inc_rtn_u64 a[254:255], v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xc6,0xda,0x01,0x02,0x00,0xfe]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_inc_rtn_u64 a[254:255], v1, a[2:3] offset:65535
+
+// GFX90A: ds_inc_rtn_u64 a[6:7], v255, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xc6,0xda,0xff,0x02,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_inc_rtn_u64 a[6:7], v255, a[2:3] offset:65535
+
+// GFX90A: ds_inc_rtn_u64 a[6:7], v1, a[254:255] offset:65535 ; encoding: [0xff,0xff,0xc6,0xda,0x01,0xfe,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_inc_rtn_u64 a[6:7], v1, a[254:255] offset:65535
+
+// GFX90A: ds_inc_rtn_u64 a[6:7], v1, a[2:3] ; encoding: [0x00,0x00,0xc6,0xda,0x01,0x02,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_inc_rtn_u64 a[6:7], v1, a[2:3]
+
+// GFX90A: ds_inc_rtn_u64 a[6:7], v1, a[2:3] ; encoding: [0x00,0x00,0xc6,0xda,0x01,0x02,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_inc_rtn_u64 a[6:7], v1, a[2:3]
+
+// GFX90A: ds_inc_rtn_u64 a[6:7], v1, a[2:3] offset:4 ; encoding: [0x04,0x00,0xc6,0xda,0x01,0x02,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_inc_rtn_u64 a[6:7], v1, a[2:3] offset:4
+
+// GFX90A: ds_inc_rtn_u64 a[6:7], v1, a[2:3] offset:65535 gds ; encoding: [0xff,0xff,0xc7,0xda,0x01,0x02,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_inc_rtn_u64 a[6:7], v1, a[2:3] offset:65535 gds
+
+// GFX90A: ds_dec_rtn_u64 a[6:7], v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xc8,0xda,0x01,0x02,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_dec_rtn_u64 a[6:7], v1, a[2:3] offset:65535
+
+// GFX90A: ds_dec_rtn_u64 a[254:255], v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xc8,0xda,0x01,0x02,0x00,0xfe]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_dec_rtn_u64 a[254:255], v1, a[2:3] offset:65535
+
+// GFX90A: ds_dec_rtn_u64 a[6:7], v255, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xc8,0xda,0xff,0x02,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_dec_rtn_u64 a[6:7], v255, a[2:3] offset:65535
+
+// GFX90A: ds_dec_rtn_u64 a[6:7], v1, a[254:255] offset:65535 ; encoding: [0xff,0xff,0xc8,0xda,0x01,0xfe,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_dec_rtn_u64 a[6:7], v1, a[254:255] offset:65535
+
+// GFX90A: ds_dec_rtn_u64 a[6:7], v1, a[2:3] ; encoding: [0x00,0x00,0xc8,0xda,0x01,0x02,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_dec_rtn_u64 a[6:7], v1, a[2:3]
+
+// GFX90A: ds_dec_rtn_u64 a[6:7], v1, a[2:3] ; encoding: [0x00,0x00,0xc8,0xda,0x01,0x02,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_dec_rtn_u64 a[6:7], v1, a[2:3]
+
+// GFX90A: ds_dec_rtn_u64 a[6:7], v1, a[2:3] offset:4 ; encoding: [0x04,0x00,0xc8,0xda,0x01,0x02,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_dec_rtn_u64 a[6:7], v1, a[2:3] offset:4
+
+// GFX90A: ds_dec_rtn_u64 a[6:7], v1, a[2:3] offset:65535 gds ; encoding: [0xff,0xff,0xc9,0xda,0x01,0x02,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_dec_rtn_u64 a[6:7], v1, a[2:3] offset:65535 gds
+
+// GFX90A: ds_min_rtn_i64 a[6:7], v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xca,0xda,0x01,0x02,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_min_rtn_i64 a[6:7], v1, a[2:3] offset:65535
+
+// GFX90A: ds_min_rtn_i64 a[254:255], v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xca,0xda,0x01,0x02,0x00,0xfe]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_min_rtn_i64 a[254:255], v1, a[2:3] offset:65535
+
+// GFX90A: ds_min_rtn_i64 a[6:7], v255, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xca,0xda,0xff,0x02,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_min_rtn_i64 a[6:7], v255, a[2:3] offset:65535
+
+// GFX90A: ds_min_rtn_i64 a[6:7], v1, a[254:255] offset:65535 ; encoding: [0xff,0xff,0xca,0xda,0x01,0xfe,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_min_rtn_i64 a[6:7], v1, a[254:255] offset:65535
+
+// GFX90A: ds_min_rtn_i64 a[6:7], v1, a[2:3] ; encoding: [0x00,0x00,0xca,0xda,0x01,0x02,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_min_rtn_i64 a[6:7], v1, a[2:3]
+
+// GFX90A: ds_min_rtn_i64 a[6:7], v1, a[2:3] ; encoding: [0x00,0x00,0xca,0xda,0x01,0x02,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_min_rtn_i64 a[6:7], v1, a[2:3]
+
+// GFX90A: ds_min_rtn_i64 a[6:7], v1, a[2:3] offset:4 ; encoding: [0x04,0x00,0xca,0xda,0x01,0x02,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_min_rtn_i64 a[6:7], v1, a[2:3] offset:4
+
+// GFX90A: ds_min_rtn_i64 a[6:7], v1, a[2:3] offset:65535 gds ; encoding: [0xff,0xff,0xcb,0xda,0x01,0x02,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_min_rtn_i64 a[6:7], v1, a[2:3] offset:65535 gds
+
+// GFX90A: ds_max_rtn_i64 a[6:7], v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xcc,0xda,0x01,0x02,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_max_rtn_i64 a[6:7], v1, a[2:3] offset:65535
+
+// GFX90A: ds_max_rtn_i64 a[254:255], v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xcc,0xda,0x01,0x02,0x00,0xfe]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_max_rtn_i64 a[254:255], v1, a[2:3] offset:65535
+
+// GFX90A: ds_max_rtn_i64 a[6:7], v255, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xcc,0xda,0xff,0x02,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_max_rtn_i64 a[6:7], v255, a[2:3] offset:65535
+
+// GFX90A: ds_max_rtn_i64 a[6:7], v1, a[254:255] offset:65535 ; encoding: [0xff,0xff,0xcc,0xda,0x01,0xfe,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_max_rtn_i64 a[6:7], v1, a[254:255] offset:65535
+
+// GFX90A: ds_max_rtn_i64 a[6:7], v1, a[2:3] ; encoding: [0x00,0x00,0xcc,0xda,0x01,0x02,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_max_rtn_i64 a[6:7], v1, a[2:3]
+
+// GFX90A: ds_max_rtn_i64 a[6:7], v1, a[2:3] ; encoding: [0x00,0x00,0xcc,0xda,0x01,0x02,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_max_rtn_i64 a[6:7], v1, a[2:3]
+
+// GFX90A: ds_max_rtn_i64 a[6:7], v1, a[2:3] offset:4 ; encoding: [0x04,0x00,0xcc,0xda,0x01,0x02,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_max_rtn_i64 a[6:7], v1, a[2:3] offset:4
+
+// GFX90A: ds_max_rtn_i64 a[6:7], v1, a[2:3] offset:65535 gds ; encoding: [0xff,0xff,0xcd,0xda,0x01,0x02,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_max_rtn_i64 a[6:7], v1, a[2:3] offset:65535 gds
+
+// GFX90A: ds_min_rtn_u64 a[6:7], v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xce,0xda,0x01,0x02,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_min_rtn_u64 a[6:7], v1, a[2:3] offset:65535
+
+// GFX90A: ds_min_rtn_u64 a[254:255], v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xce,0xda,0x01,0x02,0x00,0xfe]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_min_rtn_u64 a[254:255], v1, a[2:3] offset:65535
+
+// GFX90A: ds_min_rtn_u64 a[6:7], v255, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xce,0xda,0xff,0x02,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_min_rtn_u64 a[6:7], v255, a[2:3] offset:65535
+
+// GFX90A: ds_min_rtn_u64 a[6:7], v1, a[254:255] offset:65535 ; encoding: [0xff,0xff,0xce,0xda,0x01,0xfe,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_min_rtn_u64 a[6:7], v1, a[254:255] offset:65535
+
+// GFX90A: ds_min_rtn_u64 a[6:7], v1, a[2:3] ; encoding: [0x00,0x00,0xce,0xda,0x01,0x02,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_min_rtn_u64 a[6:7], v1, a[2:3]
+
+// GFX90A: ds_min_rtn_u64 a[6:7], v1, a[2:3] ; encoding: [0x00,0x00,0xce,0xda,0x01,0x02,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_min_rtn_u64 a[6:7], v1, a[2:3]
+
+// GFX90A: ds_min_rtn_u64 a[6:7], v1, a[2:3] offset:4 ; encoding: [0x04,0x00,0xce,0xda,0x01,0x02,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_min_rtn_u64 a[6:7], v1, a[2:3] offset:4
+
+// GFX90A: ds_min_rtn_u64 a[6:7], v1, a[2:3] offset:65535 gds ; encoding: [0xff,0xff,0xcf,0xda,0x01,0x02,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_min_rtn_u64 a[6:7], v1, a[2:3] offset:65535 gds
+
+// GFX90A: ds_max_rtn_u64 a[6:7], v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xd0,0xda,0x01,0x02,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_max_rtn_u64 a[6:7], v1, a[2:3] offset:65535
+
+// GFX90A: ds_max_rtn_u64 a[254:255], v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xd0,0xda,0x01,0x02,0x00,0xfe]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_max_rtn_u64 a[254:255], v1, a[2:3] offset:65535
+
+// GFX90A: ds_max_rtn_u64 a[6:7], v255, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xd0,0xda,0xff,0x02,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_max_rtn_u64 a[6:7], v255, a[2:3] offset:65535
+
+// GFX90A: ds_max_rtn_u64 a[6:7], v1, a[254:255] offset:65535 ; encoding: [0xff,0xff,0xd0,0xda,0x01,0xfe,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_max_rtn_u64 a[6:7], v1, a[254:255] offset:65535
+
+// GFX90A: ds_max_rtn_u64 a[6:7], v1, a[2:3] ; encoding: [0x00,0x00,0xd0,0xda,0x01,0x02,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_max_rtn_u64 a[6:7], v1, a[2:3]
+
+// GFX90A: ds_max_rtn_u64 a[6:7], v1, a[2:3] ; encoding: [0x00,0x00,0xd0,0xda,0x01,0x02,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_max_rtn_u64 a[6:7], v1, a[2:3]
+
+// GFX90A: ds_max_rtn_u64 a[6:7], v1, a[2:3] offset:4 ; encoding: [0x04,0x00,0xd0,0xda,0x01,0x02,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_max_rtn_u64 a[6:7], v1, a[2:3] offset:4
+
+// GFX90A: ds_max_rtn_u64 a[6:7], v1, a[2:3] offset:65535 gds ; encoding: [0xff,0xff,0xd1,0xda,0x01,0x02,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_max_rtn_u64 a[6:7], v1, a[2:3] offset:65535 gds
+
+// GFX90A: ds_and_rtn_b64 a[6:7], v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xd2,0xda,0x01,0x02,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_and_rtn_b64 a[6:7], v1, a[2:3] offset:65535
+
+// GFX90A: ds_and_rtn_b64 a[254:255], v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xd2,0xda,0x01,0x02,0x00,0xfe]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_and_rtn_b64 a[254:255], v1, a[2:3] offset:65535
+
+// GFX90A: ds_and_rtn_b64 a[6:7], v255, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xd2,0xda,0xff,0x02,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_and_rtn_b64 a[6:7], v255, a[2:3] offset:65535
+
+// GFX90A: ds_and_rtn_b64 a[6:7], v1, a[254:255] offset:65535 ; encoding: [0xff,0xff,0xd2,0xda,0x01,0xfe,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_and_rtn_b64 a[6:7], v1, a[254:255] offset:65535
+
+// GFX90A: ds_and_rtn_b64 a[6:7], v1, a[2:3] ; encoding: [0x00,0x00,0xd2,0xda,0x01,0x02,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_and_rtn_b64 a[6:7], v1, a[2:3]
+
+// GFX90A: ds_and_rtn_b64 a[6:7], v1, a[2:3] ; encoding: [0x00,0x00,0xd2,0xda,0x01,0x02,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_and_rtn_b64 a[6:7], v1, a[2:3]
+
+// GFX90A: ds_and_rtn_b64 a[6:7], v1, a[2:3] offset:4 ; encoding: [0x04,0x00,0xd2,0xda,0x01,0x02,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_and_rtn_b64 a[6:7], v1, a[2:3] offset:4
+
+// GFX90A: ds_and_rtn_b64 a[6:7], v1, a[2:3] offset:65535 gds ; encoding: [0xff,0xff,0xd3,0xda,0x01,0x02,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_and_rtn_b64 a[6:7], v1, a[2:3] offset:65535 gds
+
+// GFX90A: ds_or_rtn_b64 a[6:7], v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xd4,0xda,0x01,0x02,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_or_rtn_b64 a[6:7], v1, a[2:3] offset:65535
+
+// GFX90A: ds_or_rtn_b64 a[254:255], v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xd4,0xda,0x01,0x02,0x00,0xfe]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_or_rtn_b64 a[254:255], v1, a[2:3] offset:65535
+
+// GFX90A: ds_or_rtn_b64 a[6:7], v255, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xd4,0xda,0xff,0x02,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_or_rtn_b64 a[6:7], v255, a[2:3] offset:65535
+
+// GFX90A: ds_or_rtn_b64 a[6:7], v1, a[254:255] offset:65535 ; encoding: [0xff,0xff,0xd4,0xda,0x01,0xfe,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_or_rtn_b64 a[6:7], v1, a[254:255] offset:65535
+
+// GFX90A: ds_or_rtn_b64 a[6:7], v1, a[2:3] ; encoding: [0x00,0x00,0xd4,0xda,0x01,0x02,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_or_rtn_b64 a[6:7], v1, a[2:3]
+
+// GFX90A: ds_or_rtn_b64 a[6:7], v1, a[2:3] ; encoding: [0x00,0x00,0xd4,0xda,0x01,0x02,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_or_rtn_b64 a[6:7], v1, a[2:3]
+
+// GFX90A: ds_or_rtn_b64 a[6:7], v1, a[2:3] offset:4 ; encoding: [0x04,0x00,0xd4,0xda,0x01,0x02,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_or_rtn_b64 a[6:7], v1, a[2:3] offset:4
+
+// GFX90A: ds_or_rtn_b64 a[6:7], v1, a[2:3] offset:65535 gds ; encoding: [0xff,0xff,0xd5,0xda,0x01,0x02,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_or_rtn_b64 a[6:7], v1, a[2:3] offset:65535 gds
+
+// GFX90A: ds_xor_rtn_b64 a[6:7], v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xd6,0xda,0x01,0x02,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_xor_rtn_b64 a[6:7], v1, a[2:3] offset:65535
+
+// GFX90A: ds_xor_rtn_b64 a[254:255], v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xd6,0xda,0x01,0x02,0x00,0xfe]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_xor_rtn_b64 a[254:255], v1, a[2:3] offset:65535
+
+// GFX90A: ds_xor_rtn_b64 a[6:7], v255, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xd6,0xda,0xff,0x02,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_xor_rtn_b64 a[6:7], v255, a[2:3] offset:65535
+
+// GFX90A: ds_xor_rtn_b64 a[6:7], v1, a[254:255] offset:65535 ; encoding: [0xff,0xff,0xd6,0xda,0x01,0xfe,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_xor_rtn_b64 a[6:7], v1, a[254:255] offset:65535
+
+// GFX90A: ds_xor_rtn_b64 a[6:7], v1, a[2:3] ; encoding: [0x00,0x00,0xd6,0xda,0x01,0x02,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_xor_rtn_b64 a[6:7], v1, a[2:3]
+
+// GFX90A: ds_xor_rtn_b64 a[6:7], v1, a[2:3] ; encoding: [0x00,0x00,0xd6,0xda,0x01,0x02,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_xor_rtn_b64 a[6:7], v1, a[2:3]
+
+// GFX90A: ds_xor_rtn_b64 a[6:7], v1, a[2:3] offset:4 ; encoding: [0x04,0x00,0xd6,0xda,0x01,0x02,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_xor_rtn_b64 a[6:7], v1, a[2:3] offset:4
+
+// GFX90A: ds_xor_rtn_b64 a[6:7], v1, a[2:3] offset:65535 gds ; encoding: [0xff,0xff,0xd7,0xda,0x01,0x02,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_xor_rtn_b64 a[6:7], v1, a[2:3] offset:65535 gds
+
+// GFX90A: ds_mskor_rtn_b64 a[6:7], v1, a[2:3], a[4:5] offset:65535 ; encoding: [0xff,0xff,0xd8,0xda,0x01,0x02,0x04,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_mskor_rtn_b64 a[6:7], v1, a[2:3], a[4:5] offset:65535
+
+// GFX90A: ds_mskor_rtn_b64 a[254:255], v1, a[2:3], a[4:5] offset:65535 ; encoding: [0xff,0xff,0xd8,0xda,0x01,0x02,0x04,0xfe]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_mskor_rtn_b64 a[254:255], v1, a[2:3], a[4:5] offset:65535
+
+// GFX90A: ds_mskor_rtn_b64 a[6:7], v255, a[2:3], a[4:5] offset:65535 ; encoding: [0xff,0xff,0xd8,0xda,0xff,0x02,0x04,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_mskor_rtn_b64 a[6:7], v255, a[2:3], a[4:5] offset:65535
+
+// GFX90A: ds_mskor_rtn_b64 a[6:7], v1, a[254:255], a[4:5] offset:65535 ; encoding: [0xff,0xff,0xd8,0xda,0x01,0xfe,0x04,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_mskor_rtn_b64 a[6:7], v1, a[254:255], a[4:5] offset:65535
+
+// GFX90A: ds_mskor_rtn_b64 a[6:7], v1, a[2:3], a[254:255] offset:65535 ; encoding: [0xff,0xff,0xd8,0xda,0x01,0x02,0xfe,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_mskor_rtn_b64 a[6:7], v1, a[2:3], a[254:255] offset:65535
+
+// GFX90A: ds_mskor_rtn_b64 a[6:7], v1, a[2:3], a[4:5] ; encoding: [0x00,0x00,0xd8,0xda,0x01,0x02,0x04,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_mskor_rtn_b64 a[6:7], v1, a[2:3], a[4:5]
+
+// GFX90A: ds_mskor_rtn_b64 a[6:7], v1, a[2:3], a[4:5] ; encoding: [0x00,0x00,0xd8,0xda,0x01,0x02,0x04,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_mskor_rtn_b64 a[6:7], v1, a[2:3], a[4:5]
+
+// GFX90A: ds_mskor_rtn_b64 a[6:7], v1, a[2:3], a[4:5] offset:4 ; encoding: [0x04,0x00,0xd8,0xda,0x01,0x02,0x04,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_mskor_rtn_b64 a[6:7], v1, a[2:3], a[4:5] offset:4
+
+// GFX90A: ds_mskor_rtn_b64 a[6:7], v1, a[2:3], a[4:5] offset:65535 gds ; encoding: [0xff,0xff,0xd9,0xda,0x01,0x02,0x04,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_mskor_rtn_b64 a[6:7], v1, a[2:3], a[4:5] offset:65535 gds
+
+// GFX90A: ds_wrxchg_rtn_b64 a[6:7], v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xda,0xda,0x01,0x02,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_wrxchg_rtn_b64 a[6:7], v1, a[2:3] offset:65535
+
+// GFX90A: ds_wrxchg_rtn_b64 a[254:255], v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xda,0xda,0x01,0x02,0x00,0xfe]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_wrxchg_rtn_b64 a[254:255], v1, a[2:3] offset:65535
+
+// GFX90A: ds_wrxchg_rtn_b64 a[6:7], v255, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xda,0xda,0xff,0x02,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_wrxchg_rtn_b64 a[6:7], v255, a[2:3] offset:65535
+
+// GFX90A: ds_wrxchg_rtn_b64 a[6:7], v1, a[254:255] offset:65535 ; encoding: [0xff,0xff,0xda,0xda,0x01,0xfe,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_wrxchg_rtn_b64 a[6:7], v1, a[254:255] offset:65535
+
+// GFX90A: ds_wrxchg_rtn_b64 a[6:7], v1, a[2:3] ; encoding: [0x00,0x00,0xda,0xda,0x01,0x02,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_wrxchg_rtn_b64 a[6:7], v1, a[2:3]
+
+// GFX90A: ds_wrxchg_rtn_b64 a[6:7], v1, a[2:3] ; encoding: [0x00,0x00,0xda,0xda,0x01,0x02,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_wrxchg_rtn_b64 a[6:7], v1, a[2:3]
+
+// GFX90A: ds_wrxchg_rtn_b64 a[6:7], v1, a[2:3] offset:4 ; encoding: [0x04,0x00,0xda,0xda,0x01,0x02,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_wrxchg_rtn_b64 a[6:7], v1, a[2:3] offset:4
+
+// GFX90A: ds_wrxchg_rtn_b64 a[6:7], v1, a[2:3] offset:65535 gds ; encoding: [0xff,0xff,0xdb,0xda,0x01,0x02,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_wrxchg_rtn_b64 a[6:7], v1, a[2:3] offset:65535 gds
+
+// GFX90A: ds_wrxchg2_rtn_b64 a[6:9], v1, a[2:3], a[4:5] offset0:127 offset1:255 ; encoding: [0x7f,0xff,0xdc,0xda,0x01,0x02,0x04,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_wrxchg2_rtn_b64 a[6:9], v1, a[2:3], a[4:5] offset0:127 offset1:255
+
+// GFX90A: ds_wrxchg2_rtn_b64 a[252:255], v1, a[2:3], a[4:5] offset0:127 offset1:255 ; encoding: [0x7f,0xff,0xdc,0xda,0x01,0x02,0x04,0xfc]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_wrxchg2_rtn_b64 a[252:255], v1, a[2:3], a[4:5] offset0:127 offset1:255
+
+// GFX90A: ds_wrxchg2_rtn_b64 a[6:9], v255, a[2:3], a[4:5] offset0:127 offset1:255 ; encoding: [0x7f,0xff,0xdc,0xda,0xff,0x02,0x04,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_wrxchg2_rtn_b64 a[6:9], v255, a[2:3], a[4:5] offset0:127 offset1:255
+
+// GFX90A: ds_wrxchg2_rtn_b64 a[6:9], v1, a[254:255], a[4:5] offset0:127 offset1:255 ; encoding: [0x7f,0xff,0xdc,0xda,0x01,0xfe,0x04,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_wrxchg2_rtn_b64 a[6:9], v1, a[254:255], a[4:5] offset0:127 offset1:255
+
+// GFX90A: ds_wrxchg2_rtn_b64 a[6:9], v1, a[2:3], a[254:255] offset0:127 offset1:255 ; encoding: [0x7f,0xff,0xdc,0xda,0x01,0x02,0xfe,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_wrxchg2_rtn_b64 a[6:9], v1, a[2:3], a[254:255] offset0:127 offset1:255
+
+// GFX90A: ds_wrxchg2_rtn_b64 a[6:9], v1, a[2:3], a[4:5] offset1:255 ; encoding: [0x00,0xff,0xdc,0xda,0x01,0x02,0x04,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_wrxchg2_rtn_b64 a[6:9], v1, a[2:3], a[4:5] offset1:255
+
+// GFX90A: ds_wrxchg2_rtn_b64 a[6:9], v1, a[2:3], a[4:5] offset1:255 ; encoding: [0x00,0xff,0xdc,0xda,0x01,0x02,0x04,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_wrxchg2_rtn_b64 a[6:9], v1, a[2:3], a[4:5] offset1:255
+
+// GFX90A: ds_wrxchg2_rtn_b64 a[6:9], v1, a[2:3], a[4:5] offset0:16 offset1:255 ; encoding: [0x10,0xff,0xdc,0xda,0x01,0x02,0x04,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_wrxchg2_rtn_b64 a[6:9], v1, a[2:3], a[4:5] offset0:16 offset1:255
+
+// GFX90A: ds_wrxchg2_rtn_b64 a[6:9], v1, a[2:3], a[4:5] offset0:127 ; encoding: [0x7f,0x00,0xdc,0xda,0x01,0x02,0x04,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_wrxchg2_rtn_b64 a[6:9], v1, a[2:3], a[4:5] offset0:127
+
+// GFX90A: ds_wrxchg2_rtn_b64 a[6:9], v1, a[2:3], a[4:5] offset0:127 ; encoding: [0x7f,0x00,0xdc,0xda,0x01,0x02,0x04,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_wrxchg2_rtn_b64 a[6:9], v1, a[2:3], a[4:5] offset0:127
+
+// GFX90A: ds_wrxchg2_rtn_b64 a[6:9], v1, a[2:3], a[4:5] offset0:127 offset1:1 ; encoding: [0x7f,0x01,0xdc,0xda,0x01,0x02,0x04,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_wrxchg2_rtn_b64 a[6:9], v1, a[2:3], a[4:5] offset0:127 offset1:1
+
+// GFX90A: ds_wrxchg2_rtn_b64 a[6:9], v1, a[2:3], a[4:5] offset0:127 offset1:255 gds ; encoding: [0x7f,0xff,0xdd,0xda,0x01,0x02,0x04,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_wrxchg2_rtn_b64 a[6:9], v1, a[2:3], a[4:5] offset0:127 offset1:255 gds
+
+// GFX90A: ds_wrxchg2st64_rtn_b64 a[6:9], v1, a[2:3], a[4:5] offset0:127 offset1:255 ; encoding: [0x7f,0xff,0xde,0xda,0x01,0x02,0x04,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_wrxchg2st64_rtn_b64 a[6:9], v1, a[2:3], a[4:5] offset0:127 offset1:255
+
+// GFX90A: ds_wrxchg2st64_rtn_b64 a[252:255], v1, a[2:3], a[4:5] offset0:127 offset1:255 ; encoding: [0x7f,0xff,0xde,0xda,0x01,0x02,0x04,0xfc]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_wrxchg2st64_rtn_b64 a[252:255], v1, a[2:3], a[4:5] offset0:127 offset1:255
+
+// GFX90A: ds_wrxchg2st64_rtn_b64 a[6:9], v255, a[2:3], a[4:5] offset0:127 offset1:255 ; encoding: [0x7f,0xff,0xde,0xda,0xff,0x02,0x04,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_wrxchg2st64_rtn_b64 a[6:9], v255, a[2:3], a[4:5] offset0:127 offset1:255
+
+// GFX90A: ds_wrxchg2st64_rtn_b64 a[6:9], v1, a[254:255], a[4:5] offset0:127 offset1:255 ; encoding: [0x7f,0xff,0xde,0xda,0x01,0xfe,0x04,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_wrxchg2st64_rtn_b64 a[6:9], v1, a[254:255], a[4:5] offset0:127 offset1:255
+
+// GFX90A: ds_wrxchg2st64_rtn_b64 a[6:9], v1, a[2:3], a[254:255] offset0:127 offset1:255 ; encoding: [0x7f,0xff,0xde,0xda,0x01,0x02,0xfe,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_wrxchg2st64_rtn_b64 a[6:9], v1, a[2:3], a[254:255] offset0:127 offset1:255
+
+// GFX90A: ds_wrxchg2st64_rtn_b64 a[6:9], v1, a[2:3], a[4:5] offset1:255 ; encoding: [0x00,0xff,0xde,0xda,0x01,0x02,0x04,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_wrxchg2st64_rtn_b64 a[6:9], v1, a[2:3], a[4:5] offset1:255
+
+// GFX90A: ds_wrxchg2st64_rtn_b64 a[6:9], v1, a[2:3], a[4:5] offset1:255 ; encoding: [0x00,0xff,0xde,0xda,0x01,0x02,0x04,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_wrxchg2st64_rtn_b64 a[6:9], v1, a[2:3], a[4:5] offset1:255
+
+// GFX90A: ds_wrxchg2st64_rtn_b64 a[6:9], v1, a[2:3], a[4:5] offset0:16 offset1:255 ; encoding: [0x10,0xff,0xde,0xda,0x01,0x02,0x04,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_wrxchg2st64_rtn_b64 a[6:9], v1, a[2:3], a[4:5] offset0:16 offset1:255
+
+// GFX90A: ds_wrxchg2st64_rtn_b64 a[6:9], v1, a[2:3], a[4:5] offset0:127 ; encoding: [0x7f,0x00,0xde,0xda,0x01,0x02,0x04,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_wrxchg2st64_rtn_b64 a[6:9], v1, a[2:3], a[4:5] offset0:127
+
+// GFX90A: ds_wrxchg2st64_rtn_b64 a[6:9], v1, a[2:3], a[4:5] offset0:127 ; encoding: [0x7f,0x00,0xde,0xda,0x01,0x02,0x04,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_wrxchg2st64_rtn_b64 a[6:9], v1, a[2:3], a[4:5] offset0:127
+
+// GFX90A: ds_wrxchg2st64_rtn_b64 a[6:9], v1, a[2:3], a[4:5] offset0:127 offset1:1 ; encoding: [0x7f,0x01,0xde,0xda,0x01,0x02,0x04,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_wrxchg2st64_rtn_b64 a[6:9], v1, a[2:3], a[4:5] offset0:127 offset1:1
+
+// GFX90A: ds_wrxchg2st64_rtn_b64 a[6:9], v1, a[2:3], a[4:5] offset0:127 offset1:255 gds ; encoding: [0x7f,0xff,0xdf,0xda,0x01,0x02,0x04,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_wrxchg2st64_rtn_b64 a[6:9], v1, a[2:3], a[4:5] offset0:127 offset1:255 gds
+
+// GFX90A: ds_cmpst_rtn_b64 a[6:7], v1, a[2:3], a[4:5] offset:65535 ; encoding: [0xff,0xff,0xe0,0xda,0x01,0x02,0x04,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_cmpst_rtn_b64 a[6:7], v1, a[2:3], a[4:5] offset:65535
+
+// GFX90A: ds_cmpst_rtn_b64 a[254:255], v1, a[2:3], a[4:5] offset:65535 ; encoding: [0xff,0xff,0xe0,0xda,0x01,0x02,0x04,0xfe]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_cmpst_rtn_b64 a[254:255], v1, a[2:3], a[4:5] offset:65535
+
+// GFX90A: ds_cmpst_rtn_b64 a[6:7], v255, a[2:3], a[4:5] offset:65535 ; encoding: [0xff,0xff,0xe0,0xda,0xff,0x02,0x04,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_cmpst_rtn_b64 a[6:7], v255, a[2:3], a[4:5] offset:65535
+
+// GFX90A: ds_cmpst_rtn_b64 a[6:7], v1, a[254:255], a[4:5] offset:65535 ; encoding: [0xff,0xff,0xe0,0xda,0x01,0xfe,0x04,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_cmpst_rtn_b64 a[6:7], v1, a[254:255], a[4:5] offset:65535
+
+// GFX90A: ds_cmpst_rtn_b64 a[6:7], v1, a[2:3], a[254:255] offset:65535 ; encoding: [0xff,0xff,0xe0,0xda,0x01,0x02,0xfe,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_cmpst_rtn_b64 a[6:7], v1, a[2:3], a[254:255] offset:65535
+
+// GFX90A: ds_cmpst_rtn_b64 a[6:7], v1, a[2:3], a[4:5] ; encoding: [0x00,0x00,0xe0,0xda,0x01,0x02,0x04,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_cmpst_rtn_b64 a[6:7], v1, a[2:3], a[4:5]
+
+// GFX90A: ds_cmpst_rtn_b64 a[6:7], v1, a[2:3], a[4:5] ; encoding: [0x00,0x00,0xe0,0xda,0x01,0x02,0x04,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_cmpst_rtn_b64 a[6:7], v1, a[2:3], a[4:5]
+
+// GFX90A: ds_cmpst_rtn_b64 a[6:7], v1, a[2:3], a[4:5] offset:4 ; encoding: [0x04,0x00,0xe0,0xda,0x01,0x02,0x04,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_cmpst_rtn_b64 a[6:7], v1, a[2:3], a[4:5] offset:4
+
+// GFX90A: ds_cmpst_rtn_b64 a[6:7], v1, a[2:3], a[4:5] offset:65535 gds ; encoding: [0xff,0xff,0xe1,0xda,0x01,0x02,0x04,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_cmpst_rtn_b64 a[6:7], v1, a[2:3], a[4:5] offset:65535 gds
+
+// GFX90A: ds_cmpst_rtn_f64 a[6:7], v1, a[2:3], a[4:5] offset:65535 ; encoding: [0xff,0xff,0xe2,0xda,0x01,0x02,0x04,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_cmpst_rtn_f64 a[6:7], v1, a[2:3], a[4:5] offset:65535
+
+// GFX90A: ds_cmpst_rtn_f64 a[254:255], v1, a[2:3], a[4:5] offset:65535 ; encoding: [0xff,0xff,0xe2,0xda,0x01,0x02,0x04,0xfe]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_cmpst_rtn_f64 a[254:255], v1, a[2:3], a[4:5] offset:65535
+
+// GFX90A: ds_cmpst_rtn_f64 a[6:7], v255, a[2:3], a[4:5] offset:65535 ; encoding: [0xff,0xff,0xe2,0xda,0xff,0x02,0x04,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_cmpst_rtn_f64 a[6:7], v255, a[2:3], a[4:5] offset:65535
+
+// GFX90A: ds_cmpst_rtn_f64 a[6:7], v1, a[254:255], a[4:5] offset:65535 ; encoding: [0xff,0xff,0xe2,0xda,0x01,0xfe,0x04,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_cmpst_rtn_f64 a[6:7], v1, a[254:255], a[4:5] offset:65535
+
+// GFX90A: ds_cmpst_rtn_f64 a[6:7], v1, a[2:3], a[254:255] offset:65535 ; encoding: [0xff,0xff,0xe2,0xda,0x01,0x02,0xfe,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_cmpst_rtn_f64 a[6:7], v1, a[2:3], a[254:255] offset:65535
+
+// GFX90A: ds_cmpst_rtn_f64 a[6:7], v1, a[2:3], a[4:5] ; encoding: [0x00,0x00,0xe2,0xda,0x01,0x02,0x04,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_cmpst_rtn_f64 a[6:7], v1, a[2:3], a[4:5]
+
+// GFX90A: ds_cmpst_rtn_f64 a[6:7], v1, a[2:3], a[4:5] ; encoding: [0x00,0x00,0xe2,0xda,0x01,0x02,0x04,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_cmpst_rtn_f64 a[6:7], v1, a[2:3], a[4:5]
+
+// GFX90A: ds_cmpst_rtn_f64 a[6:7], v1, a[2:3], a[4:5] offset:4 ; encoding: [0x04,0x00,0xe2,0xda,0x01,0x02,0x04,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_cmpst_rtn_f64 a[6:7], v1, a[2:3], a[4:5] offset:4
+
+// GFX90A: ds_cmpst_rtn_f64 a[6:7], v1, a[2:3], a[4:5] offset:65535 gds ; encoding: [0xff,0xff,0xe3,0xda,0x01,0x02,0x04,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_cmpst_rtn_f64 a[6:7], v1, a[2:3], a[4:5] offset:65535 gds
+
+// GFX90A: ds_min_rtn_f64 a[6:7], v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xe4,0xda,0x01,0x02,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_min_rtn_f64 a[6:7], v1, a[2:3] offset:65535
+
+// GFX90A: ds_min_rtn_f64 a[254:255], v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xe4,0xda,0x01,0x02,0x00,0xfe]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_min_rtn_f64 a[254:255], v1, a[2:3] offset:65535
+
+// GFX90A: ds_min_rtn_f64 a[6:7], v255, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xe4,0xda,0xff,0x02,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_min_rtn_f64 a[6:7], v255, a[2:3] offset:65535
+
+// GFX90A: ds_min_rtn_f64 a[6:7], v1, a[254:255] offset:65535 ; encoding: [0xff,0xff,0xe4,0xda,0x01,0xfe,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_min_rtn_f64 a[6:7], v1, a[254:255] offset:65535
+
+// GFX90A: ds_min_rtn_f64 a[6:7], v1, a[2:3] ; encoding: [0x00,0x00,0xe4,0xda,0x01,0x02,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_min_rtn_f64 a[6:7], v1, a[2:3]
+
+// GFX90A: ds_min_rtn_f64 a[6:7], v1, a[2:3] ; encoding: [0x00,0x00,0xe4,0xda,0x01,0x02,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_min_rtn_f64 a[6:7], v1, a[2:3]
+
+// GFX90A: ds_min_rtn_f64 a[6:7], v1, a[2:3] offset:4 ; encoding: [0x04,0x00,0xe4,0xda,0x01,0x02,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_min_rtn_f64 a[6:7], v1, a[2:3] offset:4
+
+// GFX90A: ds_min_rtn_f64 a[6:7], v1, a[2:3] offset:65535 gds ; encoding: [0xff,0xff,0xe5,0xda,0x01,0x02,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_min_rtn_f64 a[6:7], v1, a[2:3] offset:65535 gds
+
+// GFX90A: ds_max_rtn_f64 a[6:7], v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xe6,0xda,0x01,0x02,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_max_rtn_f64 a[6:7], v1, a[2:3] offset:65535
+
+// GFX90A: ds_max_rtn_f64 a[254:255], v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xe6,0xda,0x01,0x02,0x00,0xfe]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_max_rtn_f64 a[254:255], v1, a[2:3] offset:65535
+
+// GFX90A: ds_max_rtn_f64 a[6:7], v255, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xe6,0xda,0xff,0x02,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_max_rtn_f64 a[6:7], v255, a[2:3] offset:65535
+
+// GFX90A: ds_max_rtn_f64 a[6:7], v1, a[254:255] offset:65535 ; encoding: [0xff,0xff,0xe6,0xda,0x01,0xfe,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_max_rtn_f64 a[6:7], v1, a[254:255] offset:65535
+
+// GFX90A: ds_max_rtn_f64 a[6:7], v1, a[2:3] ; encoding: [0x00,0x00,0xe6,0xda,0x01,0x02,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_max_rtn_f64 a[6:7], v1, a[2:3]
+
+// GFX90A: ds_max_rtn_f64 a[6:7], v1, a[2:3] ; encoding: [0x00,0x00,0xe6,0xda,0x01,0x02,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_max_rtn_f64 a[6:7], v1, a[2:3]
+
+// GFX90A: ds_max_rtn_f64 a[6:7], v1, a[2:3] offset:4 ; encoding: [0x04,0x00,0xe6,0xda,0x01,0x02,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_max_rtn_f64 a[6:7], v1, a[2:3] offset:4
+
+// GFX90A: ds_max_rtn_f64 a[6:7], v1, a[2:3] offset:65535 gds ; encoding: [0xff,0xff,0xe7,0xda,0x01,0x02,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_max_rtn_f64 a[6:7], v1, a[2:3] offset:65535 gds
+
+// GFX90A: ds_read_b64 a[6:7], v1 offset:65535 ; encoding: [0xff,0xff,0xec,0xda,0x01,0x00,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_read_b64 a[6:7], v1 offset:65535
+
+// GFX90A: ds_read_b64 a[254:255], v1 offset:65535 ; encoding: [0xff,0xff,0xec,0xda,0x01,0x00,0x00,0xfe]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_read_b64 a[254:255], v1 offset:65535
+
+// GFX90A: ds_read_b64 a[6:7], v255 offset:65535 ; encoding: [0xff,0xff,0xec,0xda,0xff,0x00,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_read_b64 a[6:7], v255 offset:65535
+
+// GFX90A: ds_read_b64 a[6:7], v1          ; encoding: [0x00,0x00,0xec,0xda,0x01,0x00,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_read_b64 a[6:7], v1
+
+// GFX90A: ds_read_b64 a[6:7], v1          ; encoding: [0x00,0x00,0xec,0xda,0x01,0x00,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_read_b64 a[6:7], v1
+
+// GFX90A: ds_read_b64 a[6:7], v1 offset:4 ; encoding: [0x04,0x00,0xec,0xda,0x01,0x00,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_read_b64 a[6:7], v1 offset:4
+
+// GFX90A: ds_read_b64 a[6:7], v1 offset:65535 gds ; encoding: [0xff,0xff,0xed,0xda,0x01,0x00,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_read_b64 a[6:7], v1 offset:65535 gds
+
+// GFX90A: ds_read2_b64 a[6:9], v1 offset0:127 offset1:255 ; encoding: [0x7f,0xff,0xee,0xda,0x01,0x00,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_read2_b64 a[6:9], v1 offset0:127 offset1:255
+
+// GFX90A: ds_read2_b64 a[252:255], v1 offset0:127 offset1:255 ; encoding: [0x7f,0xff,0xee,0xda,0x01,0x00,0x00,0xfc]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_read2_b64 a[252:255], v1 offset0:127 offset1:255
+
+// GFX90A: ds_read2_b64 a[6:9], v255 offset0:127 offset1:255 ; encoding: [0x7f,0xff,0xee,0xda,0xff,0x00,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_read2_b64 a[6:9], v255 offset0:127 offset1:255
+
+// GFX90A: ds_read2_b64 a[6:9], v1 offset1:255 ; encoding: [0x00,0xff,0xee,0xda,0x01,0x00,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_read2_b64 a[6:9], v1 offset1:255
+
+// GFX90A: ds_read2_b64 a[6:9], v1 offset1:255 ; encoding: [0x00,0xff,0xee,0xda,0x01,0x00,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_read2_b64 a[6:9], v1 offset1:255
+
+// GFX90A: ds_read2_b64 a[6:9], v1 offset0:16 offset1:255 ; encoding: [0x10,0xff,0xee,0xda,0x01,0x00,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_read2_b64 a[6:9], v1 offset0:16 offset1:255
+
+// GFX90A: ds_read2_b64 a[6:9], v1 offset0:127 ; encoding: [0x7f,0x00,0xee,0xda,0x01,0x00,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_read2_b64 a[6:9], v1 offset0:127
+
+// GFX90A: ds_read2_b64 a[6:9], v1 offset0:127 ; encoding: [0x7f,0x00,0xee,0xda,0x01,0x00,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_read2_b64 a[6:9], v1 offset0:127
+
+// GFX90A: ds_read2_b64 a[6:9], v1 offset0:127 offset1:1 ; encoding: [0x7f,0x01,0xee,0xda,0x01,0x00,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_read2_b64 a[6:9], v1 offset0:127 offset1:1
+
+// GFX90A: ds_read2_b64 a[6:9], v1 offset0:127 offset1:255 gds ; encoding: [0x7f,0xff,0xef,0xda,0x01,0x00,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_read2_b64 a[6:9], v1 offset0:127 offset1:255 gds
+
+// GFX90A: ds_read2st64_b64 a[6:9], v1 offset0:127 offset1:255 ; encoding: [0x7f,0xff,0xf0,0xda,0x01,0x00,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_read2st64_b64 a[6:9], v1 offset0:127 offset1:255
+
+// GFX90A: ds_read2st64_b64 a[252:255], v1 offset0:127 offset1:255 ; encoding: [0x7f,0xff,0xf0,0xda,0x01,0x00,0x00,0xfc]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_read2st64_b64 a[252:255], v1 offset0:127 offset1:255
+
+// GFX90A: ds_read2st64_b64 a[6:9], v255 offset0:127 offset1:255 ; encoding: [0x7f,0xff,0xf0,0xda,0xff,0x00,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_read2st64_b64 a[6:9], v255 offset0:127 offset1:255
+
+// GFX90A: ds_read2st64_b64 a[6:9], v1 offset1:255 ; encoding: [0x00,0xff,0xf0,0xda,0x01,0x00,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_read2st64_b64 a[6:9], v1 offset1:255
+
+// GFX90A: ds_read2st64_b64 a[6:9], v1 offset1:255 ; encoding: [0x00,0xff,0xf0,0xda,0x01,0x00,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_read2st64_b64 a[6:9], v1 offset1:255
+
+// GFX90A: ds_read2st64_b64 a[6:9], v1 offset0:16 offset1:255 ; encoding: [0x10,0xff,0xf0,0xda,0x01,0x00,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_read2st64_b64 a[6:9], v1 offset0:16 offset1:255
+
+// GFX90A: ds_read2st64_b64 a[6:9], v1 offset0:127 ; encoding: [0x7f,0x00,0xf0,0xda,0x01,0x00,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_read2st64_b64 a[6:9], v1 offset0:127
+
+// GFX90A: ds_read2st64_b64 a[6:9], v1 offset0:127 ; encoding: [0x7f,0x00,0xf0,0xda,0x01,0x00,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_read2st64_b64 a[6:9], v1 offset0:127
+
+// GFX90A: ds_read2st64_b64 a[6:9], v1 offset0:127 offset1:1 ; encoding: [0x7f,0x01,0xf0,0xda,0x01,0x00,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_read2st64_b64 a[6:9], v1 offset0:127 offset1:1
+
+// GFX90A: ds_read2st64_b64 a[6:9], v1 offset0:127 offset1:255 gds ; encoding: [0x7f,0xff,0xf1,0xda,0x01,0x00,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_read2st64_b64 a[6:9], v1 offset0:127 offset1:255 gds
+
+// GFX90A: ds_condxchg32_rtn_b64 a[6:7], v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xfc,0xda,0x01,0x02,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_condxchg32_rtn_b64 a[6:7], v1, a[2:3] offset:65535
+
+// GFX90A: ds_condxchg32_rtn_b64 a[254:255], v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xfc,0xda,0x01,0x02,0x00,0xfe]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_condxchg32_rtn_b64 a[254:255], v1, a[2:3] offset:65535
+
+// GFX90A: ds_condxchg32_rtn_b64 a[6:7], v255, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xfc,0xda,0xff,0x02,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_condxchg32_rtn_b64 a[6:7], v255, a[2:3] offset:65535
+
+// GFX90A: ds_condxchg32_rtn_b64 a[6:7], v1, a[254:255] offset:65535 ; encoding: [0xff,0xff,0xfc,0xda,0x01,0xfe,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_condxchg32_rtn_b64 a[6:7], v1, a[254:255] offset:65535
+
+// GFX90A: ds_condxchg32_rtn_b64 a[6:7], v1, a[2:3] ; encoding: [0x00,0x00,0xfc,0xda,0x01,0x02,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_condxchg32_rtn_b64 a[6:7], v1, a[2:3]
+
+// GFX90A: ds_condxchg32_rtn_b64 a[6:7], v1, a[2:3] ; encoding: [0x00,0x00,0xfc,0xda,0x01,0x02,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_condxchg32_rtn_b64 a[6:7], v1, a[2:3]
+
+// GFX90A: ds_condxchg32_rtn_b64 a[6:7], v1, a[2:3] offset:4 ; encoding: [0x04,0x00,0xfc,0xda,0x01,0x02,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_condxchg32_rtn_b64 a[6:7], v1, a[2:3] offset:4
+
+// GFX90A: ds_condxchg32_rtn_b64 a[6:7], v1, a[2:3] offset:65535 gds ; encoding: [0xff,0xff,0xfd,0xda,0x01,0x02,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_condxchg32_rtn_b64 a[6:7], v1, a[2:3] offset:65535 gds
+
+// GFX90A: ds_gws_init a1 offset:65535 gds ; encoding: [0xff,0xff,0x33,0xdb,0x01,0x00,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_gws_init a1 offset:65535 gds
+
+// GFX90A: ds_gws_init a255 offset:65535 gds ; encoding: [0xff,0xff,0x33,0xdb,0xff,0x00,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_gws_init a255 offset:65535 gds
+
+// GFX90A: ds_gws_init a1 gds              ; encoding: [0x00,0x00,0x33,0xdb,0x01,0x00,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_gws_init a1 gds
+
+// GFX90A: ds_gws_init a1 gds              ; encoding: [0x00,0x00,0x33,0xdb,0x01,0x00,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_gws_init a1 gds
+
+// GFX90A: ds_gws_init a1 offset:4 gds     ; encoding: [0x04,0x00,0x33,0xdb,0x01,0x00,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_gws_init a1 offset:4 gds
+
+// GFX90A: ds_gws_sema_br a1 offset:65535 gds ; encoding: [0xff,0xff,0x37,0xdb,0x01,0x00,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_gws_sema_br a1 offset:65535 gds
+
+// GFX90A: ds_gws_sema_br a255 offset:65535 gds ; encoding: [0xff,0xff,0x37,0xdb,0xff,0x00,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_gws_sema_br a255 offset:65535 gds
+
+// GFX90A: ds_gws_sema_br a1 gds           ; encoding: [0x00,0x00,0x37,0xdb,0x01,0x00,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_gws_sema_br a1 gds
+
+// GFX90A: ds_gws_sema_br a1 gds           ; encoding: [0x00,0x00,0x37,0xdb,0x01,0x00,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_gws_sema_br a1 gds
+
+// GFX90A: ds_gws_sema_br a1 offset:4 gds  ; encoding: [0x04,0x00,0x37,0xdb,0x01,0x00,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_gws_sema_br a1 offset:4 gds
+
+// GFX90A: ds_gws_barrier a1 offset:65535 gds ; encoding: [0xff,0xff,0x3b,0xdb,0x01,0x00,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_gws_barrier a1 offset:65535 gds
+
+// GFX90A: ds_gws_barrier a255 offset:65535 gds ; encoding: [0xff,0xff,0x3b,0xdb,0xff,0x00,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_gws_barrier a255 offset:65535 gds
+
+// GFX90A: ds_gws_barrier a1 gds           ; encoding: [0x00,0x00,0x3b,0xdb,0x01,0x00,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_gws_barrier a1 gds
+
+// GFX90A: ds_gws_barrier a1 gds           ; encoding: [0x00,0x00,0x3b,0xdb,0x01,0x00,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_gws_barrier a1 gds
+
+// GFX90A: ds_gws_barrier a1 offset:4 gds  ; encoding: [0x04,0x00,0x3b,0xdb,0x01,0x00,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_gws_barrier a1 offset:4 gds
+
+// GFX90A: ds_consume a5 offset:65535      ; encoding: [0xff,0xff,0x7a,0xdb,0x00,0x00,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_consume a5 offset:65535
+
+// GFX90A: ds_consume a255 offset:65535    ; encoding: [0xff,0xff,0x7a,0xdb,0x00,0x00,0x00,0xff]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_consume a255 offset:65535
+
+// GFX90A: ds_consume a5                   ; encoding: [0x00,0x00,0x7a,0xdb,0x00,0x00,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_consume a5
+
+// GFX90A: ds_consume a5                   ; encoding: [0x00,0x00,0x7a,0xdb,0x00,0x00,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_consume a5
+
+// GFX90A: ds_consume a5 offset:4          ; encoding: [0x04,0x00,0x7a,0xdb,0x00,0x00,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_consume a5 offset:4
+
+// GFX90A: ds_consume a5 offset:65535 gds  ; encoding: [0xff,0xff,0x7b,0xdb,0x00,0x00,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_consume a5 offset:65535 gds
+
+// GFX90A: ds_append a5 offset:65535       ; encoding: [0xff,0xff,0x7c,0xdb,0x00,0x00,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_append a5 offset:65535
+
+// GFX90A: ds_append a255 offset:65535     ; encoding: [0xff,0xff,0x7c,0xdb,0x00,0x00,0x00,0xff]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_append a255 offset:65535
+
+// GFX90A: ds_append a5                    ; encoding: [0x00,0x00,0x7c,0xdb,0x00,0x00,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_append a5
+
+// GFX90A: ds_append a5                    ; encoding: [0x00,0x00,0x7c,0xdb,0x00,0x00,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_append a5
+
+// GFX90A: ds_append a5 offset:4           ; encoding: [0x04,0x00,0x7c,0xdb,0x00,0x00,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_append a5 offset:4
+
+// GFX90A: ds_append a5 offset:65535 gds   ; encoding: [0xff,0xff,0x7d,0xdb,0x00,0x00,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_append a5 offset:65535 gds
+
+// GFX90A: ds_ordered_count a5, v1 offset:65535 gds ; encoding: [0xff,0xff,0x7f,0xdb,0x01,0x00,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_ordered_count a5, v1 offset:65535 gds
+
+// GFX90A: ds_ordered_count a255, v1 offset:65535 gds ; encoding: [0xff,0xff,0x7f,0xdb,0x01,0x00,0x00,0xff]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_ordered_count a255, v1 offset:65535 gds
+
+// GFX90A: ds_ordered_count a5, v255 offset:65535 gds ; encoding: [0xff,0xff,0x7f,0xdb,0xff,0x00,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_ordered_count a5, v255 offset:65535 gds
+
+// GFX90A: ds_ordered_count a5, v1 gds     ; encoding: [0x00,0x00,0x7f,0xdb,0x01,0x00,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_ordered_count a5, v1 gds
+
+// GFX90A: ds_ordered_count a5, v1 gds     ; encoding: [0x00,0x00,0x7f,0xdb,0x01,0x00,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_ordered_count a5, v1 gds
+
+// GFX90A: ds_ordered_count a5, v1 offset:4 gds ; encoding: [0x04,0x00,0x7f,0xdb,0x01,0x00,0x00,0x05]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_ordered_count a5, v1 offset:4 gds
+
+// GFX90A: ds_write_b96 v1, a[2:4] offset:65535 ; encoding: [0xff,0xff,0xbc,0xdb,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_write_b96 v1, a[2:4] offset:65535
+
+// GFX90A: ds_write_b96 v255, a[2:4] offset:65535 ; encoding: [0xff,0xff,0xbc,0xdb,0xff,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_write_b96 v255, a[2:4] offset:65535
+
+// GFX90A: ds_write_b96 v1, a[252:254] offset:65535 ; encoding: [0xff,0xff,0xbc,0xdb,0x01,0xfc,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_write_b96 v1, a[252:254] offset:65535
+
+// GFX90A: ds_write_b96 v1, a[2:4]         ; encoding: [0x00,0x00,0xbc,0xdb,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_write_b96 v1, a[2:4]
+
+// GFX90A: ds_write_b96 v1, a[2:4]         ; encoding: [0x00,0x00,0xbc,0xdb,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_write_b96 v1, a[2:4]
+
+// GFX90A: ds_write_b96 v1, a[2:4] offset:4 ; encoding: [0x04,0x00,0xbc,0xdb,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_write_b96 v1, a[2:4] offset:4
+
+// GFX90A: ds_write_b96 v1, a[2:4] offset:65535 gds ; encoding: [0xff,0xff,0xbd,0xdb,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_write_b96 v1, a[2:4] offset:65535 gds
+
+// GFX90A: ds_write_b128 v1, a[2:5] offset:65535 ; encoding: [0xff,0xff,0xbe,0xdb,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_write_b128 v1, a[2:5] offset:65535
+
+// GFX90A: ds_write_b128 v255, a[2:5] offset:65535 ; encoding: [0xff,0xff,0xbe,0xdb,0xff,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_write_b128 v255, a[2:5] offset:65535
+
+// GFX90A: ds_write_b128 v1, a[252:255] offset:65535 ; encoding: [0xff,0xff,0xbe,0xdb,0x01,0xfc,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_write_b128 v1, a[252:255] offset:65535
+
+// GFX90A: ds_write_b128 v1, a[2:5]        ; encoding: [0x00,0x00,0xbe,0xdb,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_write_b128 v1, a[2:5]
+
+// GFX90A: ds_write_b128 v1, a[2:5]        ; encoding: [0x00,0x00,0xbe,0xdb,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_write_b128 v1, a[2:5]
+
+// GFX90A: ds_write_b128 v1, a[2:5] offset:4 ; encoding: [0x04,0x00,0xbe,0xdb,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_write_b128 v1, a[2:5] offset:4
+
+// GFX90A: ds_write_b128 v1, a[2:5] offset:65535 gds ; encoding: [0xff,0xff,0xbf,0xdb,0x01,0x02,0x00,0x00]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_write_b128 v1, a[2:5] offset:65535 gds
+
+// GFX90A: ds_read_b96 a[6:8], v1 offset:65535 ; encoding: [0xff,0xff,0xfc,0xdb,0x01,0x00,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_read_b96 a[6:8], v1 offset:65535
+
+// GFX90A: ds_read_b96 a[252:254], v1 offset:65535 ; encoding: [0xff,0xff,0xfc,0xdb,0x01,0x00,0x00,0xfc]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_read_b96 a[252:254], v1 offset:65535
+
+// GFX90A: ds_read_b96 a[6:8], v255 offset:65535 ; encoding: [0xff,0xff,0xfc,0xdb,0xff,0x00,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_read_b96 a[6:8], v255 offset:65535
+
+// GFX90A: ds_read_b96 a[6:8], v1          ; encoding: [0x00,0x00,0xfc,0xdb,0x01,0x00,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_read_b96 a[6:8], v1
+
+// GFX90A: ds_read_b96 a[6:8], v1          ; encoding: [0x00,0x00,0xfc,0xdb,0x01,0x00,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_read_b96 a[6:8], v1
+
+// GFX90A: ds_read_b96 a[6:8], v1 offset:4 ; encoding: [0x04,0x00,0xfc,0xdb,0x01,0x00,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_read_b96 a[6:8], v1 offset:4
+
+// GFX90A: ds_read_b96 a[6:8], v1 offset:65535 gds ; encoding: [0xff,0xff,0xfd,0xdb,0x01,0x00,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_read_b96 a[6:8], v1 offset:65535 gds
+
+// GFX90A: ds_read_b128 a[6:9], v1 offset:65535 ; encoding: [0xff,0xff,0xfe,0xdb,0x01,0x00,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_read_b128 a[6:9], v1 offset:65535
+
+// GFX90A: ds_read_b128 a[252:255], v1 offset:65535 ; encoding: [0xff,0xff,0xfe,0xdb,0x01,0x00,0x00,0xfc]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_read_b128 a[252:255], v1 offset:65535
+
+// GFX90A: ds_read_b128 a[6:9], v255 offset:65535 ; encoding: [0xff,0xff,0xfe,0xdb,0xff,0x00,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_read_b128 a[6:9], v255 offset:65535
+
+// GFX90A: ds_read_b128 a[6:9], v1         ; encoding: [0x00,0x00,0xfe,0xdb,0x01,0x00,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_read_b128 a[6:9], v1
+
+// GFX90A: ds_read_b128 a[6:9], v1         ; encoding: [0x00,0x00,0xfe,0xdb,0x01,0x00,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_read_b128 a[6:9], v1
+
+// GFX90A: ds_read_b128 a[6:9], v1 offset:4 ; encoding: [0x04,0x00,0xfe,0xdb,0x01,0x00,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_read_b128 a[6:9], v1 offset:4
+
+// GFX90A: ds_read_b128 a[6:9], v1 offset:65535 gds ; encoding: [0xff,0xff,0xff,0xdb,0x01,0x00,0x00,0x06]
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU
+ds_read_b128 a[6:9], v1 offset:65535 gds
+
+// GFX90A: image_load a5, v[2:5], s[8:15] dmask:0x1 ; encoding: [0x00,0x01,0x01,0xf0,0x02,0x05,0x02,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_load a5, v[2:5], s[8:15] dmask:0x1
+
+// GFX90A: image_load a252, v[2:5], s[8:15] dmask:0x1 ; encoding: [0x00,0x01,0x01,0xf0,0x02,0xfc,0x02,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_load a252, v[2:5], s[8:15] dmask:0x1
+
+// GFX90A: image_load a5, v[252:255], s[8:15] dmask:0x1 ; encoding: [0x00,0x01,0x01,0xf0,0xfc,0x05,0x02,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_load a5, v[252:255], s[8:15] dmask:0x1
+
+// GFX90A: image_load a5, v[2:5], s[12:19] dmask:0x1 ; encoding: [0x00,0x01,0x01,0xf0,0x02,0x05,0x03,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_load a5, v[2:5], s[12:19] dmask:0x1
+
+// GFX90A: image_load a5, v[2:5], s[92:99] dmask:0x1 ; encoding: [0x00,0x01,0x01,0xf0,0x02,0x05,0x17,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_load a5, v[2:5], s[92:99] dmask:0x1
+
+// GFX90A: image_load a5, v[2:5], s[8:15] dmask:0x2 ; encoding: [0x00,0x02,0x01,0xf0,0x02,0x05,0x02,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_load a5, v[2:5], s[8:15] dmask:0x2
+
+// GFX90A: image_load a[6:7], v[2:5], s[8:15] dmask:0x3 ; encoding: [0x00,0x03,0x01,0xf0,0x02,0x06,0x02,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_load a[6:7], v[2:5], s[8:15] dmask:0x3
+
+// GFX90A: image_load a5, v[2:5], s[8:15] dmask:0x4 ; encoding: [0x00,0x04,0x01,0xf0,0x02,0x05,0x02,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_load a5, v[2:5], s[8:15] dmask:0x4
+
+// GFX90A: image_load a[6:7], v[2:5], s[8:15] dmask:0x5 ; encoding: [0x00,0x05,0x01,0xf0,0x02,0x06,0x02,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_load a[6:7], v[2:5], s[8:15] dmask:0x5
+
+// GFX90A: image_load a[6:7], v[2:5], s[8:15] dmask:0x6 ; encoding: [0x00,0x06,0x01,0xf0,0x02,0x06,0x02,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_load a[6:7], v[2:5], s[8:15] dmask:0x6
+
+// GFX90A: image_load a[6:8], v[2:5], s[8:15] dmask:0x7 ; encoding: [0x00,0x07,0x01,0xf0,0x02,0x06,0x02,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_load a[6:8], v[2:5], s[8:15] dmask:0x7
+
+// GFX90A: image_load a5, v[2:5], s[8:15] dmask:0x8 ; encoding: [0x00,0x08,0x01,0xf0,0x02,0x05,0x02,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_load a5, v[2:5], s[8:15] dmask:0x8
+
+// GFX90A: image_load a[6:7], v[2:5], s[8:15] dmask:0x9 ; encoding: [0x00,0x09,0x01,0xf0,0x02,0x06,0x02,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_load a[6:7], v[2:5], s[8:15] dmask:0x9
+
+// GFX90A: image_load a[6:7], v[2:5], s[8:15] dmask:0xa ; encoding: [0x00,0x0a,0x01,0xf0,0x02,0x06,0x02,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_load a[6:7], v[2:5], s[8:15] dmask:0xa
+
+// GFX90A: image_load a[6:8], v[2:5], s[8:15] dmask:0xb ; encoding: [0x00,0x0b,0x01,0xf0,0x02,0x06,0x02,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_load a[6:8], v[2:5], s[8:15] dmask:0xb
+
+// GFX90A: image_load a[6:7], v[2:5], s[8:15] dmask:0xc ; encoding: [0x00,0x0c,0x01,0xf0,0x02,0x06,0x02,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_load a[6:7], v[2:5], s[8:15] dmask:0xc
+
+// GFX90A: image_load a[6:8], v[2:5], s[8:15] dmask:0xd ; encoding: [0x00,0x0d,0x01,0xf0,0x02,0x06,0x02,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_load a[6:8], v[2:5], s[8:15] dmask:0xd
+
+// GFX90A: image_load a[6:8], v[2:5], s[8:15] dmask:0xe ; encoding: [0x00,0x0e,0x01,0xf0,0x02,0x06,0x02,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_load a[6:8], v[2:5], s[8:15] dmask:0xe
+
+// GFX90A: image_load a5, v[2:5], s[8:15]  ; encoding: [0x00,0x00,0x01,0xf0,0x02,0x05,0x02,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_load a5, v[2:5], s[8:15]
+
+// GFX90A: image_load a5, v[2:5], s[8:15] dmask:0x1 unorm ; encoding: [0x00,0x11,0x01,0xf0,0x02,0x05,0x02,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_load a5, v[2:5], s[8:15] dmask:0x1 unorm
+
+// GFX90A: image_load a5, v[2:5], s[8:15] dmask:0x1 glc ; encoding: [0x00,0x21,0x01,0xf0,0x02,0x05,0x02,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_load a5, v[2:5], s[8:15] dmask:0x1 glc
+
+// GFX90A: image_load a5, v[2:5], s[8:15] dmask:0x1 slc ; encoding: [0x00,0x01,0x01,0xf2,0x02,0x05,0x02,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_load a5, v[2:5], s[8:15] dmask:0x1 slc
+
+// GFX90A: image_load a5, v[2:5], s[8:15] dmask:0x1 lwe ; encoding: [0x00,0x01,0x03,0xf0,0x02,0x05,0x02,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_load a5, v[2:5], s[8:15] dmask:0x1 lwe
+
+// GFX90A: image_load a5, v[2:5], s[8:15] dmask:0x1 da ; encoding: [0x00,0x41,0x01,0xf0,0x02,0x05,0x02,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_load a5, v[2:5], s[8:15] dmask:0x1 da
+
+// GFX90A: image_load a5, v[2:5], s[8:15] dmask:0x1 d16 ; encoding: [0x00,0x01,0x01,0xf0,0x02,0x05,0x02,0x80]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_load a5, v[2:5], s[8:15] dmask:0x1 d16
+
+// GFX90A: image_store a1, v[2:5], s[12:19] dmask:0x1 unorm ; encoding: [0x00,0x11,0x21,0xf0,0x02,0x01,0x03,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_store a1, v[2:5], s[12:19] dmask:0x1 unorm
+
+// GFX90A: image_store a252, v[2:5], s[12:19] dmask:0x1 unorm ; encoding: [0x00,0x11,0x21,0xf0,0x02,0xfc,0x03,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_store a252, v[2:5], s[12:19] dmask:0x1 unorm
+
+// GFX90A: image_store a1, v[252:255], s[12:19] dmask:0x1 unorm ; encoding: [0x00,0x11,0x21,0xf0,0xfc,0x01,0x03,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_store a1, v[252:255], s[12:19] dmask:0x1 unorm
+
+// GFX90A: image_store a1, v[2:5], s[16:23] dmask:0x1 unorm ; encoding: [0x00,0x11,0x21,0xf0,0x02,0x01,0x04,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_store a1, v[2:5], s[16:23] dmask:0x1 unorm
+
+// GFX90A: image_store a1, v[2:5], s[92:99] dmask:0x1 unorm ; encoding: [0x00,0x11,0x21,0xf0,0x02,0x01,0x17,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_store a1, v[2:5], s[92:99] dmask:0x1 unorm
+
+// GFX90A: image_store a1, v[2:5], s[12:19] dmask:0x2 unorm ; encoding: [0x00,0x12,0x21,0xf0,0x02,0x01,0x03,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_store a1, v[2:5], s[12:19] dmask:0x2 unorm
+
+// GFX90A: image_store a[2:3], v[2:5], s[12:19] dmask:0x3 unorm ; encoding: [0x00,0x13,0x21,0xf0,0x02,0x02,0x03,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_store a[2:3], v[2:5], s[12:19] dmask:0x3 unorm
+
+// GFX90A: image_store a1, v[2:5], s[12:19] dmask:0x4 unorm ; encoding: [0x00,0x14,0x21,0xf0,0x02,0x01,0x03,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_store a1, v[2:5], s[12:19] dmask:0x4 unorm
+
+// GFX90A: image_store a[2:3], v[2:5], s[12:19] dmask:0x5 unorm ; encoding: [0x00,0x15,0x21,0xf0,0x02,0x02,0x03,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_store a[2:3], v[2:5], s[12:19] dmask:0x5 unorm
+
+// GFX90A: image_store a[2:3], v[2:5], s[12:19] dmask:0x6 unorm ; encoding: [0x00,0x16,0x21,0xf0,0x02,0x02,0x03,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_store a[2:3], v[2:5], s[12:19] dmask:0x6 unorm
+
+// GFX90A: image_store a[2:4], v[2:5], s[12:19] dmask:0x7 unorm ; encoding: [0x00,0x17,0x21,0xf0,0x02,0x02,0x03,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_store a[2:4], v[2:5], s[12:19] dmask:0x7 unorm
+
+// GFX90A: image_store a1, v[2:5], s[12:19] dmask:0x8 unorm ; encoding: [0x00,0x18,0x21,0xf0,0x02,0x01,0x03,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_store a1, v[2:5], s[12:19] dmask:0x8 unorm
+
+// GFX90A: image_store a[2:3], v[2:5], s[12:19] dmask:0x9 unorm ; encoding: [0x00,0x19,0x21,0xf0,0x02,0x02,0x03,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_store a[2:3], v[2:5], s[12:19] dmask:0x9 unorm
+
+// GFX90A: image_store a[2:3], v[2:5], s[12:19] dmask:0xa unorm ; encoding: [0x00,0x1a,0x21,0xf0,0x02,0x02,0x03,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_store a[2:3], v[2:5], s[12:19] dmask:0xa unorm
+
+// GFX90A: image_store a[2:4], v[2:5], s[12:19] dmask:0xb unorm ; encoding: [0x00,0x1b,0x21,0xf0,0x02,0x02,0x03,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_store a[2:4], v[2:5], s[12:19] dmask:0xb unorm
+
+// GFX90A: image_store a[2:3], v[2:5], s[12:19] dmask:0xc unorm ; encoding: [0x00,0x1c,0x21,0xf0,0x02,0x02,0x03,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_store a[2:3], v[2:5], s[12:19] dmask:0xc unorm
+
+// GFX90A: image_store a[2:4], v[2:5], s[12:19] dmask:0xd unorm ; encoding: [0x00,0x1d,0x21,0xf0,0x02,0x02,0x03,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_store a[2:4], v[2:5], s[12:19] dmask:0xd unorm
+
+// GFX90A: image_store a[2:4], v[2:5], s[12:19] dmask:0xe unorm ; encoding: [0x00,0x1e,0x21,0xf0,0x02,0x02,0x03,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_store a[2:4], v[2:5], s[12:19] dmask:0xe unorm
+
+// GFX90A: image_store a[2:5], v[2:5], s[12:19] dmask:0xf unorm ; encoding: [0x00,0x1f,0x21,0xf0,0x02,0x02,0x03,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_store a[2:5], v[2:5], s[12:19] dmask:0xf unorm
+
+// GFX90A: image_store a1, v[2:5], s[12:19] unorm ; encoding: [0x00,0x10,0x21,0xf0,0x02,0x01,0x03,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_store a1, v[2:5], s[12:19] unorm
+
+// GFX90A: image_store a1, v[2:5], s[12:19] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x21,0xf0,0x02,0x01,0x03,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_store a1, v[2:5], s[12:19] dmask:0x1 unorm glc
+
+// GFX90A: image_store a1, v[2:5], s[12:19] dmask:0x1 unorm slc ; encoding: [0x00,0x11,0x21,0xf2,0x02,0x01,0x03,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_store a1, v[2:5], s[12:19] dmask:0x1 unorm slc
+
+// GFX90A: image_store a1, v[2:5], s[12:19] dmask:0x1 unorm lwe ; encoding: [0x00,0x11,0x23,0xf0,0x02,0x01,0x03,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_store a1, v[2:5], s[12:19] dmask:0x1 unorm lwe
+
+// GFX90A: image_store a1, v[2:5], s[12:19] dmask:0x1 unorm da ; encoding: [0x00,0x51,0x21,0xf0,0x02,0x01,0x03,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_store a1, v[2:5], s[12:19] dmask:0x1 unorm da
+
+// GFX90A: image_store a1, v[2:5], s[12:19] dmask:0x1 unorm d16 ; encoding: [0x00,0x11,0x21,0xf0,0x02,0x01,0x03,0x80]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_store a1, v[2:5], s[12:19] dmask:0x1 unorm d16
+
+// GFX90A: image_atomic_swap a5, v[2:5], s[8:15] dmask:0x1 unorm ; encoding: [0x00,0x11,0x41,0xf0,0x02,0x05,0x02,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_atomic_swap a5, v[2:5], s[8:15] dmask:0x1 unorm
+
+// GFX90A: image_atomic_swap a252, v[2:5], s[8:15] dmask:0x1 unorm ; encoding: [0x00,0x11,0x41,0xf0,0x02,0xfc,0x02,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_atomic_swap a252, v[2:5], s[8:15] dmask:0x1 unorm
+
+// GFX90A: image_atomic_swap a5, v[252:255], s[8:15] dmask:0x1 unorm ; encoding: [0x00,0x11,0x41,0xf0,0xfc,0x05,0x02,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_atomic_swap a5, v[252:255], s[8:15] dmask:0x1 unorm
+
+// GFX90A: image_atomic_swap a5, v[2:5], s[12:19] dmask:0x1 unorm ; encoding: [0x00,0x11,0x41,0xf0,0x02,0x05,0x03,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_atomic_swap a5, v[2:5], s[12:19] dmask:0x1 unorm
+
+// GFX90A: image_atomic_swap a5, v[2:5], s[92:99] dmask:0x1 unorm ; encoding: [0x00,0x11,0x41,0xf0,0x02,0x05,0x17,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_atomic_swap a5, v[2:5], s[92:99] dmask:0x1 unorm
+
+// GFX90A: image_atomic_swap a[6:7], v[2:5], s[8:15] dmask:0x3 unorm ; encoding: [0x00,0x13,0x41,0xf0,0x02,0x06,0x02,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_atomic_swap a[6:7], v[2:5], s[8:15] dmask:0x3 unorm
+
+// GFX90A: image_atomic_swap a5, v[2:5], s[8:15] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x41,0xf0,0x02,0x05,0x02,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_atomic_swap a5, v[2:5], s[8:15] dmask:0x1 unorm glc
+
+// GFX90A: image_atomic_swap a5, v[2:5], s[8:15] dmask:0x1 unorm slc ; encoding: [0x00,0x11,0x41,0xf2,0x02,0x05,0x02,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_atomic_swap a5, v[2:5], s[8:15] dmask:0x1 unorm slc
+
+// GFX90A: image_atomic_swap a5, v[2:5], s[8:15] dmask:0x1 unorm lwe ; encoding: [0x00,0x11,0x43,0xf0,0x02,0x05,0x02,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_atomic_swap a5, v[2:5], s[8:15] dmask:0x1 unorm lwe
+
+// GFX90A: image_atomic_swap a5, v[2:5], s[8:15] dmask:0x1 unorm da ; encoding: [0x00,0x51,0x41,0xf0,0x02,0x05,0x02,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_atomic_swap a5, v[2:5], s[8:15] dmask:0x1 unorm da
+
+// GFX90A: image_atomic_cmpswap a[6:7], v[2:5], s[8:15] dmask:0x3 unorm ; encoding: [0x00,0x13,0x45,0xf0,0x02,0x06,0x02,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_atomic_cmpswap a[6:7], v[2:5], s[8:15] dmask:0x3 unorm
+
+// GFX90A: image_atomic_cmpswap a[252:253], v[2:5], s[8:15] dmask:0x3 unorm ; encoding: [0x00,0x13,0x45,0xf0,0x02,0xfc,0x02,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_atomic_cmpswap a[252:253], v[2:5], s[8:15] dmask:0x3 unorm
+
+// GFX90A: image_atomic_cmpswap a[6:7], v[252:255], s[8:15] dmask:0x3 unorm ; encoding: [0x00,0x13,0x45,0xf0,0xfc,0x06,0x02,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_atomic_cmpswap a[6:7], v[252:255], s[8:15] dmask:0x3 unorm
+
+// GFX90A: image_atomic_cmpswap a[6:7], v[2:5], s[12:19] dmask:0x3 unorm ; encoding: [0x00,0x13,0x45,0xf0,0x02,0x06,0x03,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_atomic_cmpswap a[6:7], v[2:5], s[12:19] dmask:0x3 unorm
+
+// GFX90A: image_atomic_cmpswap a[6:7], v[2:5], s[92:99] dmask:0x3 unorm ; encoding: [0x00,0x13,0x45,0xf0,0x02,0x06,0x17,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_atomic_cmpswap a[6:7], v[2:5], s[92:99] dmask:0x3 unorm
+
+// GFX90A: image_atomic_cmpswap a[6:9], v[2:5], s[8:15] dmask:0xf unorm ; encoding: [0x00,0x1f,0x45,0xf0,0x02,0x06,0x02,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_atomic_cmpswap a[6:9], v[2:5], s[8:15] dmask:0xf unorm
+
+// GFX90A: image_atomic_cmpswap a[6:7], v[2:5], s[8:15] dmask:0x3 unorm glc ; encoding: [0x00,0x33,0x45,0xf0,0x02,0x06,0x02,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_atomic_cmpswap a[6:7], v[2:5], s[8:15] dmask:0x3 unorm glc
+
+// GFX90A: image_atomic_cmpswap a[6:7], v[2:5], s[8:15] dmask:0x3 unorm slc ; encoding: [0x00,0x13,0x45,0xf2,0x02,0x06,0x02,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_atomic_cmpswap a[6:7], v[2:5], s[8:15] dmask:0x3 unorm slc
+
+// GFX90A: image_atomic_cmpswap a[6:7], v[2:5], s[8:15] dmask:0x3 unorm lwe ; encoding: [0x00,0x13,0x47,0xf0,0x02,0x06,0x02,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_atomic_cmpswap a[6:7], v[2:5], s[8:15] dmask:0x3 unorm lwe
+
+// GFX90A: image_atomic_cmpswap a[6:7], v[2:5], s[8:15] dmask:0x3 unorm da ; encoding: [0x00,0x53,0x45,0xf0,0x02,0x06,0x02,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_atomic_cmpswap a[6:7], v[2:5], s[8:15] dmask:0x3 unorm da
+
+// GFX90A: image_atomic_add a5, v[2:5], s[8:15] dmask:0x1 unorm ; encoding: [0x00,0x11,0x49,0xf0,0x02,0x05,0x02,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_atomic_add a5, v[2:5], s[8:15] dmask:0x1 unorm
+
+// GFX90A: image_atomic_add a252, v[2:5], s[8:15] dmask:0x1 unorm ; encoding: [0x00,0x11,0x49,0xf0,0x02,0xfc,0x02,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_atomic_add a252, v[2:5], s[8:15] dmask:0x1 unorm
+
+// GFX90A: image_atomic_add a5, v[252:255], s[8:15] dmask:0x1 unorm ; encoding: [0x00,0x11,0x49,0xf0,0xfc,0x05,0x02,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_atomic_add a5, v[252:255], s[8:15] dmask:0x1 unorm
+
+// GFX90A: image_atomic_add a5, v[2:5], s[12:19] dmask:0x1 unorm ; encoding: [0x00,0x11,0x49,0xf0,0x02,0x05,0x03,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_atomic_add a5, v[2:5], s[12:19] dmask:0x1 unorm
+
+// GFX90A: image_atomic_add a5, v[2:5], s[92:99] dmask:0x1 unorm ; encoding: [0x00,0x11,0x49,0xf0,0x02,0x05,0x17,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_atomic_add a5, v[2:5], s[92:99] dmask:0x1 unorm
+
+// GFX90A: image_atomic_add a[6:7], v[2:5], s[8:15] dmask:0x3 unorm ; encoding: [0x00,0x13,0x49,0xf0,0x02,0x06,0x02,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_atomic_add a[6:7], v[2:5], s[8:15] dmask:0x3 unorm
+
+// GFX90A: image_atomic_add a5, v[2:5], s[8:15] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x49,0xf0,0x02,0x05,0x02,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_atomic_add a5, v[2:5], s[8:15] dmask:0x1 unorm glc
+
+// GFX90A: image_atomic_add a5, v[2:5], s[8:15] dmask:0x1 unorm slc ; encoding: [0x00,0x11,0x49,0xf2,0x02,0x05,0x02,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_atomic_add a5, v[2:5], s[8:15] dmask:0x1 unorm slc
+
+// GFX90A: image_atomic_add a5, v[2:5], s[8:15] dmask:0x1 unorm lwe ; encoding: [0x00,0x11,0x4b,0xf0,0x02,0x05,0x02,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_atomic_add a5, v[2:5], s[8:15] dmask:0x1 unorm lwe
+
+// GFX90A: image_atomic_add a5, v[2:5], s[8:15] dmask:0x1 unorm da ; encoding: [0x00,0x51,0x49,0xf0,0x02,0x05,0x02,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_atomic_add a5, v[2:5], s[8:15] dmask:0x1 unorm da
+
+// GFX90A: image_atomic_sub a5, v[2:5], s[8:15] dmask:0x1 unorm ; encoding: [0x00,0x11,0x4d,0xf0,0x02,0x05,0x02,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_atomic_sub a5, v[2:5], s[8:15] dmask:0x1 unorm
+
+// GFX90A: image_atomic_sub a252, v[2:5], s[8:15] dmask:0x1 unorm ; encoding: [0x00,0x11,0x4d,0xf0,0x02,0xfc,0x02,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_atomic_sub a252, v[2:5], s[8:15] dmask:0x1 unorm
+
+// GFX90A: image_atomic_sub a5, v[252:255], s[8:15] dmask:0x1 unorm ; encoding: [0x00,0x11,0x4d,0xf0,0xfc,0x05,0x02,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_atomic_sub a5, v[252:255], s[8:15] dmask:0x1 unorm
+
+// GFX90A: image_atomic_sub a5, v[2:5], s[12:19] dmask:0x1 unorm ; encoding: [0x00,0x11,0x4d,0xf0,0x02,0x05,0x03,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_atomic_sub a5, v[2:5], s[12:19] dmask:0x1 unorm
+
+// GFX90A: image_atomic_sub a5, v[2:5], s[92:99] dmask:0x1 unorm ; encoding: [0x00,0x11,0x4d,0xf0,0x02,0x05,0x17,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_atomic_sub a5, v[2:5], s[92:99] dmask:0x1 unorm
+
+// GFX90A: image_atomic_sub a[6:7], v[2:5], s[8:15] dmask:0x3 unorm ; encoding: [0x00,0x13,0x4d,0xf0,0x02,0x06,0x02,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_atomic_sub a[6:7], v[2:5], s[8:15] dmask:0x3 unorm
+
+// GFX90A: image_atomic_sub a5, v[2:5], s[8:15] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x4d,0xf0,0x02,0x05,0x02,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_atomic_sub a5, v[2:5], s[8:15] dmask:0x1 unorm glc
+
+// GFX90A: image_atomic_sub a5, v[2:5], s[8:15] dmask:0x1 unorm slc ; encoding: [0x00,0x11,0x4d,0xf2,0x02,0x05,0x02,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_atomic_sub a5, v[2:5], s[8:15] dmask:0x1 unorm slc
+
+// GFX90A: image_atomic_sub a5, v[2:5], s[8:15] dmask:0x1 unorm lwe ; encoding: [0x00,0x11,0x4f,0xf0,0x02,0x05,0x02,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_atomic_sub a5, v[2:5], s[8:15] dmask:0x1 unorm lwe
+
+// GFX90A: image_atomic_sub a5, v[2:5], s[8:15] dmask:0x1 unorm da ; encoding: [0x00,0x51,0x4d,0xf0,0x02,0x05,0x02,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_atomic_sub a5, v[2:5], s[8:15] dmask:0x1 unorm da
+
+// GFX90A: image_atomic_smin a5, v[2:5], s[8:15] dmask:0x1 unorm ; encoding: [0x00,0x11,0x51,0xf0,0x02,0x05,0x02,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_atomic_smin a5, v[2:5], s[8:15] dmask:0x1 unorm
+
+// GFX90A: image_atomic_smin a252, v[2:5], s[8:15] dmask:0x1 unorm ; encoding: [0x00,0x11,0x51,0xf0,0x02,0xfc,0x02,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_atomic_smin a252, v[2:5], s[8:15] dmask:0x1 unorm
+
+// GFX90A: image_atomic_smin a5, v[252:255], s[8:15] dmask:0x1 unorm ; encoding: [0x00,0x11,0x51,0xf0,0xfc,0x05,0x02,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_atomic_smin a5, v[252:255], s[8:15] dmask:0x1 unorm
+
+// GFX90A: image_atomic_smin a5, v[2:5], s[12:19] dmask:0x1 unorm ; encoding: [0x00,0x11,0x51,0xf0,0x02,0x05,0x03,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_atomic_smin a5, v[2:5], s[12:19] dmask:0x1 unorm
+
+// GFX90A: image_atomic_smin a5, v[2:5], s[92:99] dmask:0x1 unorm ; encoding: [0x00,0x11,0x51,0xf0,0x02,0x05,0x17,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_atomic_smin a5, v[2:5], s[92:99] dmask:0x1 unorm
+
+// GFX90A: image_atomic_smin a[6:7], v[2:5], s[8:15] dmask:0x3 unorm ; encoding: [0x00,0x13,0x51,0xf0,0x02,0x06,0x02,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_atomic_smin a[6:7], v[2:5], s[8:15] dmask:0x3 unorm
+
+// GFX90A: image_atomic_smin a5, v[2:5], s[8:15] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x51,0xf0,0x02,0x05,0x02,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_atomic_smin a5, v[2:5], s[8:15] dmask:0x1 unorm glc
+
+// GFX90A: image_atomic_smin a5, v[2:5], s[8:15] dmask:0x1 unorm slc ; encoding: [0x00,0x11,0x51,0xf2,0x02,0x05,0x02,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_atomic_smin a5, v[2:5], s[8:15] dmask:0x1 unorm slc
+
+// GFX90A: image_atomic_smin a5, v[2:5], s[8:15] dmask:0x1 unorm lwe ; encoding: [0x00,0x11,0x53,0xf0,0x02,0x05,0x02,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_atomic_smin a5, v[2:5], s[8:15] dmask:0x1 unorm lwe
+
+// GFX90A: image_atomic_smin a5, v[2:5], s[8:15] dmask:0x1 unorm da ; encoding: [0x00,0x51,0x51,0xf0,0x02,0x05,0x02,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_atomic_smin a5, v[2:5], s[8:15] dmask:0x1 unorm da
+
+// GFX90A: image_atomic_umin a5, v[2:5], s[8:15] dmask:0x1 unorm ; encoding: [0x00,0x11,0x55,0xf0,0x02,0x05,0x02,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_atomic_umin a5, v[2:5], s[8:15] dmask:0x1 unorm
+
+// GFX90A: image_atomic_umin a252, v[2:5], s[8:15] dmask:0x1 unorm ; encoding: [0x00,0x11,0x55,0xf0,0x02,0xfc,0x02,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_atomic_umin a252, v[2:5], s[8:15] dmask:0x1 unorm
+
+// GFX90A: image_atomic_umin a5, v[252:255], s[8:15] dmask:0x1 unorm ; encoding: [0x00,0x11,0x55,0xf0,0xfc,0x05,0x02,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_atomic_umin a5, v[252:255], s[8:15] dmask:0x1 unorm
+
+// GFX90A: image_atomic_umin a5, v[2:5], s[12:19] dmask:0x1 unorm ; encoding: [0x00,0x11,0x55,0xf0,0x02,0x05,0x03,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_atomic_umin a5, v[2:5], s[12:19] dmask:0x1 unorm
+
+// GFX90A: image_atomic_umin a5, v[2:5], s[92:99] dmask:0x1 unorm ; encoding: [0x00,0x11,0x55,0xf0,0x02,0x05,0x17,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_atomic_umin a5, v[2:5], s[92:99] dmask:0x1 unorm
+
+// GFX90A: image_atomic_umin a[6:7], v[2:5], s[8:15] dmask:0x3 unorm ; encoding: [0x00,0x13,0x55,0xf0,0x02,0x06,0x02,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_atomic_umin a[6:7], v[2:5], s[8:15] dmask:0x3 unorm
+
+// GFX90A: image_atomic_umin a5, v[2:5], s[8:15] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x55,0xf0,0x02,0x05,0x02,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_atomic_umin a5, v[2:5], s[8:15] dmask:0x1 unorm glc
+
+// GFX90A: image_atomic_umin a5, v[2:5], s[8:15] dmask:0x1 unorm slc ; encoding: [0x00,0x11,0x55,0xf2,0x02,0x05,0x02,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_atomic_umin a5, v[2:5], s[8:15] dmask:0x1 unorm slc
+
+// GFX90A: image_atomic_umin a5, v[2:5], s[8:15] dmask:0x1 unorm lwe ; encoding: [0x00,0x11,0x57,0xf0,0x02,0x05,0x02,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_atomic_umin a5, v[2:5], s[8:15] dmask:0x1 unorm lwe
+
+// GFX90A: image_atomic_umin a5, v[2:5], s[8:15] dmask:0x1 unorm da ; encoding: [0x00,0x51,0x55,0xf0,0x02,0x05,0x02,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_atomic_umin a5, v[2:5], s[8:15] dmask:0x1 unorm da
+
+// GFX90A: image_atomic_smax a5, v[2:5], s[8:15] dmask:0x1 unorm ; encoding: [0x00,0x11,0x59,0xf0,0x02,0x05,0x02,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_atomic_smax a5, v[2:5], s[8:15] dmask:0x1 unorm
+
+// GFX90A: image_atomic_smax a252, v[2:5], s[8:15] dmask:0x1 unorm ; encoding: [0x00,0x11,0x59,0xf0,0x02,0xfc,0x02,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_atomic_smax a252, v[2:5], s[8:15] dmask:0x1 unorm
+
+// GFX90A: image_atomic_smax a5, v[252:255], s[8:15] dmask:0x1 unorm ; encoding: [0x00,0x11,0x59,0xf0,0xfc,0x05,0x02,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_atomic_smax a5, v[252:255], s[8:15] dmask:0x1 unorm
+
+// GFX90A: image_atomic_smax a5, v[2:5], s[12:19] dmask:0x1 unorm ; encoding: [0x00,0x11,0x59,0xf0,0x02,0x05,0x03,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_atomic_smax a5, v[2:5], s[12:19] dmask:0x1 unorm
+
+// GFX90A: image_atomic_smax a5, v[2:5], s[92:99] dmask:0x1 unorm ; encoding: [0x00,0x11,0x59,0xf0,0x02,0x05,0x17,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_atomic_smax a5, v[2:5], s[92:99] dmask:0x1 unorm
+
+// GFX90A: image_atomic_smax a[6:7], v[2:5], s[8:15] dmask:0x3 unorm ; encoding: [0x00,0x13,0x59,0xf0,0x02,0x06,0x02,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_atomic_smax a[6:7], v[2:5], s[8:15] dmask:0x3 unorm
+
+// GFX90A: image_atomic_smax a5, v[2:5], s[8:15] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x59,0xf0,0x02,0x05,0x02,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_atomic_smax a5, v[2:5], s[8:15] dmask:0x1 unorm glc
+
+// GFX90A: image_atomic_smax a5, v[2:5], s[8:15] dmask:0x1 unorm slc ; encoding: [0x00,0x11,0x59,0xf2,0x02,0x05,0x02,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_atomic_smax a5, v[2:5], s[8:15] dmask:0x1 unorm slc
+
+// GFX90A: image_atomic_smax a5, v[2:5], s[8:15] dmask:0x1 unorm lwe ; encoding: [0x00,0x11,0x5b,0xf0,0x02,0x05,0x02,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_atomic_smax a5, v[2:5], s[8:15] dmask:0x1 unorm lwe
+
+// GFX90A: image_atomic_smax a5, v[2:5], s[8:15] dmask:0x1 unorm da ; encoding: [0x00,0x51,0x59,0xf0,0x02,0x05,0x02,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_atomic_smax a5, v[2:5], s[8:15] dmask:0x1 unorm da
+
+// GFX90A: image_atomic_umax a5, v[2:5], s[8:15] dmask:0x1 unorm ; encoding: [0x00,0x11,0x5d,0xf0,0x02,0x05,0x02,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_atomic_umax a5, v[2:5], s[8:15] dmask:0x1 unorm
+
+// GFX90A: image_atomic_umax a252, v[2:5], s[8:15] dmask:0x1 unorm ; encoding: [0x00,0x11,0x5d,0xf0,0x02,0xfc,0x02,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_atomic_umax a252, v[2:5], s[8:15] dmask:0x1 unorm
+
+// GFX90A: image_atomic_umax a5, v[252:255], s[8:15] dmask:0x1 unorm ; encoding: [0x00,0x11,0x5d,0xf0,0xfc,0x05,0x02,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_atomic_umax a5, v[252:255], s[8:15] dmask:0x1 unorm
+
+// GFX90A: image_atomic_umax a5, v[2:5], s[12:19] dmask:0x1 unorm ; encoding: [0x00,0x11,0x5d,0xf0,0x02,0x05,0x03,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_atomic_umax a5, v[2:5], s[12:19] dmask:0x1 unorm
+
+// GFX90A: image_atomic_umax a5, v[2:5], s[92:99] dmask:0x1 unorm ; encoding: [0x00,0x11,0x5d,0xf0,0x02,0x05,0x17,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_atomic_umax a5, v[2:5], s[92:99] dmask:0x1 unorm
+
+// GFX90A: image_atomic_umax a[6:7], v[2:5], s[8:15] dmask:0x3 unorm ; encoding: [0x00,0x13,0x5d,0xf0,0x02,0x06,0x02,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_atomic_umax a[6:7], v[2:5], s[8:15] dmask:0x3 unorm
+
+// GFX90A: image_atomic_umax a5, v[2:5], s[8:15] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x5d,0xf0,0x02,0x05,0x02,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_atomic_umax a5, v[2:5], s[8:15] dmask:0x1 unorm glc
+
+// GFX90A: image_atomic_umax a5, v[2:5], s[8:15] dmask:0x1 unorm slc ; encoding: [0x00,0x11,0x5d,0xf2,0x02,0x05,0x02,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_atomic_umax a5, v[2:5], s[8:15] dmask:0x1 unorm slc
+
+// GFX90A: image_atomic_umax a5, v[2:5], s[8:15] dmask:0x1 unorm lwe ; encoding: [0x00,0x11,0x5f,0xf0,0x02,0x05,0x02,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_atomic_umax a5, v[2:5], s[8:15] dmask:0x1 unorm lwe
+
+// GFX90A: image_atomic_umax a5, v[2:5], s[8:15] dmask:0x1 unorm da ; encoding: [0x00,0x51,0x5d,0xf0,0x02,0x05,0x02,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_atomic_umax a5, v[2:5], s[8:15] dmask:0x1 unorm da
+
+// GFX90A: image_atomic_and a5, v[2:5], s[8:15] dmask:0x1 unorm ; encoding: [0x00,0x11,0x61,0xf0,0x02,0x05,0x02,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_atomic_and a5, v[2:5], s[8:15] dmask:0x1 unorm
+
+// GFX90A: image_atomic_and a252, v[2:5], s[8:15] dmask:0x1 unorm ; encoding: [0x00,0x11,0x61,0xf0,0x02,0xfc,0x02,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_atomic_and a252, v[2:5], s[8:15] dmask:0x1 unorm
+
+// GFX90A: image_atomic_and a5, v[252:255], s[8:15] dmask:0x1 unorm ; encoding: [0x00,0x11,0x61,0xf0,0xfc,0x05,0x02,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_atomic_and a5, v[252:255], s[8:15] dmask:0x1 unorm
+
+// GFX90A: image_atomic_and a5, v[2:5], s[12:19] dmask:0x1 unorm ; encoding: [0x00,0x11,0x61,0xf0,0x02,0x05,0x03,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_atomic_and a5, v[2:5], s[12:19] dmask:0x1 unorm
+
+// GFX90A: image_atomic_and a5, v[2:5], s[92:99] dmask:0x1 unorm ; encoding: [0x00,0x11,0x61,0xf0,0x02,0x05,0x17,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_atomic_and a5, v[2:5], s[92:99] dmask:0x1 unorm
+
+// GFX90A: image_atomic_and a[6:7], v[2:5], s[8:15] dmask:0x3 unorm ; encoding: [0x00,0x13,0x61,0xf0,0x02,0x06,0x02,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_atomic_and a[6:7], v[2:5], s[8:15] dmask:0x3 unorm
+
+// GFX90A: image_atomic_and a5, v[2:5], s[8:15] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x61,0xf0,0x02,0x05,0x02,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_atomic_and a5, v[2:5], s[8:15] dmask:0x1 unorm glc
+
+// GFX90A: image_atomic_and a5, v[2:5], s[8:15] dmask:0x1 unorm slc ; encoding: [0x00,0x11,0x61,0xf2,0x02,0x05,0x02,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_atomic_and a5, v[2:5], s[8:15] dmask:0x1 unorm slc
+
+// GFX90A: image_atomic_and a5, v[2:5], s[8:15] dmask:0x1 unorm lwe ; encoding: [0x00,0x11,0x63,0xf0,0x02,0x05,0x02,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_atomic_and a5, v[2:5], s[8:15] dmask:0x1 unorm lwe
+
+// GFX90A: image_atomic_and a5, v[2:5], s[8:15] dmask:0x1 unorm da ; encoding: [0x00,0x51,0x61,0xf0,0x02,0x05,0x02,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_atomic_and a5, v[2:5], s[8:15] dmask:0x1 unorm da
+
+// GFX90A: image_atomic_or a5, v[2:5], s[8:15] dmask:0x1 unorm ; encoding: [0x00,0x11,0x65,0xf0,0x02,0x05,0x02,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_atomic_or a5, v[2:5], s[8:15] dmask:0x1 unorm
+
+// GFX90A: image_atomic_or a252, v[2:5], s[8:15] dmask:0x1 unorm ; encoding: [0x00,0x11,0x65,0xf0,0x02,0xfc,0x02,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_atomic_or a252, v[2:5], s[8:15] dmask:0x1 unorm
+
+// GFX90A: image_atomic_or a5, v[252:255], s[8:15] dmask:0x1 unorm ; encoding: [0x00,0x11,0x65,0xf0,0xfc,0x05,0x02,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_atomic_or a5, v[252:255], s[8:15] dmask:0x1 unorm
+
+// GFX90A: image_atomic_or a5, v[2:5], s[12:19] dmask:0x1 unorm ; encoding: [0x00,0x11,0x65,0xf0,0x02,0x05,0x03,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_atomic_or a5, v[2:5], s[12:19] dmask:0x1 unorm
+
+// GFX90A: image_atomic_or a5, v[2:5], s[92:99] dmask:0x1 unorm ; encoding: [0x00,0x11,0x65,0xf0,0x02,0x05,0x17,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_atomic_or a5, v[2:5], s[92:99] dmask:0x1 unorm
+
+// GFX90A: image_atomic_or a[6:7], v[2:5], s[8:15] dmask:0x3 unorm ; encoding: [0x00,0x13,0x65,0xf0,0x02,0x06,0x02,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_atomic_or a[6:7], v[2:5], s[8:15] dmask:0x3 unorm
+
+// GFX90A: image_atomic_or a5, v[2:5], s[8:15] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x65,0xf0,0x02,0x05,0x02,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_atomic_or a5, v[2:5], s[8:15] dmask:0x1 unorm glc
+
+// GFX90A: image_atomic_or a5, v[2:5], s[8:15] dmask:0x1 unorm slc ; encoding: [0x00,0x11,0x65,0xf2,0x02,0x05,0x02,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_atomic_or a5, v[2:5], s[8:15] dmask:0x1 unorm slc
+
+// GFX90A: image_atomic_or a5, v[2:5], s[8:15] dmask:0x1 unorm lwe ; encoding: [0x00,0x11,0x67,0xf0,0x02,0x05,0x02,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_atomic_or a5, v[2:5], s[8:15] dmask:0x1 unorm lwe
+
+// GFX90A: image_atomic_or a5, v[2:5], s[8:15] dmask:0x1 unorm da ; encoding: [0x00,0x51,0x65,0xf0,0x02,0x05,0x02,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_atomic_or a5, v[2:5], s[8:15] dmask:0x1 unorm da
+
+// GFX90A: image_atomic_xor a5, v[2:5], s[8:15] dmask:0x1 unorm ; encoding: [0x00,0x11,0x69,0xf0,0x02,0x05,0x02,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_atomic_xor a5, v[2:5], s[8:15] dmask:0x1 unorm
+
+// GFX90A: image_atomic_xor a252, v[2:5], s[8:15] dmask:0x1 unorm ; encoding: [0x00,0x11,0x69,0xf0,0x02,0xfc,0x02,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_atomic_xor a252, v[2:5], s[8:15] dmask:0x1 unorm
+
+// GFX90A: image_atomic_xor a5, v[252:255], s[8:15] dmask:0x1 unorm ; encoding: [0x00,0x11,0x69,0xf0,0xfc,0x05,0x02,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_atomic_xor a5, v[252:255], s[8:15] dmask:0x1 unorm
+
+// GFX90A: image_atomic_xor a5, v[2:5], s[12:19] dmask:0x1 unorm ; encoding: [0x00,0x11,0x69,0xf0,0x02,0x05,0x03,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_atomic_xor a5, v[2:5], s[12:19] dmask:0x1 unorm
+
+// GFX90A: image_atomic_xor a5, v[2:5], s[92:99] dmask:0x1 unorm ; encoding: [0x00,0x11,0x69,0xf0,0x02,0x05,0x17,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_atomic_xor a5, v[2:5], s[92:99] dmask:0x1 unorm
+
+// GFX90A: image_atomic_xor a[6:7], v[2:5], s[8:15] dmask:0x3 unorm ; encoding: [0x00,0x13,0x69,0xf0,0x02,0x06,0x02,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_atomic_xor a[6:7], v[2:5], s[8:15] dmask:0x3 unorm
+
+// GFX90A: image_atomic_xor a5, v[2:5], s[8:15] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x69,0xf0,0x02,0x05,0x02,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_atomic_xor a5, v[2:5], s[8:15] dmask:0x1 unorm glc
+
+// GFX90A: image_atomic_xor a5, v[2:5], s[8:15] dmask:0x1 unorm slc ; encoding: [0x00,0x11,0x69,0xf2,0x02,0x05,0x02,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_atomic_xor a5, v[2:5], s[8:15] dmask:0x1 unorm slc
+
+// GFX90A: image_atomic_xor a5, v[2:5], s[8:15] dmask:0x1 unorm lwe ; encoding: [0x00,0x11,0x6b,0xf0,0x02,0x05,0x02,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_atomic_xor a5, v[2:5], s[8:15] dmask:0x1 unorm lwe
+
+// GFX90A: image_atomic_xor a5, v[2:5], s[8:15] dmask:0x1 unorm da ; encoding: [0x00,0x51,0x69,0xf0,0x02,0x05,0x02,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_atomic_xor a5, v[2:5], s[8:15] dmask:0x1 unorm da
+
+// GFX90A: image_atomic_inc a5, v[2:5], s[8:15] dmask:0x1 unorm ; encoding: [0x00,0x11,0x6d,0xf0,0x02,0x05,0x02,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_atomic_inc a5, v[2:5], s[8:15] dmask:0x1 unorm
+
+// GFX90A: image_atomic_inc a252, v[2:5], s[8:15] dmask:0x1 unorm ; encoding: [0x00,0x11,0x6d,0xf0,0x02,0xfc,0x02,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_atomic_inc a252, v[2:5], s[8:15] dmask:0x1 unorm
+
+// GFX90A: image_atomic_inc a5, v[252:255], s[8:15] dmask:0x1 unorm ; encoding: [0x00,0x11,0x6d,0xf0,0xfc,0x05,0x02,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_atomic_inc a5, v[252:255], s[8:15] dmask:0x1 unorm
+
+// GFX90A: image_atomic_inc a5, v[2:5], s[12:19] dmask:0x1 unorm ; encoding: [0x00,0x11,0x6d,0xf0,0x02,0x05,0x03,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_atomic_inc a5, v[2:5], s[12:19] dmask:0x1 unorm
+
+// GFX90A: image_atomic_inc a5, v[2:5], s[92:99] dmask:0x1 unorm ; encoding: [0x00,0x11,0x6d,0xf0,0x02,0x05,0x17,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_atomic_inc a5, v[2:5], s[92:99] dmask:0x1 unorm
+
+// GFX90A: image_atomic_inc a[6:7], v[2:5], s[8:15] dmask:0x3 unorm ; encoding: [0x00,0x13,0x6d,0xf0,0x02,0x06,0x02,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_atomic_inc a[6:7], v[2:5], s[8:15] dmask:0x3 unorm
+
+// GFX90A: image_atomic_inc a5, v[2:5], s[8:15] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x6d,0xf0,0x02,0x05,0x02,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_atomic_inc a5, v[2:5], s[8:15] dmask:0x1 unorm glc
+
+// GFX90A: image_atomic_inc a5, v[2:5], s[8:15] dmask:0x1 unorm slc ; encoding: [0x00,0x11,0x6d,0xf2,0x02,0x05,0x02,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_atomic_inc a5, v[2:5], s[8:15] dmask:0x1 unorm slc
+
+// GFX90A: image_atomic_inc a5, v[2:5], s[8:15] dmask:0x1 unorm lwe ; encoding: [0x00,0x11,0x6f,0xf0,0x02,0x05,0x02,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_atomic_inc a5, v[2:5], s[8:15] dmask:0x1 unorm lwe
+
+// GFX90A: image_atomic_inc a5, v[2:5], s[8:15] dmask:0x1 unorm da ; encoding: [0x00,0x51,0x6d,0xf0,0x02,0x05,0x02,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_atomic_inc a5, v[2:5], s[8:15] dmask:0x1 unorm da
+
+// GFX90A: image_atomic_dec a5, v[2:5], s[8:15] dmask:0x1 unorm ; encoding: [0x00,0x11,0x71,0xf0,0x02,0x05,0x02,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_atomic_dec a5, v[2:5], s[8:15] dmask:0x1 unorm
+
+// GFX90A: image_atomic_dec a252, v[2:5], s[8:15] dmask:0x1 unorm ; encoding: [0x00,0x11,0x71,0xf0,0x02,0xfc,0x02,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_atomic_dec a252, v[2:5], s[8:15] dmask:0x1 unorm
+
+// GFX90A: image_atomic_dec a5, v[252:255], s[8:15] dmask:0x1 unorm ; encoding: [0x00,0x11,0x71,0xf0,0xfc,0x05,0x02,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_atomic_dec a5, v[252:255], s[8:15] dmask:0x1 unorm
+
+// GFX90A: image_atomic_dec a5, v[2:5], s[12:19] dmask:0x1 unorm ; encoding: [0x00,0x11,0x71,0xf0,0x02,0x05,0x03,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_atomic_dec a5, v[2:5], s[12:19] dmask:0x1 unorm
+
+// GFX90A: image_atomic_dec a5, v[2:5], s[92:99] dmask:0x1 unorm ; encoding: [0x00,0x11,0x71,0xf0,0x02,0x05,0x17,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_atomic_dec a5, v[2:5], s[92:99] dmask:0x1 unorm
+
+// GFX90A: image_atomic_dec a[6:7], v[2:5], s[8:15] dmask:0x3 unorm ; encoding: [0x00,0x13,0x71,0xf0,0x02,0x06,0x02,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_atomic_dec a[6:7], v[2:5], s[8:15] dmask:0x3 unorm
+
+// GFX90A: image_atomic_dec a5, v[2:5], s[8:15] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x71,0xf0,0x02,0x05,0x02,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_atomic_dec a5, v[2:5], s[8:15] dmask:0x1 unorm glc
+
+// GFX90A: image_atomic_dec a5, v[2:5], s[8:15] dmask:0x1 unorm slc ; encoding: [0x00,0x11,0x71,0xf2,0x02,0x05,0x02,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_atomic_dec a5, v[2:5], s[8:15] dmask:0x1 unorm slc
+
+// GFX90A: image_atomic_dec a5, v[2:5], s[8:15] dmask:0x1 unorm lwe ; encoding: [0x00,0x11,0x73,0xf0,0x02,0x05,0x02,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_atomic_dec a5, v[2:5], s[8:15] dmask:0x1 unorm lwe
+
+// GFX90A: image_atomic_dec a5, v[2:5], s[8:15] dmask:0x1 unorm da ; encoding: [0x00,0x51,0x71,0xf0,0x02,0x05,0x02,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_atomic_dec a5, v[2:5], s[8:15] dmask:0x1 unorm da
+
+// GFX90A: image_sample a5, v[0:3], s[8:15], s[12:15] dmask:0x1 ; encoding: [0x00,0x01,0x81,0xf0,0x00,0x05,0x62,0x00]
+// NOT-GFX90A: error: operands are not valid for this GPU or mode
+image_sample a5, v[0:3], s[8:15], s[12:15] dmask:0x1

diff  --git a/llvm/test/MC/AMDGPU/hsa-diag-v3.s b/llvm/test/MC/AMDGPU/hsa-diag-v3.s
index a99cd937f1b5..e255e6eb2050 100644
--- a/llvm/test/MC/AMDGPU/hsa-diag-v3.s
+++ b/llvm/test/MC/AMDGPU/hsa-diag-v3.s
@@ -1,6 +1,7 @@
 // RUN: not llvm-mc -triple amdgcn-amd-amdhsa -mcpu=gfx803 -mattr=+xnack -show-encoding %s 2>&1 >/dev/null | FileCheck %s --check-prefixes=GCN,GFX8,NONGFX10,AMDHSA
 // RUN: not llvm-mc -triple amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+xnack -show-encoding %s 2>&1 >/dev/null | FileCheck %s --check-prefixes=GCN,GFX10,AMDHSA
 // RUN: not llvm-mc -triple amdgcn-amd- -mcpu=gfx803 -mattr=+xnack -show-encoding %s 2>&1 >/dev/null | FileCheck %s --check-prefixes=GCN,NONAMDHSA
+// RUN: not llvm-mc --amdhsa-code-object-version=3 -triple amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+xnack -show-encoding %s 2>&1 >/dev/null | FileCheck %s --check-prefixes=GFX90A,NONGFX10,AMDHSA,ALL
 
 .text
 
@@ -18,7 +19,6 @@
 .end_amdhsa_kernel
 
 // GCN-LABEL: warning: test_amdhsa_kernel_empty
-// AMDHSA-NOT: error: unknown directive
 // NONAMDHSA: error: unknown directive
 .warning "test_amdhsa_kernel_empty"
 .amdhsa_kernel test_amdhsa_kernel_empty
@@ -72,7 +72,88 @@
   .amdhsa_next_free_vgpr 0
 .end_amdhsa_kernel
 
-// GCN-LABEL: warning: test_amdhsa_wavefront_size32
+// ALL-LABEL: warning: test_amdhsa_accum_offset
+// NONGFX9A: error: directive requires gfx90a+
+// GFX90A: error: .amdhsa_next_free_vgpr directive is required
+// NONAMDHSA: error: unknown directive
+.warning "test_amdhsa_accum_offset"
+.amdhsa_kernel test_amdhsa_accum_offset
+  .amdhsa_accum_offset 4
+.end_amdhsa_kernel
+
+// ALL-LABEL: warning: test_amdhsa_accum_offset_missing
+// NONGFX9A: error: directive requires gfx90a+
+// GFX90A: error: .amdhsa_accum_offset directive is required
+// NONAMDHSA: error: unknown directive
+.warning "test_amdhsa_accum_offset_missing"
+.amdhsa_kernel test_amdhsa_accum_offset_missing
+  .amdhsa_next_free_sgpr 0
+  .amdhsa_next_free_vgpr 0
+.end_amdhsa_kernel
+
+// ALL-LABEL: warning: test_amdhsa_accum_offset_invalid0
+// NONGFX9A: error: directive requires gfx90a+
+// GFX90A: error: accum_offset should be in range [4..256] in increments of 4
+// NONAMDHSA: error: unknown directive
+.warning "test_amdhsa_accum_offset_invalid0"
+.amdhsa_kernel test_amdhsa_accum_offset_invalid0
+  .amdhsa_next_free_sgpr 0
+  .amdhsa_next_free_vgpr 0
+  .amdhsa_accum_offset 0
+.end_amdhsa_kernel
+
+// ALL-LABEL: warning: test_amdhsa_accum_offset_invalid5
+// NONGFX9A: error: directive requires gfx90a+
+// GFX90A: error: accum_offset should be in range [4..256] in increments of 4
+// NONAMDHSA: error: unknown directive
+.warning "test_amdhsa_accum_offset_invalid5"
+.amdhsa_kernel test_amdhsa_accum_offset_invalid5
+  .amdhsa_next_free_sgpr 0
+  .amdhsa_next_free_vgpr 0
+  .amdhsa_accum_offset 5
+.end_amdhsa_kernel
+
+// ALL-LABEL: warning: test_amdhsa_accum_offset_invalid257
+// NONGFX9A: error: directive requires gfx90a+
+// GFX90A: error: accum_offset should be in range [4..256] in increments of 4
+// NONAMDHSA: error: unknown directive
+.warning "test_amdhsa_accum_offset_invalid257"
+.amdhsa_kernel test_amdhsa_accum_offset_invalid257
+  .amdhsa_next_free_sgpr 0
+  .amdhsa_next_free_vgpr 0
+  .amdhsa_accum_offset 257
+.end_amdhsa_kernel
+
+// ALL-LABEL: warning: test_amdhsa_accum_offset_invalid8
+// NONGFX9A: error: directive requires gfx90a+
+// GFX90A: error: accum_offset exceeds total VGPR allocation
+// NONAMDHSA: error: unknown directive
+.warning "test_amdhsa_accum_offset_invalid8"
+.amdhsa_kernel test_amdhsa_accum_offset_invalid8
+  .amdhsa_next_free_sgpr 0
+  .amdhsa_next_free_vgpr 0
+  .amdhsa_accum_offset 8
+.end_amdhsa_kernel
+
+// ALL-LABEL: warning: test_amdhsa_tg_split
+// NONGFX90A: error: directive requires gfx90a+
+// GFX90A: error: .amdhsa_next_free_vgpr directive is required
+// NONAMDHSA: error: unknown directive
+.warning "test_amdhsa_tg_split"
+.amdhsa_kernel test_amdhsa_tg_split
+  .amdhsa_tg_split 1
+.end_amdhsa_kernel
+
+// ALL-LABEL: warning: test_amdhsa_tg_split_invalid
+// NONGFX90A: error: directive requires gfx90a+
+// GFX90A: error: value out of range
+// NONAMDHSA: error: unknown directive
+.warning "test_amdhsa_tg_split_invalid"
+.amdhsa_kernel test_amdhsa_tg_split_invalid
+  .amdhsa_tg_split 5
+.end_amdhsa_kernel
+
+// ALL-LABEL: warning: test_amdhsa_wavefront_size32
 // NONGFX10: error: directive requires gfx10+
 // GFX10: error: .amdhsa_next_free_vgpr directive is required
 // NONAMDHSA: error: unknown directive

diff  --git a/llvm/test/MC/AMDGPU/hsa-gfx90a-v3.s b/llvm/test/MC/AMDGPU/hsa-gfx90a-v3.s
new file mode 100644
index 000000000000..68c42c046f65
--- /dev/null
+++ b/llvm/test/MC/AMDGPU/hsa-gfx90a-v3.s
@@ -0,0 +1,179 @@
+// RUN: llvm-mc --amdhsa-code-object-version=3 -triple amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+xnack < %s | FileCheck --check-prefix=ASM %s
+// RUN: llvm-mc --amdhsa-code-object-version=3 -triple amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+xnack -filetype=obj < %s > %t
+// RUN: llvm-readobj -elf-output-style=GNU -sections -symbols -relocations %t | FileCheck --check-prefix=READOBJ %s
+// RUN: llvm-objdump -s -j .rodata %t | FileCheck --check-prefix=OBJDUMP %s
+
+// big endian not supported
+// XFAIL: powerpc-, powerpc64-, s390x, mips-, mips64-, sparc
+
+// READOBJ: Section Headers
+// READOBJ: .text   PROGBITS {{[0-9a-f]+}} {{[0-9a-f]+}} {{[0-9a-f]+}} {{[0-9]+}} AX {{[0-9]+}} {{[0-9]+}} 256
+// READOBJ: .rodata PROGBITS {{[0-9a-f]+}} {{[0-9a-f]+}}        000080 {{[0-9]+}}  A {{[0-9]+}} {{[0-9]+}} 64
+
+// READOBJ: Relocation section '.rela.rodata' at offset
+// READOBJ: 0000000000000010 {{[0-9a-f]+}}00000005 R_AMDGPU_REL64 0000000000000000 .text + 10
+// READOBJ: 0000000000000050 {{[0-9a-f]+}}00000005 R_AMDGPU_REL64 0000000000000000 .text + 110
+
+// READOBJ: Symbol table '.symtab' contains {{[0-9]+}} entries:
+// READOBJ-DAG: {{[0-9]+}}: 0000000000000100  0 FUNC    LOCAL  PROTECTED 2 complete
+// READOBJ-DAG: {{[0-9]+}}: 0000000000000040 64 OBJECT  LOCAL  DEFAULT   3 complete.kd
+// READOBJ-DAG: {{[0-9]+}}: 0000000000000000  0 FUNC    LOCAL  PROTECTED 2 minimal
+// READOBJ-DAG: {{[0-9]+}}: 0000000000000000 64 OBJECT  LOCAL  DEFAULT   3 minimal.kd
+
+// OBJDUMP: Contents of section .rodata
+// Note, relocation for KERNEL_CODE_ENTRY_BYTE_OFFSET is not resolved here.
+// minimal
+// OBJDUMP-NEXT: 0000 00000000 00000000 00000000 00000000
+// OBJDUMP-NEXT: 0010 00000000 00000000 00000000 00000000
+// OBJDUMP-NEXT: 0020 00000000 00000000 00000000 00000000
+// OBJDUMP-NEXT: 0030 0000ac00 80000000 00000000 00000000
+// complete
+// OBJDUMP-NEXT: 0040 01000000 01000000 00000000 00000000
+// OBJDUMP-NEXT: 0050 00000000 00000000 00000000 00000000
+// OBJDUMP-NEXT: 0060 00000000 00000000 00000000 00000100
+// OBJDUMP-NEXT: 0070 c1500104 1f0f007f 7f000000 00000000
+
+.text
+// ASM: .text
+
+.amdgcn_target "amdgcn-amd-amdhsa--gfx90a+xnack"
+// ASM: .amdgcn_target "amdgcn-amd-amdhsa--gfx90a+xnack"
+
+.p2align 8
+.type minimal, at function
+minimal:
+  s_endpgm
+
+.p2align 8
+.type complete, at function
+complete:
+  s_endpgm
+
+.rodata
+// ASM: .rodata
+
+// Test that only specifying required directives is allowed, and that defaulted
+// values are omitted.
+.p2align 6
+.amdhsa_kernel minimal
+  .amdhsa_next_free_vgpr 0
+  .amdhsa_next_free_sgpr 0
+  .amdhsa_accum_offset 4
+.end_amdhsa_kernel
+
+// ASM: .amdhsa_kernel minimal
+// ASM: .amdhsa_next_free_vgpr 0
+// ASM-NEXT: .amdhsa_next_free_sgpr 0
+// ASM-NEXT: .amdhsa_accum_offset 4
+// ASM: .amdhsa_tg_split 0
+// ASM: .end_amdhsa_kernel
+
+// Test that we can specify all available directives with non-default values.
+.p2align 6
+.amdhsa_kernel complete
+  .amdhsa_group_segment_fixed_size 1
+  .amdhsa_private_segment_fixed_size 1
+  .amdhsa_user_sgpr_private_segment_buffer 1
+  .amdhsa_user_sgpr_dispatch_ptr 1
+  .amdhsa_user_sgpr_queue_ptr 1
+  .amdhsa_user_sgpr_kernarg_segment_ptr 1
+  .amdhsa_user_sgpr_dispatch_id 1
+  .amdhsa_user_sgpr_flat_scratch_init 1
+  .amdhsa_user_sgpr_private_segment_size 1
+  .amdhsa_system_sgpr_private_segment_wavefront_offset 1
+  .amdhsa_system_sgpr_workgroup_id_x 0
+  .amdhsa_system_sgpr_workgroup_id_y 1
+  .amdhsa_system_sgpr_workgroup_id_z 1
+  .amdhsa_system_sgpr_workgroup_info 1
+  .amdhsa_system_vgpr_workitem_id 1
+  .amdhsa_next_free_vgpr 9
+  .amdhsa_next_free_sgpr 27
+  .amdhsa_accum_offset 4
+  .amdhsa_reserve_vcc 0
+  .amdhsa_reserve_flat_scratch 0
+  .amdhsa_float_round_mode_32 1
+  .amdhsa_float_round_mode_16_64 1
+  .amdhsa_float_denorm_mode_32 1
+  .amdhsa_float_denorm_mode_16_64 0
+  .amdhsa_dx10_clamp 0
+  .amdhsa_ieee_mode 0
+  .amdhsa_fp16_overflow 1
+  .amdhsa_tg_split 1
+  .amdhsa_exception_fp_ieee_invalid_op 1
+  .amdhsa_exception_fp_denorm_src 1
+  .amdhsa_exception_fp_ieee_div_zero 1
+  .amdhsa_exception_fp_ieee_overflow 1
+  .amdhsa_exception_fp_ieee_underflow 1
+  .amdhsa_exception_fp_ieee_inexact 1
+  .amdhsa_exception_int_div_zero 1
+.end_amdhsa_kernel
+
+// ASM: .amdhsa_kernel complete
+// ASM-NEXT: .amdhsa_group_segment_fixed_size 1
+// ASM-NEXT: .amdhsa_private_segment_fixed_size 1
+// ASM-NEXT: .amdhsa_user_sgpr_private_segment_buffer 1
+// ASM-NEXT: .amdhsa_user_sgpr_dispatch_ptr 1
+// ASM-NEXT: .amdhsa_user_sgpr_queue_ptr 1
+// ASM-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 1
+// ASM-NEXT: .amdhsa_user_sgpr_dispatch_id 1
+// ASM-NEXT: .amdhsa_user_sgpr_flat_scratch_init 1
+// ASM-NEXT: .amdhsa_user_sgpr_private_segment_size 1
+// ASM-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset 1
+// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_x 0
+// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_y 1
+// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_z 1
+// ASM-NEXT: .amdhsa_system_sgpr_workgroup_info 1
+// ASM-NEXT: .amdhsa_system_vgpr_workitem_id 1
+// ASM-NEXT: .amdhsa_next_free_vgpr 9
+// ASM-NEXT: .amdhsa_next_free_sgpr 27
+// ASM-NEXT: .amdhsa_accum_offset 4
+// ASM-NEXT: .amdhsa_reserve_vcc 0
+// ASM-NEXT: .amdhsa_reserve_flat_scratch 0
+// ASM-NEXT: .amdhsa_float_round_mode_32 1
+// ASM-NEXT: .amdhsa_float_round_mode_16_64 1
+// ASM-NEXT: .amdhsa_float_denorm_mode_32 1
+// ASM-NEXT: .amdhsa_float_denorm_mode_16_64 0
+// ASM-NEXT: .amdhsa_dx10_clamp 0
+// ASM-NEXT: .amdhsa_ieee_mode 0
+// ASM-NEXT: .amdhsa_fp16_overflow 1
+// ASM-NEXT: .amdhsa_tg_split 1
+// ASM-NEXT: .amdhsa_exception_fp_ieee_invalid_op 1
+// ASM-NEXT: .amdhsa_exception_fp_denorm_src 1
+// ASM-NEXT: .amdhsa_exception_fp_ieee_div_zero 1
+// ASM-NEXT: .amdhsa_exception_fp_ieee_overflow 1
+// ASM-NEXT: .amdhsa_exception_fp_ieee_underflow 1
+// ASM-NEXT: .amdhsa_exception_fp_ieee_inexact 1
+// ASM-NEXT: .amdhsa_exception_int_div_zero 1
+// ASM-NEXT: .end_amdhsa_kernel
+
+.section .foo
+
+.byte .amdgcn.gfx_generation_number
+// ASM: .byte 9
+
+.byte .amdgcn.next_free_vgpr
+// ASM: .byte 0
+.byte .amdgcn.next_free_sgpr
+// ASM: .byte 0
+
+v_mov_b32_e32 v7, s10
+
+.byte .amdgcn.next_free_vgpr
+// ASM: .byte 8
+.byte .amdgcn.next_free_sgpr
+// ASM: .byte 11
+
+.set .amdgcn.next_free_vgpr, 0
+.set .amdgcn.next_free_sgpr, 0
+
+.byte .amdgcn.next_free_vgpr
+// ASM: .byte 0
+.byte .amdgcn.next_free_sgpr
+// ASM: .byte 0
+
+v_mov_b32_e32 v16, s3
+
+.byte .amdgcn.next_free_vgpr
+// ASM: .byte 17
+.byte .amdgcn.next_free_sgpr
+// ASM: .byte 4

diff  --git a/llvm/test/MC/AMDGPU/mai-gfx90a.s b/llvm/test/MC/AMDGPU/mai-gfx90a.s
new file mode 100644
index 000000000000..14cec6a05730
--- /dev/null
+++ b/llvm/test/MC/AMDGPU/mai-gfx90a.s
@@ -0,0 +1,2518 @@
+// RUN: llvm-mc -arch=amdgcn -mcpu=gfx90a -show-encoding %s | FileCheck -check-prefix=GFX90A %s
+
+v_accvgpr_read_b32 v2, a0
+// GFX90A: v_accvgpr_read_b32 v2, a0       ; encoding: [0x02,0x40,0xd8,0xd3,0x00,0x01,0x00,0x18]
+
+v_accvgpr_read_b32 v2, a1
+// GFX90A: v_accvgpr_read_b32 v2, a1       ; encoding: [0x02,0x40,0xd8,0xd3,0x01,0x01,0x00,0x18]
+
+v_accvgpr_read_b32 v2, a255
+// GFX90A: v_accvgpr_read_b32 v2, a255     ; encoding: [0x02,0x40,0xd8,0xd3,0xff,0x01,0x00,0x18]
+
+v_accvgpr_read v2, a10
+// GFX90A: v_accvgpr_read_b32 v2, a10      ; encoding: [0x02,0x40,0xd8,0xd3,0x0a,0x01,0x00,0x18]
+
+v_accvgpr_write_b32 a2, -2.0
+// GFX90A: v_accvgpr_write_b32 a2, -2.0    ; encoding: [0x02,0x40,0xd9,0xd3,0xf5,0x00,0x00,0x18]
+
+v_accvgpr_write_b32 a2, -2
+// GFX90A: v_accvgpr_write_b32 a2, -2      ; encoding: [0x02,0x40,0xd9,0xd3,0xc2,0x00,0x00,0x18]
+
+v_accvgpr_write_b32 a2, v1
+// GFX90A: v_accvgpr_write_b32 a2, v1      ; encoding: [0x02,0x40,0xd9,0xd3,0x01,0x01,0x00,0x18]
+
+v_accvgpr_write a2, v255
+// GFX90A: v_accvgpr_write_b32 a2, v255    ; encoding: [0x02,0x40,0xd9,0xd3,0xff,0x01,0x00,0x18]
+
+v_accvgpr_mov_b32 a1, a2
+// GFX90A: v_accvgpr_mov_b32 a1, a2        ; encoding: [0x02,0xa5,0x02,0x7e]
+
+v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[2:33]
+// GFX90A: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[2:33] ; encoding: [0x00,0x80,0xc0,0xd3,0x00,0x03,0x0a,0x04]
+
+v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[2:33] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc0,0xd3,0x00,0x03,0x0a,0xe4]
+
+v_mfma_f32_32x32x1f32 a[0:31], v0, a1, a[2:33]
+// GFX90A: v_mfma_f32_32x32x1f32 a[0:31], v0, a1, a[2:33] ; encoding: [0x00,0x80,0xc0,0xd3,0x00,0x03,0x0a,0x14]
+
+v_mfma_f32_32x32x1f32 a[0:31], v0, a1, a[2:33] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_32x32x1f32 a[0:31], v0, a1, a[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc0,0xd3,0x00,0x03,0x0a,0xf4]
+
+v_mfma_f32_32x32x1f32 a[0:31], a0, v1, a[2:33]
+// GFX90A: v_mfma_f32_32x32x1f32 a[0:31], a0, v1, a[2:33] ; encoding: [0x00,0x80,0xc0,0xd3,0x00,0x03,0x0a,0x0c]
+
+v_mfma_f32_32x32x1f32 a[0:31], a0, v1, a[2:33] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_32x32x1f32 a[0:31], a0, v1, a[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc0,0xd3,0x00,0x03,0x0a,0xec]
+
+v_mfma_f32_32x32x1f32 a[0:31], a0, a1, a[2:33]
+// GFX90A: v_mfma_f32_32x32x1f32 a[0:31], a0, a1, a[2:33] ; encoding: [0x00,0x80,0xc0,0xd3,0x00,0x03,0x0a,0x1c]
+
+v_mfma_f32_32x32x1f32 a[0:31], a0, a1, a[2:33] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_32x32x1f32 a[0:31], a0, a1, a[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc0,0xd3,0x00,0x03,0x0a,0xfc]
+
+v_mfma_f32_32x32x1f32 v[0:31], v0, v1, v[2:33]
+// GFX90A: v_mfma_f32_32x32x1f32 v[0:31], v0, v1, v[2:33] ; encoding: [0x00,0x00,0xc0,0xd3,0x00,0x03,0x0a,0x04]
+
+v_mfma_f32_32x32x1f32 v[0:31], v0, v1, v[2:33] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_32x32x1f32 v[0:31], v0, v1, v[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc0,0xd3,0x00,0x03,0x0a,0xe4]
+
+v_mfma_f32_32x32x1f32 v[0:31], v0, a1, v[2:33]
+// GFX90A: v_mfma_f32_32x32x1f32 v[0:31], v0, a1, v[2:33] ; encoding: [0x00,0x00,0xc0,0xd3,0x00,0x03,0x0a,0x14]
+
+v_mfma_f32_32x32x1f32 v[0:31], v0, a1, v[2:33] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_32x32x1f32 v[0:31], v0, a1, v[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc0,0xd3,0x00,0x03,0x0a,0xf4]
+
+v_mfma_f32_32x32x1f32 v[0:31], a0, v1, v[2:33]
+// GFX90A: v_mfma_f32_32x32x1f32 v[0:31], a0, v1, v[2:33] ; encoding: [0x00,0x00,0xc0,0xd3,0x00,0x03,0x0a,0x0c]
+
+v_mfma_f32_32x32x1f32 v[0:31], a0, v1, v[2:33] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_32x32x1f32 v[0:31], a0, v1, v[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc0,0xd3,0x00,0x03,0x0a,0xec]
+
+v_mfma_f32_32x32x1f32 v[0:31], a0, a1, v[2:33]
+// GFX90A: v_mfma_f32_32x32x1f32 v[0:31], a0, a1, v[2:33] ; encoding: [0x00,0x00,0xc0,0xd3,0x00,0x03,0x0a,0x1c]
+
+v_mfma_f32_32x32x1f32 v[0:31], a0, a1, v[2:33] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_32x32x1f32 v[0:31], a0, a1, v[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc0,0xd3,0x00,0x03,0x0a,0xfc]
+
+v_mfma_f32_32x32x1f32 a[0:31], v0, v1, -2.0
+// GFX90A: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, -2.0 ; encoding: [0x00,0x80,0xc0,0xd3,0x00,0x03,0xd6,0x03]
+
+v_mfma_f32_32x32x1f32 v[0:31], v0, v1, -2.0
+// GFX90A: v_mfma_f32_32x32x1f32 v[0:31], v0, v1, -2.0 ; encoding: [0x00,0x00,0xc0,0xd3,0x00,0x03,0xd6,0x03]
+
+v_mfma_f32_32x32x1f32 a[0:31], v0, v1, -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc0,0xd3,0x00,0x03,0xd6,0xe3]
+
+v_mfma_f32_32x32x1f32 v[0:31], v0, v1, -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_32x32x1f32 v[0:31], v0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc0,0xd3,0x00,0x03,0xd6,0xe3]
+
+v_mfma_f32_32x32x1f32 a[0:31], v0, a1, -2.0
+// GFX90A: v_mfma_f32_32x32x1f32 a[0:31], v0, a1, -2.0 ; encoding: [0x00,0x80,0xc0,0xd3,0x00,0x03,0xd6,0x13]
+
+v_mfma_f32_32x32x1f32 v[0:31], v0, a1, -2.0
+// GFX90A: v_mfma_f32_32x32x1f32 v[0:31], v0, a1, -2.0 ; encoding: [0x00,0x00,0xc0,0xd3,0x00,0x03,0xd6,0x13]
+
+v_mfma_f32_32x32x1f32 a[0:31], v0, a1, -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_32x32x1f32 a[0:31], v0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc0,0xd3,0x00,0x03,0xd6,0xf3]
+
+v_mfma_f32_32x32x1f32 v[0:31], v0, a1, -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_32x32x1f32 v[0:31], v0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc0,0xd3,0x00,0x03,0xd6,0xf3]
+
+v_mfma_f32_32x32x1f32 a[0:31], a0, v1, -2.0
+// GFX90A: v_mfma_f32_32x32x1f32 a[0:31], a0, v1, -2.0 ; encoding: [0x00,0x80,0xc0,0xd3,0x00,0x03,0xd6,0x0b]
+
+v_mfma_f32_32x32x1f32 v[0:31], a0, v1, -2.0
+// GFX90A: v_mfma_f32_32x32x1f32 v[0:31], a0, v1, -2.0 ; encoding: [0x00,0x00,0xc0,0xd3,0x00,0x03,0xd6,0x0b]
+
+v_mfma_f32_32x32x1f32 a[0:31], a0, v1, -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_32x32x1f32 a[0:31], a0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc0,0xd3,0x00,0x03,0xd6,0xeb]
+
+v_mfma_f32_32x32x1f32 v[0:31], a0, v1, -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_32x32x1f32 v[0:31], a0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc0,0xd3,0x00,0x03,0xd6,0xeb]
+
+v_mfma_f32_32x32x1f32 a[0:31], a0, a1, -2.0
+// GFX90A: v_mfma_f32_32x32x1f32 a[0:31], a0, a1, -2.0 ; encoding: [0x00,0x80,0xc0,0xd3,0x00,0x03,0xd6,0x1b]
+
+v_mfma_f32_32x32x1f32 v[0:31], a0, a1, -2.0
+// GFX90A: v_mfma_f32_32x32x1f32 v[0:31], a0, a1, -2.0 ; encoding: [0x00,0x00,0xc0,0xd3,0x00,0x03,0xd6,0x1b]
+
+v_mfma_f32_32x32x1f32 a[0:31], a0, a1, -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_32x32x1f32 a[0:31], a0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc0,0xd3,0x00,0x03,0xd6,0xfb]
+
+v_mfma_f32_32x32x1f32 v[0:31], a0, a1, -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_32x32x1f32 v[0:31], a0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc0,0xd3,0x00,0x03,0xd6,0xfb]
+
+v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[2:17]
+// GFX90A: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[2:17] ; encoding: [0x00,0x80,0xc1,0xd3,0x00,0x03,0x0a,0x04]
+
+v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[2:17] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc1,0xd3,0x00,0x03,0x0a,0xe4]
+
+v_mfma_f32_16x16x1f32 a[0:15], v0, a1, a[2:17]
+// GFX90A: v_mfma_f32_16x16x1f32 a[0:15], v0, a1, a[2:17] ; encoding: [0x00,0x80,0xc1,0xd3,0x00,0x03,0x0a,0x14]
+
+v_mfma_f32_16x16x1f32 a[0:15], v0, a1, a[2:17] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_16x16x1f32 a[0:15], v0, a1, a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc1,0xd3,0x00,0x03,0x0a,0xf4]
+
+v_mfma_f32_16x16x1f32 a[0:15], a0, v1, a[2:17]
+// GFX90A: v_mfma_f32_16x16x1f32 a[0:15], a0, v1, a[2:17] ; encoding: [0x00,0x80,0xc1,0xd3,0x00,0x03,0x0a,0x0c]
+
+v_mfma_f32_16x16x1f32 a[0:15], a0, v1, a[2:17] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_16x16x1f32 a[0:15], a0, v1, a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc1,0xd3,0x00,0x03,0x0a,0xec]
+
+v_mfma_f32_16x16x1f32 a[0:15], a0, a1, a[2:17]
+// GFX90A: v_mfma_f32_16x16x1f32 a[0:15], a0, a1, a[2:17] ; encoding: [0x00,0x80,0xc1,0xd3,0x00,0x03,0x0a,0x1c]
+
+v_mfma_f32_16x16x1f32 a[0:15], a0, a1, a[2:17] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_16x16x1f32 a[0:15], a0, a1, a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc1,0xd3,0x00,0x03,0x0a,0xfc]
+
+v_mfma_f32_16x16x1f32 v[0:15], v0, v1, v[2:17]
+// GFX90A: v_mfma_f32_16x16x1f32 v[0:15], v0, v1, v[2:17] ; encoding: [0x00,0x00,0xc1,0xd3,0x00,0x03,0x0a,0x04]
+
+v_mfma_f32_16x16x1f32 v[0:15], v0, v1, v[2:17] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_16x16x1f32 v[0:15], v0, v1, v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc1,0xd3,0x00,0x03,0x0a,0xe4]
+
+v_mfma_f32_16x16x1f32 v[0:15], v0, a1, v[2:17]
+// GFX90A: v_mfma_f32_16x16x1f32 v[0:15], v0, a1, v[2:17] ; encoding: [0x00,0x00,0xc1,0xd3,0x00,0x03,0x0a,0x14]
+
+v_mfma_f32_16x16x1f32 v[0:15], v0, a1, v[2:17] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_16x16x1f32 v[0:15], v0, a1, v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc1,0xd3,0x00,0x03,0x0a,0xf4]
+
+v_mfma_f32_16x16x1f32 v[0:15], a0, v1, v[2:17]
+// GFX90A: v_mfma_f32_16x16x1f32 v[0:15], a0, v1, v[2:17] ; encoding: [0x00,0x00,0xc1,0xd3,0x00,0x03,0x0a,0x0c]
+
+v_mfma_f32_16x16x1f32 v[0:15], a0, v1, v[2:17] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_16x16x1f32 v[0:15], a0, v1, v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc1,0xd3,0x00,0x03,0x0a,0xec]
+
+v_mfma_f32_16x16x1f32 v[0:15], a0, a1, v[2:17]
+// GFX90A: v_mfma_f32_16x16x1f32 v[0:15], a0, a1, v[2:17] ; encoding: [0x00,0x00,0xc1,0xd3,0x00,0x03,0x0a,0x1c]
+
+v_mfma_f32_16x16x1f32 v[0:15], a0, a1, v[2:17] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_16x16x1f32 v[0:15], a0, a1, v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc1,0xd3,0x00,0x03,0x0a,0xfc]
+
+v_mfma_f32_16x16x1f32 a[0:15], v0, v1, -2.0
+// GFX90A: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, -2.0 ; encoding: [0x00,0x80,0xc1,0xd3,0x00,0x03,0xd6,0x03]
+
+v_mfma_f32_16x16x1f32 v[0:15], v0, v1, -2.0
+// GFX90A: v_mfma_f32_16x16x1f32 v[0:15], v0, v1, -2.0 ; encoding: [0x00,0x00,0xc1,0xd3,0x00,0x03,0xd6,0x03]
+
+v_mfma_f32_16x16x1f32 a[0:15], v0, v1, -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc1,0xd3,0x00,0x03,0xd6,0xe3]
+
+v_mfma_f32_16x16x1f32 v[0:15], v0, v1, -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_16x16x1f32 v[0:15], v0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc1,0xd3,0x00,0x03,0xd6,0xe3]
+
+v_mfma_f32_16x16x1f32 a[0:15], v0, a1, -2.0
+// GFX90A: v_mfma_f32_16x16x1f32 a[0:15], v0, a1, -2.0 ; encoding: [0x00,0x80,0xc1,0xd3,0x00,0x03,0xd6,0x13]
+
+v_mfma_f32_16x16x1f32 v[0:15], v0, a1, -2.0
+// GFX90A: v_mfma_f32_16x16x1f32 v[0:15], v0, a1, -2.0 ; encoding: [0x00,0x00,0xc1,0xd3,0x00,0x03,0xd6,0x13]
+
+v_mfma_f32_16x16x1f32 a[0:15], v0, a1, -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_16x16x1f32 a[0:15], v0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc1,0xd3,0x00,0x03,0xd6,0xf3]
+
+v_mfma_f32_16x16x1f32 v[0:15], v0, a1, -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_16x16x1f32 v[0:15], v0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc1,0xd3,0x00,0x03,0xd6,0xf3]
+
+v_mfma_f32_16x16x1f32 a[0:15], a0, v1, -2.0
+// GFX90A: v_mfma_f32_16x16x1f32 a[0:15], a0, v1, -2.0 ; encoding: [0x00,0x80,0xc1,0xd3,0x00,0x03,0xd6,0x0b]
+
+v_mfma_f32_16x16x1f32 v[0:15], a0, v1, -2.0
+// GFX90A: v_mfma_f32_16x16x1f32 v[0:15], a0, v1, -2.0 ; encoding: [0x00,0x00,0xc1,0xd3,0x00,0x03,0xd6,0x0b]
+
+v_mfma_f32_16x16x1f32 a[0:15], a0, v1, -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_16x16x1f32 a[0:15], a0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc1,0xd3,0x00,0x03,0xd6,0xeb]
+
+v_mfma_f32_16x16x1f32 v[0:15], a0, v1, -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_16x16x1f32 v[0:15], a0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc1,0xd3,0x00,0x03,0xd6,0xeb]
+
+v_mfma_f32_16x16x1f32 a[0:15], a0, a1, -2.0
+// GFX90A: v_mfma_f32_16x16x1f32 a[0:15], a0, a1, -2.0 ; encoding: [0x00,0x80,0xc1,0xd3,0x00,0x03,0xd6,0x1b]
+
+v_mfma_f32_16x16x1f32 v[0:15], a0, a1, -2.0
+// GFX90A: v_mfma_f32_16x16x1f32 v[0:15], a0, a1, -2.0 ; encoding: [0x00,0x00,0xc1,0xd3,0x00,0x03,0xd6,0x1b]
+
+v_mfma_f32_16x16x1f32 a[0:15], a0, a1, -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_16x16x1f32 a[0:15], a0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc1,0xd3,0x00,0x03,0xd6,0xfb]
+
+v_mfma_f32_16x16x1f32 v[0:15], a0, a1, -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_16x16x1f32 v[0:15], a0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc1,0xd3,0x00,0x03,0xd6,0xfb]
+
+v_mfma_f32_4x4x1f32 a[0:3], v0, v1, a[2:5]
+// GFX90A: v_mfma_f32_4x4x1f32 a[0:3], v0, v1, a[2:5] ; encoding: [0x00,0x80,0xc2,0xd3,0x00,0x03,0x0a,0x04]
+
+v_mfma_f32_4x4x1f32 a[0:3], v0, v1, a[2:5] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_4x4x1f32 a[0:3], v0, v1, a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc2,0xd3,0x00,0x03,0x0a,0xe4]
+
+v_mfma_f32_4x4x1f32 a[0:3], v0, a1, a[2:5]
+// GFX90A: v_mfma_f32_4x4x1f32 a[0:3], v0, a1, a[2:5] ; encoding: [0x00,0x80,0xc2,0xd3,0x00,0x03,0x0a,0x14]
+
+v_mfma_f32_4x4x1f32 a[0:3], v0, a1, a[2:5] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_4x4x1f32 a[0:3], v0, a1, a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc2,0xd3,0x00,0x03,0x0a,0xf4]
+
+v_mfma_f32_4x4x1f32 a[0:3], a0, v1, a[2:5]
+// GFX90A: v_mfma_f32_4x4x1f32 a[0:3], a0, v1, a[2:5] ; encoding: [0x00,0x80,0xc2,0xd3,0x00,0x03,0x0a,0x0c]
+
+v_mfma_f32_4x4x1f32 a[0:3], a0, v1, a[2:5] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_4x4x1f32 a[0:3], a0, v1, a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc2,0xd3,0x00,0x03,0x0a,0xec]
+
+v_mfma_f32_4x4x1f32 a[0:3], a0, a1, a[2:5]
+// GFX90A: v_mfma_f32_4x4x1f32 a[0:3], a0, a1, a[2:5] ; encoding: [0x00,0x80,0xc2,0xd3,0x00,0x03,0x0a,0x1c]
+
+v_mfma_f32_4x4x1f32 a[0:3], a0, a1, a[2:5] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_4x4x1f32 a[0:3], a0, a1, a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc2,0xd3,0x00,0x03,0x0a,0xfc]
+
+v_mfma_f32_4x4x1f32 v[0:3], v0, v1, v[2:5]
+// GFX90A: v_mfma_f32_4x4x1f32 v[0:3], v0, v1, v[2:5] ; encoding: [0x00,0x00,0xc2,0xd3,0x00,0x03,0x0a,0x04]
+
+v_mfma_f32_4x4x1f32 v[0:3], v0, v1, v[2:5] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_4x4x1f32 v[0:3], v0, v1, v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc2,0xd3,0x00,0x03,0x0a,0xe4]
+
+v_mfma_f32_4x4x1f32 v[0:3], v0, a1, v[2:5]
+// GFX90A: v_mfma_f32_4x4x1f32 v[0:3], v0, a1, v[2:5] ; encoding: [0x00,0x00,0xc2,0xd3,0x00,0x03,0x0a,0x14]
+
+v_mfma_f32_4x4x1f32 v[0:3], v0, a1, v[2:5] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_4x4x1f32 v[0:3], v0, a1, v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc2,0xd3,0x00,0x03,0x0a,0xf4]
+
+v_mfma_f32_4x4x1f32 v[0:3], a0, v1, v[2:5]
+// GFX90A: v_mfma_f32_4x4x1f32 v[0:3], a0, v1, v[2:5] ; encoding: [0x00,0x00,0xc2,0xd3,0x00,0x03,0x0a,0x0c]
+
+v_mfma_f32_4x4x1f32 v[0:3], a0, v1, v[2:5] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_4x4x1f32 v[0:3], a0, v1, v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc2,0xd3,0x00,0x03,0x0a,0xec]
+
+v_mfma_f32_4x4x1f32 v[0:3], a0, a1, v[2:5]
+// GFX90A: v_mfma_f32_4x4x1f32 v[0:3], a0, a1, v[2:5] ; encoding: [0x00,0x00,0xc2,0xd3,0x00,0x03,0x0a,0x1c]
+
+v_mfma_f32_4x4x1f32 v[0:3], a0, a1, v[2:5] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_4x4x1f32 v[0:3], a0, a1, v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc2,0xd3,0x00,0x03,0x0a,0xfc]
+
+v_mfma_f32_4x4x1f32 a[0:3], v0, v1, -2.0
+// GFX90A: v_mfma_f32_4x4x1f32 a[0:3], v0, v1, -2.0 ; encoding: [0x00,0x80,0xc2,0xd3,0x00,0x03,0xd6,0x03]
+
+v_mfma_f32_4x4x1f32 v[0:3], v0, v1, -2.0
+// GFX90A: v_mfma_f32_4x4x1f32 v[0:3], v0, v1, -2.0 ; encoding: [0x00,0x00,0xc2,0xd3,0x00,0x03,0xd6,0x03]
+
+v_mfma_f32_4x4x1f32 a[0:3], v0, v1, -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_4x4x1f32 a[0:3], v0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc2,0xd3,0x00,0x03,0xd6,0xe3]
+
+v_mfma_f32_4x4x1f32 v[0:3], v0, v1, -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_4x4x1f32 v[0:3], v0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc2,0xd3,0x00,0x03,0xd6,0xe3]
+
+v_mfma_f32_4x4x1f32 a[0:3], v0, a1, -2.0
+// GFX90A: v_mfma_f32_4x4x1f32 a[0:3], v0, a1, -2.0 ; encoding: [0x00,0x80,0xc2,0xd3,0x00,0x03,0xd6,0x13]
+
+v_mfma_f32_4x4x1f32 v[0:3], v0, a1, -2.0
+// GFX90A: v_mfma_f32_4x4x1f32 v[0:3], v0, a1, -2.0 ; encoding: [0x00,0x00,0xc2,0xd3,0x00,0x03,0xd6,0x13]
+
+v_mfma_f32_4x4x1f32 a[0:3], v0, a1, -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_4x4x1f32 a[0:3], v0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc2,0xd3,0x00,0x03,0xd6,0xf3]
+
+v_mfma_f32_4x4x1f32 v[0:3], v0, a1, -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_4x4x1f32 v[0:3], v0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc2,0xd3,0x00,0x03,0xd6,0xf3]
+
+v_mfma_f32_4x4x1f32 a[0:3], a0, v1, -2.0
+// GFX90A: v_mfma_f32_4x4x1f32 a[0:3], a0, v1, -2.0 ; encoding: [0x00,0x80,0xc2,0xd3,0x00,0x03,0xd6,0x0b]
+
+v_mfma_f32_4x4x1f32 v[0:3], a0, v1, -2.0
+// GFX90A: v_mfma_f32_4x4x1f32 v[0:3], a0, v1, -2.0 ; encoding: [0x00,0x00,0xc2,0xd3,0x00,0x03,0xd6,0x0b]
+
+v_mfma_f32_4x4x1f32 a[0:3], a0, v1, -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_4x4x1f32 a[0:3], a0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc2,0xd3,0x00,0x03,0xd6,0xeb]
+
+v_mfma_f32_4x4x1f32 v[0:3], a0, v1, -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_4x4x1f32 v[0:3], a0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc2,0xd3,0x00,0x03,0xd6,0xeb]
+
+v_mfma_f32_4x4x1f32 a[0:3], a0, a1, -2.0
+// GFX90A: v_mfma_f32_4x4x1f32 a[0:3], a0, a1, -2.0 ; encoding: [0x00,0x80,0xc2,0xd3,0x00,0x03,0xd6,0x1b]
+
+v_mfma_f32_4x4x1f32 v[0:3], a0, a1, -2.0
+// GFX90A: v_mfma_f32_4x4x1f32 v[0:3], a0, a1, -2.0 ; encoding: [0x00,0x00,0xc2,0xd3,0x00,0x03,0xd6,0x1b]
+
+v_mfma_f32_4x4x1f32 a[0:3], a0, a1, -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_4x4x1f32 a[0:3], a0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc2,0xd3,0x00,0x03,0xd6,0xfb]
+
+v_mfma_f32_4x4x1f32 v[0:3], a0, a1, -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_4x4x1f32 v[0:3], a0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc2,0xd3,0x00,0x03,0xd6,0xfb]
+
+v_mfma_f32_32x32x2f32 a[0:15], v0, v1, a[2:17]
+// GFX90A: v_mfma_f32_32x32x2f32 a[0:15], v0, v1, a[2:17] ; encoding: [0x00,0x80,0xc4,0xd3,0x00,0x03,0x0a,0x04]
+
+v_mfma_f32_32x32x2f32 a[0:15], v0, v1, a[2:17] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_32x32x2f32 a[0:15], v0, v1, a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc4,0xd3,0x00,0x03,0x0a,0xe4]
+
+v_mfma_f32_32x32x2f32 a[0:15], v0, a1, a[2:17]
+// GFX90A: v_mfma_f32_32x32x2f32 a[0:15], v0, a1, a[2:17] ; encoding: [0x00,0x80,0xc4,0xd3,0x00,0x03,0x0a,0x14]
+
+v_mfma_f32_32x32x2f32 a[0:15], v0, a1, a[2:17] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_32x32x2f32 a[0:15], v0, a1, a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc4,0xd3,0x00,0x03,0x0a,0xf4]
+
+v_mfma_f32_32x32x2f32 a[0:15], a0, v1, a[2:17]
+// GFX90A: v_mfma_f32_32x32x2f32 a[0:15], a0, v1, a[2:17] ; encoding: [0x00,0x80,0xc4,0xd3,0x00,0x03,0x0a,0x0c]
+
+v_mfma_f32_32x32x2f32 a[0:15], a0, v1, a[2:17] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_32x32x2f32 a[0:15], a0, v1, a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc4,0xd3,0x00,0x03,0x0a,0xec]
+
+v_mfma_f32_32x32x2f32 a[0:15], a0, a1, a[2:17]
+// GFX90A: v_mfma_f32_32x32x2f32 a[0:15], a0, a1, a[2:17] ; encoding: [0x00,0x80,0xc4,0xd3,0x00,0x03,0x0a,0x1c]
+
+v_mfma_f32_32x32x2f32 a[0:15], a0, a1, a[2:17] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_32x32x2f32 a[0:15], a0, a1, a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc4,0xd3,0x00,0x03,0x0a,0xfc]
+
+v_mfma_f32_32x32x2f32 v[0:15], v0, v1, v[2:17]
+// GFX90A: v_mfma_f32_32x32x2f32 v[0:15], v0, v1, v[2:17] ; encoding: [0x00,0x00,0xc4,0xd3,0x00,0x03,0x0a,0x04]
+
+v_mfma_f32_32x32x2f32 v[0:15], v0, v1, v[2:17] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_32x32x2f32 v[0:15], v0, v1, v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc4,0xd3,0x00,0x03,0x0a,0xe4]
+
+v_mfma_f32_32x32x2f32 v[0:15], v0, a1, v[2:17]
+// GFX90A: v_mfma_f32_32x32x2f32 v[0:15], v0, a1, v[2:17] ; encoding: [0x00,0x00,0xc4,0xd3,0x00,0x03,0x0a,0x14]
+
+v_mfma_f32_32x32x2f32 v[0:15], v0, a1, v[2:17] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_32x32x2f32 v[0:15], v0, a1, v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc4,0xd3,0x00,0x03,0x0a,0xf4]
+
+v_mfma_f32_32x32x2f32 v[0:15], a0, v1, v[2:17]
+// GFX90A: v_mfma_f32_32x32x2f32 v[0:15], a0, v1, v[2:17] ; encoding: [0x00,0x00,0xc4,0xd3,0x00,0x03,0x0a,0x0c]
+
+v_mfma_f32_32x32x2f32 v[0:15], a0, v1, v[2:17] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_32x32x2f32 v[0:15], a0, v1, v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc4,0xd3,0x00,0x03,0x0a,0xec]
+
+v_mfma_f32_32x32x2f32 v[0:15], a0, a1, v[2:17]
+// GFX90A: v_mfma_f32_32x32x2f32 v[0:15], a0, a1, v[2:17] ; encoding: [0x00,0x00,0xc4,0xd3,0x00,0x03,0x0a,0x1c]
+
+v_mfma_f32_32x32x2f32 v[0:15], a0, a1, v[2:17] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_32x32x2f32 v[0:15], a0, a1, v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc4,0xd3,0x00,0x03,0x0a,0xfc]
+
+v_mfma_f32_32x32x2f32 a[0:15], v0, v1, -2.0
+// GFX90A: v_mfma_f32_32x32x2f32 a[0:15], v0, v1, -2.0 ; encoding: [0x00,0x80,0xc4,0xd3,0x00,0x03,0xd6,0x03]
+
+v_mfma_f32_32x32x2f32 v[0:15], v0, v1, -2.0
+// GFX90A: v_mfma_f32_32x32x2f32 v[0:15], v0, v1, -2.0 ; encoding: [0x00,0x00,0xc4,0xd3,0x00,0x03,0xd6,0x03]
+
+v_mfma_f32_32x32x2f32 a[0:15], v0, v1, -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_32x32x2f32 a[0:15], v0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc4,0xd3,0x00,0x03,0xd6,0xe3]
+
+v_mfma_f32_32x32x2f32 v[0:15], v0, v1, -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_32x32x2f32 v[0:15], v0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc4,0xd3,0x00,0x03,0xd6,0xe3]
+
+v_mfma_f32_32x32x2f32 a[0:15], v0, a1, -2.0
+// GFX90A: v_mfma_f32_32x32x2f32 a[0:15], v0, a1, -2.0 ; encoding: [0x00,0x80,0xc4,0xd3,0x00,0x03,0xd6,0x13]
+
+v_mfma_f32_32x32x2f32 v[0:15], v0, a1, -2.0
+// GFX90A: v_mfma_f32_32x32x2f32 v[0:15], v0, a1, -2.0 ; encoding: [0x00,0x00,0xc4,0xd3,0x00,0x03,0xd6,0x13]
+
+v_mfma_f32_32x32x2f32 a[0:15], v0, a1, -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_32x32x2f32 a[0:15], v0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc4,0xd3,0x00,0x03,0xd6,0xf3]
+
+v_mfma_f32_32x32x2f32 v[0:15], v0, a1, -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_32x32x2f32 v[0:15], v0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc4,0xd3,0x00,0x03,0xd6,0xf3]
+
+v_mfma_f32_32x32x2f32 a[0:15], a0, v1, -2.0
+// GFX90A: v_mfma_f32_32x32x2f32 a[0:15], a0, v1, -2.0 ; encoding: [0x00,0x80,0xc4,0xd3,0x00,0x03,0xd6,0x0b]
+
+v_mfma_f32_32x32x2f32 v[0:15], a0, v1, -2.0
+// GFX90A: v_mfma_f32_32x32x2f32 v[0:15], a0, v1, -2.0 ; encoding: [0x00,0x00,0xc4,0xd3,0x00,0x03,0xd6,0x0b]
+
+v_mfma_f32_32x32x2f32 a[0:15], a0, v1, -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_32x32x2f32 a[0:15], a0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc4,0xd3,0x00,0x03,0xd6,0xeb]
+
+v_mfma_f32_32x32x2f32 v[0:15], a0, v1, -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_32x32x2f32 v[0:15], a0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc4,0xd3,0x00,0x03,0xd6,0xeb]
+
+v_mfma_f32_32x32x2f32 a[0:15], a0, a1, -2.0
+// GFX90A: v_mfma_f32_32x32x2f32 a[0:15], a0, a1, -2.0 ; encoding: [0x00,0x80,0xc4,0xd3,0x00,0x03,0xd6,0x1b]
+
+v_mfma_f32_32x32x2f32 v[0:15], a0, a1, -2.0
+// GFX90A: v_mfma_f32_32x32x2f32 v[0:15], a0, a1, -2.0 ; encoding: [0x00,0x00,0xc4,0xd3,0x00,0x03,0xd6,0x1b]
+
+v_mfma_f32_32x32x2f32 a[0:15], a0, a1, -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_32x32x2f32 a[0:15], a0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc4,0xd3,0x00,0x03,0xd6,0xfb]
+
+v_mfma_f32_32x32x2f32 v[0:15], a0, a1, -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_32x32x2f32 v[0:15], a0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc4,0xd3,0x00,0x03,0xd6,0xfb]
+
+v_mfma_f32_16x16x4f32 a[0:3], v0, v1, a[2:5]
+// GFX90A: v_mfma_f32_16x16x4f32 a[0:3], v0, v1, a[2:5] ; encoding: [0x00,0x80,0xc5,0xd3,0x00,0x03,0x0a,0x04]
+
+v_mfma_f32_16x16x4f32 a[0:3], v0, v1, a[2:5] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_16x16x4f32 a[0:3], v0, v1, a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc5,0xd3,0x00,0x03,0x0a,0xe4]
+
+v_mfma_f32_16x16x4f32 a[0:3], v0, a1, a[2:5]
+// GFX90A: v_mfma_f32_16x16x4f32 a[0:3], v0, a1, a[2:5] ; encoding: [0x00,0x80,0xc5,0xd3,0x00,0x03,0x0a,0x14]
+
+v_mfma_f32_16x16x4f32 a[0:3], v0, a1, a[2:5] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_16x16x4f32 a[0:3], v0, a1, a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc5,0xd3,0x00,0x03,0x0a,0xf4]
+
+v_mfma_f32_16x16x4f32 a[0:3], a0, v1, a[2:5]
+// GFX90A: v_mfma_f32_16x16x4f32 a[0:3], a0, v1, a[2:5] ; encoding: [0x00,0x80,0xc5,0xd3,0x00,0x03,0x0a,0x0c]
+
+v_mfma_f32_16x16x4f32 a[0:3], a0, v1, a[2:5] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_16x16x4f32 a[0:3], a0, v1, a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc5,0xd3,0x00,0x03,0x0a,0xec]
+
+v_mfma_f32_16x16x4f32 a[0:3], a0, a1, a[2:5]
+// GFX90A: v_mfma_f32_16x16x4f32 a[0:3], a0, a1, a[2:5] ; encoding: [0x00,0x80,0xc5,0xd3,0x00,0x03,0x0a,0x1c]
+
+v_mfma_f32_16x16x4f32 a[0:3], a0, a1, a[2:5] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_16x16x4f32 a[0:3], a0, a1, a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc5,0xd3,0x00,0x03,0x0a,0xfc]
+
+v_mfma_f32_16x16x4f32 v[0:3], v0, v1, v[2:5]
+// GFX90A: v_mfma_f32_16x16x4f32 v[0:3], v0, v1, v[2:5] ; encoding: [0x00,0x00,0xc5,0xd3,0x00,0x03,0x0a,0x04]
+
+v_mfma_f32_16x16x4f32 v[0:3], v0, v1, v[2:5] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_16x16x4f32 v[0:3], v0, v1, v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc5,0xd3,0x00,0x03,0x0a,0xe4]
+
+v_mfma_f32_16x16x4f32 v[0:3], v0, a1, v[2:5]
+// GFX90A: v_mfma_f32_16x16x4f32 v[0:3], v0, a1, v[2:5] ; encoding: [0x00,0x00,0xc5,0xd3,0x00,0x03,0x0a,0x14]
+
+v_mfma_f32_16x16x4f32 v[0:3], v0, a1, v[2:5] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_16x16x4f32 v[0:3], v0, a1, v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc5,0xd3,0x00,0x03,0x0a,0xf4]
+
+v_mfma_f32_16x16x4f32 v[0:3], a0, v1, v[2:5]
+// GFX90A: v_mfma_f32_16x16x4f32 v[0:3], a0, v1, v[2:5] ; encoding: [0x00,0x00,0xc5,0xd3,0x00,0x03,0x0a,0x0c]
+
+v_mfma_f32_16x16x4f32 v[0:3], a0, v1, v[2:5] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_16x16x4f32 v[0:3], a0, v1, v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc5,0xd3,0x00,0x03,0x0a,0xec]
+
+v_mfma_f32_16x16x4f32 v[0:3], a0, a1, v[2:5]
+// GFX90A: v_mfma_f32_16x16x4f32 v[0:3], a0, a1, v[2:5] ; encoding: [0x00,0x00,0xc5,0xd3,0x00,0x03,0x0a,0x1c]
+
+v_mfma_f32_16x16x4f32 v[0:3], a0, a1, v[2:5] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_16x16x4f32 v[0:3], a0, a1, v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc5,0xd3,0x00,0x03,0x0a,0xfc]
+
+v_mfma_f32_16x16x4f32 a[0:3], v0, v1, -2.0
+// GFX90A: v_mfma_f32_16x16x4f32 a[0:3], v0, v1, -2.0 ; encoding: [0x00,0x80,0xc5,0xd3,0x00,0x03,0xd6,0x03]
+
+v_mfma_f32_16x16x4f32 v[0:3], v0, v1, -2.0
+// GFX90A: v_mfma_f32_16x16x4f32 v[0:3], v0, v1, -2.0 ; encoding: [0x00,0x00,0xc5,0xd3,0x00,0x03,0xd6,0x03]
+
+v_mfma_f32_16x16x4f32 a[0:3], v0, v1, -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_16x16x4f32 a[0:3], v0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc5,0xd3,0x00,0x03,0xd6,0xe3]
+
+v_mfma_f32_16x16x4f32 v[0:3], v0, v1, -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_16x16x4f32 v[0:3], v0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc5,0xd3,0x00,0x03,0xd6,0xe3]
+
+v_mfma_f32_16x16x4f32 a[0:3], v0, a1, -2.0
+// GFX90A: v_mfma_f32_16x16x4f32 a[0:3], v0, a1, -2.0 ; encoding: [0x00,0x80,0xc5,0xd3,0x00,0x03,0xd6,0x13]
+
+v_mfma_f32_16x16x4f32 v[0:3], v0, a1, -2.0
+// GFX90A: v_mfma_f32_16x16x4f32 v[0:3], v0, a1, -2.0 ; encoding: [0x00,0x00,0xc5,0xd3,0x00,0x03,0xd6,0x13]
+
+v_mfma_f32_16x16x4f32 a[0:3], v0, a1, -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_16x16x4f32 a[0:3], v0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc5,0xd3,0x00,0x03,0xd6,0xf3]
+
+v_mfma_f32_16x16x4f32 v[0:3], v0, a1, -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_16x16x4f32 v[0:3], v0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc5,0xd3,0x00,0x03,0xd6,0xf3]
+
+v_mfma_f32_16x16x4f32 a[0:3], a0, v1, -2.0
+// GFX90A: v_mfma_f32_16x16x4f32 a[0:3], a0, v1, -2.0 ; encoding: [0x00,0x80,0xc5,0xd3,0x00,0x03,0xd6,0x0b]
+
+v_mfma_f32_16x16x4f32 v[0:3], a0, v1, -2.0
+// GFX90A: v_mfma_f32_16x16x4f32 v[0:3], a0, v1, -2.0 ; encoding: [0x00,0x00,0xc5,0xd3,0x00,0x03,0xd6,0x0b]
+
+v_mfma_f32_16x16x4f32 a[0:3], a0, v1, -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_16x16x4f32 a[0:3], a0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc5,0xd3,0x00,0x03,0xd6,0xeb]
+
+v_mfma_f32_16x16x4f32 v[0:3], a0, v1, -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_16x16x4f32 v[0:3], a0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc5,0xd3,0x00,0x03,0xd6,0xeb]
+
+v_mfma_f32_16x16x4f32 a[0:3], a0, a1, -2.0
+// GFX90A: v_mfma_f32_16x16x4f32 a[0:3], a0, a1, -2.0 ; encoding: [0x00,0x80,0xc5,0xd3,0x00,0x03,0xd6,0x1b]
+
+v_mfma_f32_16x16x4f32 v[0:3], a0, a1, -2.0
+// GFX90A: v_mfma_f32_16x16x4f32 v[0:3], a0, a1, -2.0 ; encoding: [0x00,0x00,0xc5,0xd3,0x00,0x03,0xd6,0x1b]
+
+v_mfma_f32_16x16x4f32 a[0:3], a0, a1, -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_16x16x4f32 a[0:3], a0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc5,0xd3,0x00,0x03,0xd6,0xfb]
+
+v_mfma_f32_16x16x4f32 v[0:3], a0, a1, -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_16x16x4f32 v[0:3], a0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc5,0xd3,0x00,0x03,0xd6,0xfb]
+
+v_mfma_f32_32x32x4f16 a[0:31], v[0:1], v[2:3], a[2:33]
+// GFX90A: v_mfma_f32_32x32x4f16 a[0:31], v[0:1], v[2:3], a[2:33] ; encoding: [0x00,0x80,0xc8,0xd3,0x00,0x05,0x0a,0x04]
+
+v_mfma_f32_32x32x4f16 a[0:31], v[0:1], v[2:3], a[2:33] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_32x32x4f16 a[0:31], v[0:1], v[2:3], a[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc8,0xd3,0x00,0x05,0x0a,0xe4]
+
+v_mfma_f32_32x32x4f16 a[0:31], v[0:1], a[2:3], a[2:33]
+// GFX90A: v_mfma_f32_32x32x4f16 a[0:31], v[0:1], a[2:3], a[2:33] ; encoding: [0x00,0x80,0xc8,0xd3,0x00,0x05,0x0a,0x14]
+
+v_mfma_f32_32x32x4f16 a[0:31], v[0:1], a[2:3], a[2:33] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_32x32x4f16 a[0:31], v[0:1], a[2:3], a[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc8,0xd3,0x00,0x05,0x0a,0xf4]
+
+v_mfma_f32_32x32x4f16 a[0:31], a[0:1], v[2:3], a[2:33]
+// GFX90A: v_mfma_f32_32x32x4f16 a[0:31], a[0:1], v[2:3], a[2:33] ; encoding: [0x00,0x80,0xc8,0xd3,0x00,0x05,0x0a,0x0c]
+
+v_mfma_f32_32x32x4f16 a[0:31], a[0:1], v[2:3], a[2:33] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_32x32x4f16 a[0:31], a[0:1], v[2:3], a[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc8,0xd3,0x00,0x05,0x0a,0xec]
+
+v_mfma_f32_32x32x4f16 a[0:31], a[0:1], a[2:3], a[2:33]
+// GFX90A: v_mfma_f32_32x32x4f16 a[0:31], a[0:1], a[2:3], a[2:33] ; encoding: [0x00,0x80,0xc8,0xd3,0x00,0x05,0x0a,0x1c]
+
+v_mfma_f32_32x32x4f16 a[0:31], a[0:1], a[2:3], a[2:33] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_32x32x4f16 a[0:31], a[0:1], a[2:3], a[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc8,0xd3,0x00,0x05,0x0a,0xfc]
+
+v_mfma_f32_32x32x4f16 v[0:31], v[0:1], v[2:3], v[2:33]
+// GFX90A: v_mfma_f32_32x32x4f16 v[0:31], v[0:1], v[2:3], v[2:33] ; encoding: [0x00,0x00,0xc8,0xd3,0x00,0x05,0x0a,0x04]
+
+v_mfma_f32_32x32x4f16 v[0:31], v[0:1], v[2:3], v[2:33] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_32x32x4f16 v[0:31], v[0:1], v[2:3], v[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc8,0xd3,0x00,0x05,0x0a,0xe4]
+
+v_mfma_f32_32x32x4f16 v[0:31], v[0:1], a[2:3], v[2:33]
+// GFX90A: v_mfma_f32_32x32x4f16 v[0:31], v[0:1], a[2:3], v[2:33] ; encoding: [0x00,0x00,0xc8,0xd3,0x00,0x05,0x0a,0x14]
+
+v_mfma_f32_32x32x4f16 v[0:31], v[0:1], a[2:3], v[2:33] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_32x32x4f16 v[0:31], v[0:1], a[2:3], v[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc8,0xd3,0x00,0x05,0x0a,0xf4]
+
+v_mfma_f32_32x32x4f16 v[0:31], a[0:1], v[2:3], v[2:33]
+// GFX90A: v_mfma_f32_32x32x4f16 v[0:31], a[0:1], v[2:3], v[2:33] ; encoding: [0x00,0x00,0xc8,0xd3,0x00,0x05,0x0a,0x0c]
+
+v_mfma_f32_32x32x4f16 v[0:31], a[0:1], v[2:3], v[2:33] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_32x32x4f16 v[0:31], a[0:1], v[2:3], v[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc8,0xd3,0x00,0x05,0x0a,0xec]
+
+v_mfma_f32_32x32x4f16 v[0:31], a[0:1], a[2:3], v[2:33]
+// GFX90A: v_mfma_f32_32x32x4f16 v[0:31], a[0:1], a[2:3], v[2:33] ; encoding: [0x00,0x00,0xc8,0xd3,0x00,0x05,0x0a,0x1c]
+
+v_mfma_f32_32x32x4f16 v[0:31], a[0:1], a[2:3], v[2:33] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_32x32x4f16 v[0:31], a[0:1], a[2:3], v[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc8,0xd3,0x00,0x05,0x0a,0xfc]
+
+v_mfma_f32_32x32x4f16 a[0:31], v[0:1], v[2:3], -2.0
+// GFX90A: v_mfma_f32_32x32x4f16 a[0:31], v[0:1], v[2:3], -2.0 ; encoding: [0x00,0x80,0xc8,0xd3,0x00,0x05,0xd6,0x03]
+
+v_mfma_f32_32x32x4f16 v[0:31], v[0:1], v[2:3], -2.0
+// GFX90A: v_mfma_f32_32x32x4f16 v[0:31], v[0:1], v[2:3], -2.0 ; encoding: [0x00,0x00,0xc8,0xd3,0x00,0x05,0xd6,0x03]
+
+v_mfma_f32_32x32x4f16 a[0:31], v[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_32x32x4f16 a[0:31], v[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc8,0xd3,0x00,0x05,0xd6,0xe3]
+
+v_mfma_f32_32x32x4f16 v[0:31], v[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_32x32x4f16 v[0:31], v[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc8,0xd3,0x00,0x05,0xd6,0xe3]
+
+v_mfma_f32_32x32x4f16 a[0:31], v[0:1], a[2:3], -2.0
+// GFX90A: v_mfma_f32_32x32x4f16 a[0:31], v[0:1], a[2:3], -2.0 ; encoding: [0x00,0x80,0xc8,0xd3,0x00,0x05,0xd6,0x13]
+
+v_mfma_f32_32x32x4f16 v[0:31], v[0:1], a[2:3], -2.0
+// GFX90A: v_mfma_f32_32x32x4f16 v[0:31], v[0:1], a[2:3], -2.0 ; encoding: [0x00,0x00,0xc8,0xd3,0x00,0x05,0xd6,0x13]
+
+v_mfma_f32_32x32x4f16 a[0:31], v[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_32x32x4f16 a[0:31], v[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc8,0xd3,0x00,0x05,0xd6,0xf3]
+
+v_mfma_f32_32x32x4f16 v[0:31], v[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_32x32x4f16 v[0:31], v[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc8,0xd3,0x00,0x05,0xd6,0xf3]
+
+v_mfma_f32_32x32x4f16 a[0:31], a[0:1], v[2:3], -2.0
+// GFX90A: v_mfma_f32_32x32x4f16 a[0:31], a[0:1], v[2:3], -2.0 ; encoding: [0x00,0x80,0xc8,0xd3,0x00,0x05,0xd6,0x0b]
+
+v_mfma_f32_32x32x4f16 v[0:31], a[0:1], v[2:3], -2.0
+// GFX90A: v_mfma_f32_32x32x4f16 v[0:31], a[0:1], v[2:3], -2.0 ; encoding: [0x00,0x00,0xc8,0xd3,0x00,0x05,0xd6,0x0b]
+
+v_mfma_f32_32x32x4f16 a[0:31], a[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_32x32x4f16 a[0:31], a[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc8,0xd3,0x00,0x05,0xd6,0xeb]
+
+v_mfma_f32_32x32x4f16 v[0:31], a[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_32x32x4f16 v[0:31], a[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc8,0xd3,0x00,0x05,0xd6,0xeb]
+
+v_mfma_f32_32x32x4f16 a[0:31], a[0:1], a[2:3], -2.0
+// GFX90A: v_mfma_f32_32x32x4f16 a[0:31], a[0:1], a[2:3], -2.0 ; encoding: [0x00,0x80,0xc8,0xd3,0x00,0x05,0xd6,0x1b]
+
+v_mfma_f32_32x32x4f16 v[0:31], a[0:1], a[2:3], -2.0
+// GFX90A: v_mfma_f32_32x32x4f16 v[0:31], a[0:1], a[2:3], -2.0 ; encoding: [0x00,0x00,0xc8,0xd3,0x00,0x05,0xd6,0x1b]
+
+v_mfma_f32_32x32x4f16 a[0:31], a[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_32x32x4f16 a[0:31], a[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc8,0xd3,0x00,0x05,0xd6,0xfb]
+
+v_mfma_f32_32x32x4f16 v[0:31], a[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_32x32x4f16 v[0:31], a[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc8,0xd3,0x00,0x05,0xd6,0xfb]
+
+v_mfma_f32_16x16x4f16 a[0:15], v[0:1], v[2:3], a[2:17]
+// GFX90A: v_mfma_f32_16x16x4f16 a[0:15], v[0:1], v[2:3], a[2:17] ; encoding: [0x00,0x80,0xc9,0xd3,0x00,0x05,0x0a,0x04]
+
+v_mfma_f32_16x16x4f16 a[0:15], v[0:1], v[2:3], a[2:17] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_16x16x4f16 a[0:15], v[0:1], v[2:3], a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc9,0xd3,0x00,0x05,0x0a,0xe4]
+
+v_mfma_f32_16x16x4f16 a[0:15], v[0:1], a[2:3], a[2:17]
+// GFX90A: v_mfma_f32_16x16x4f16 a[0:15], v[0:1], a[2:3], a[2:17] ; encoding: [0x00,0x80,0xc9,0xd3,0x00,0x05,0x0a,0x14]
+
+v_mfma_f32_16x16x4f16 a[0:15], v[0:1], a[2:3], a[2:17] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_16x16x4f16 a[0:15], v[0:1], a[2:3], a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc9,0xd3,0x00,0x05,0x0a,0xf4]
+
+v_mfma_f32_16x16x4f16 a[0:15], a[0:1], v[2:3], a[2:17]
+// GFX90A: v_mfma_f32_16x16x4f16 a[0:15], a[0:1], v[2:3], a[2:17] ; encoding: [0x00,0x80,0xc9,0xd3,0x00,0x05,0x0a,0x0c]
+
+v_mfma_f32_16x16x4f16 a[0:15], a[0:1], v[2:3], a[2:17] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_16x16x4f16 a[0:15], a[0:1], v[2:3], a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc9,0xd3,0x00,0x05,0x0a,0xec]
+
+v_mfma_f32_16x16x4f16 a[0:15], a[0:1], a[2:3], a[2:17]
+// GFX90A: v_mfma_f32_16x16x4f16 a[0:15], a[0:1], a[2:3], a[2:17] ; encoding: [0x00,0x80,0xc9,0xd3,0x00,0x05,0x0a,0x1c]
+
+v_mfma_f32_16x16x4f16 a[0:15], a[0:1], a[2:3], a[2:17] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_16x16x4f16 a[0:15], a[0:1], a[2:3], a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc9,0xd3,0x00,0x05,0x0a,0xfc]
+
+v_mfma_f32_16x16x4f16 v[0:15], v[0:1], v[2:3], v[2:17]
+// GFX90A: v_mfma_f32_16x16x4f16 v[0:15], v[0:1], v[2:3], v[2:17] ; encoding: [0x00,0x00,0xc9,0xd3,0x00,0x05,0x0a,0x04]
+
+v_mfma_f32_16x16x4f16 v[0:15], v[0:1], v[2:3], v[2:17] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_16x16x4f16 v[0:15], v[0:1], v[2:3], v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc9,0xd3,0x00,0x05,0x0a,0xe4]
+
+v_mfma_f32_16x16x4f16 v[0:15], v[0:1], a[2:3], v[2:17]
+// GFX90A: v_mfma_f32_16x16x4f16 v[0:15], v[0:1], a[2:3], v[2:17] ; encoding: [0x00,0x00,0xc9,0xd3,0x00,0x05,0x0a,0x14]
+
+v_mfma_f32_16x16x4f16 v[0:15], v[0:1], a[2:3], v[2:17] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_16x16x4f16 v[0:15], v[0:1], a[2:3], v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc9,0xd3,0x00,0x05,0x0a,0xf4]
+
+v_mfma_f32_16x16x4f16 v[0:15], a[0:1], v[2:3], v[2:17]
+// GFX90A: v_mfma_f32_16x16x4f16 v[0:15], a[0:1], v[2:3], v[2:17] ; encoding: [0x00,0x00,0xc9,0xd3,0x00,0x05,0x0a,0x0c]
+
+v_mfma_f32_16x16x4f16 v[0:15], a[0:1], v[2:3], v[2:17] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_16x16x4f16 v[0:15], a[0:1], v[2:3], v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc9,0xd3,0x00,0x05,0x0a,0xec]
+
+v_mfma_f32_16x16x4f16 v[0:15], a[0:1], a[2:3], v[2:17]
+// GFX90A: v_mfma_f32_16x16x4f16 v[0:15], a[0:1], a[2:3], v[2:17] ; encoding: [0x00,0x00,0xc9,0xd3,0x00,0x05,0x0a,0x1c]
+
+v_mfma_f32_16x16x4f16 v[0:15], a[0:1], a[2:3], v[2:17] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_16x16x4f16 v[0:15], a[0:1], a[2:3], v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc9,0xd3,0x00,0x05,0x0a,0xfc]
+
+v_mfma_f32_16x16x4f16 a[0:15], v[0:1], v[2:3], -2.0
+// GFX90A: v_mfma_f32_16x16x4f16 a[0:15], v[0:1], v[2:3], -2.0 ; encoding: [0x00,0x80,0xc9,0xd3,0x00,0x05,0xd6,0x03]
+
+v_mfma_f32_16x16x4f16 v[0:15], v[0:1], v[2:3], -2.0
+// GFX90A: v_mfma_f32_16x16x4f16 v[0:15], v[0:1], v[2:3], -2.0 ; encoding: [0x00,0x00,0xc9,0xd3,0x00,0x05,0xd6,0x03]
+
+v_mfma_f32_16x16x4f16 a[0:15], v[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_16x16x4f16 a[0:15], v[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc9,0xd3,0x00,0x05,0xd6,0xe3]
+
+v_mfma_f32_16x16x4f16 v[0:15], v[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_16x16x4f16 v[0:15], v[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc9,0xd3,0x00,0x05,0xd6,0xe3]
+
+v_mfma_f32_16x16x4f16 a[0:15], v[0:1], a[2:3], -2.0
+// GFX90A: v_mfma_f32_16x16x4f16 a[0:15], v[0:1], a[2:3], -2.0 ; encoding: [0x00,0x80,0xc9,0xd3,0x00,0x05,0xd6,0x13]
+
+v_mfma_f32_16x16x4f16 v[0:15], v[0:1], a[2:3], -2.0
+// GFX90A: v_mfma_f32_16x16x4f16 v[0:15], v[0:1], a[2:3], -2.0 ; encoding: [0x00,0x00,0xc9,0xd3,0x00,0x05,0xd6,0x13]
+
+v_mfma_f32_16x16x4f16 a[0:15], v[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_16x16x4f16 a[0:15], v[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc9,0xd3,0x00,0x05,0xd6,0xf3]
+
+v_mfma_f32_16x16x4f16 v[0:15], v[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_16x16x4f16 v[0:15], v[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc9,0xd3,0x00,0x05,0xd6,0xf3]
+
+v_mfma_f32_16x16x4f16 a[0:15], a[0:1], v[2:3], -2.0
+// GFX90A: v_mfma_f32_16x16x4f16 a[0:15], a[0:1], v[2:3], -2.0 ; encoding: [0x00,0x80,0xc9,0xd3,0x00,0x05,0xd6,0x0b]
+
+v_mfma_f32_16x16x4f16 v[0:15], a[0:1], v[2:3], -2.0
+// GFX90A: v_mfma_f32_16x16x4f16 v[0:15], a[0:1], v[2:3], -2.0 ; encoding: [0x00,0x00,0xc9,0xd3,0x00,0x05,0xd6,0x0b]
+
+v_mfma_f32_16x16x4f16 a[0:15], a[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_16x16x4f16 a[0:15], a[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc9,0xd3,0x00,0x05,0xd6,0xeb]
+
+v_mfma_f32_16x16x4f16 v[0:15], a[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_16x16x4f16 v[0:15], a[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc9,0xd3,0x00,0x05,0xd6,0xeb]
+
+v_mfma_f32_16x16x4f16 a[0:15], a[0:1], a[2:3], -2.0
+// GFX90A: v_mfma_f32_16x16x4f16 a[0:15], a[0:1], a[2:3], -2.0 ; encoding: [0x00,0x80,0xc9,0xd3,0x00,0x05,0xd6,0x1b]
+
+v_mfma_f32_16x16x4f16 v[0:15], a[0:1], a[2:3], -2.0
+// GFX90A: v_mfma_f32_16x16x4f16 v[0:15], a[0:1], a[2:3], -2.0 ; encoding: [0x00,0x00,0xc9,0xd3,0x00,0x05,0xd6,0x1b]
+
+v_mfma_f32_16x16x4f16 a[0:15], a[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_16x16x4f16 a[0:15], a[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc9,0xd3,0x00,0x05,0xd6,0xfb]
+
+v_mfma_f32_16x16x4f16 v[0:15], a[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_16x16x4f16 v[0:15], a[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc9,0xd3,0x00,0x05,0xd6,0xfb]
+
+v_mfma_f32_4x4x4f16 a[0:3], v[0:1], v[2:3], a[2:5]
+// GFX90A: v_mfma_f32_4x4x4f16 a[0:3], v[0:1], v[2:3], a[2:5] ; encoding: [0x00,0x80,0xca,0xd3,0x00,0x05,0x0a,0x04]
+
+v_mfma_f32_4x4x4f16 a[0:3], v[0:1], v[2:3], a[2:5] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_4x4x4f16 a[0:3], v[0:1], v[2:3], a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xca,0xd3,0x00,0x05,0x0a,0xe4]
+
+v_mfma_f32_4x4x4f16 a[0:3], v[0:1], a[2:3], a[2:5]
+// GFX90A: v_mfma_f32_4x4x4f16 a[0:3], v[0:1], a[2:3], a[2:5] ; encoding: [0x00,0x80,0xca,0xd3,0x00,0x05,0x0a,0x14]
+
+v_mfma_f32_4x4x4f16 a[0:3], v[0:1], a[2:3], a[2:5] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_4x4x4f16 a[0:3], v[0:1], a[2:3], a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xca,0xd3,0x00,0x05,0x0a,0xf4]
+
+v_mfma_f32_4x4x4f16 a[0:3], a[0:1], v[2:3], a[2:5]
+// GFX90A: v_mfma_f32_4x4x4f16 a[0:3], a[0:1], v[2:3], a[2:5] ; encoding: [0x00,0x80,0xca,0xd3,0x00,0x05,0x0a,0x0c]
+
+v_mfma_f32_4x4x4f16 a[0:3], a[0:1], v[2:3], a[2:5] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_4x4x4f16 a[0:3], a[0:1], v[2:3], a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xca,0xd3,0x00,0x05,0x0a,0xec]
+
+v_mfma_f32_4x4x4f16 a[0:3], a[0:1], a[2:3], a[2:5]
+// GFX90A: v_mfma_f32_4x4x4f16 a[0:3], a[0:1], a[2:3], a[2:5] ; encoding: [0x00,0x80,0xca,0xd3,0x00,0x05,0x0a,0x1c]
+
+v_mfma_f32_4x4x4f16 a[0:3], a[0:1], a[2:3], a[2:5] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_4x4x4f16 a[0:3], a[0:1], a[2:3], a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xca,0xd3,0x00,0x05,0x0a,0xfc]
+
+v_mfma_f32_4x4x4f16 v[0:3], v[0:1], v[2:3], v[2:5]
+// GFX90A: v_mfma_f32_4x4x4f16 v[0:3], v[0:1], v[2:3], v[2:5] ; encoding: [0x00,0x00,0xca,0xd3,0x00,0x05,0x0a,0x04]
+
+v_mfma_f32_4x4x4f16 v[0:3], v[0:1], v[2:3], v[2:5] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_4x4x4f16 v[0:3], v[0:1], v[2:3], v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xca,0xd3,0x00,0x05,0x0a,0xe4]
+
+v_mfma_f32_4x4x4f16 v[0:3], v[0:1], a[2:3], v[2:5]
+// GFX90A: v_mfma_f32_4x4x4f16 v[0:3], v[0:1], a[2:3], v[2:5] ; encoding: [0x00,0x00,0xca,0xd3,0x00,0x05,0x0a,0x14]
+
+v_mfma_f32_4x4x4f16 v[0:3], v[0:1], a[2:3], v[2:5] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_4x4x4f16 v[0:3], v[0:1], a[2:3], v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xca,0xd3,0x00,0x05,0x0a,0xf4]
+
+v_mfma_f32_4x4x4f16 v[0:3], a[0:1], v[2:3], v[2:5]
+// GFX90A: v_mfma_f32_4x4x4f16 v[0:3], a[0:1], v[2:3], v[2:5] ; encoding: [0x00,0x00,0xca,0xd3,0x00,0x05,0x0a,0x0c]
+
+v_mfma_f32_4x4x4f16 v[0:3], a[0:1], v[2:3], v[2:5] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_4x4x4f16 v[0:3], a[0:1], v[2:3], v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xca,0xd3,0x00,0x05,0x0a,0xec]
+
+v_mfma_f32_4x4x4f16 v[0:3], a[0:1], a[2:3], v[2:5]
+// GFX90A: v_mfma_f32_4x4x4f16 v[0:3], a[0:1], a[2:3], v[2:5] ; encoding: [0x00,0x00,0xca,0xd3,0x00,0x05,0x0a,0x1c]
+
+v_mfma_f32_4x4x4f16 v[0:3], a[0:1], a[2:3], v[2:5] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_4x4x4f16 v[0:3], a[0:1], a[2:3], v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xca,0xd3,0x00,0x05,0x0a,0xfc]
+
+v_mfma_f32_4x4x4f16 a[0:3], v[0:1], v[2:3], -2.0
+// GFX90A: v_mfma_f32_4x4x4f16 a[0:3], v[0:1], v[2:3], -2.0 ; encoding: [0x00,0x80,0xca,0xd3,0x00,0x05,0xd6,0x03]
+
+v_mfma_f32_4x4x4f16 v[0:3], v[0:1], v[2:3], -2.0
+// GFX90A: v_mfma_f32_4x4x4f16 v[0:3], v[0:1], v[2:3], -2.0 ; encoding: [0x00,0x00,0xca,0xd3,0x00,0x05,0xd6,0x03]
+
+v_mfma_f32_4x4x4f16 a[0:3], v[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_4x4x4f16 a[0:3], v[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xca,0xd3,0x00,0x05,0xd6,0xe3]
+
+v_mfma_f32_4x4x4f16 v[0:3], v[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_4x4x4f16 v[0:3], v[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xca,0xd3,0x00,0x05,0xd6,0xe3]
+
+v_mfma_f32_4x4x4f16 a[0:3], v[0:1], a[2:3], -2.0
+// GFX90A: v_mfma_f32_4x4x4f16 a[0:3], v[0:1], a[2:3], -2.0 ; encoding: [0x00,0x80,0xca,0xd3,0x00,0x05,0xd6,0x13]
+
+v_mfma_f32_4x4x4f16 v[0:3], v[0:1], a[2:3], -2.0
+// GFX90A: v_mfma_f32_4x4x4f16 v[0:3], v[0:1], a[2:3], -2.0 ; encoding: [0x00,0x00,0xca,0xd3,0x00,0x05,0xd6,0x13]
+
+v_mfma_f32_4x4x4f16 a[0:3], v[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_4x4x4f16 a[0:3], v[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xca,0xd3,0x00,0x05,0xd6,0xf3]
+
+v_mfma_f32_4x4x4f16 v[0:3], v[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_4x4x4f16 v[0:3], v[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xca,0xd3,0x00,0x05,0xd6,0xf3]
+
+v_mfma_f32_4x4x4f16 a[0:3], a[0:1], v[2:3], -2.0
+// GFX90A: v_mfma_f32_4x4x4f16 a[0:3], a[0:1], v[2:3], -2.0 ; encoding: [0x00,0x80,0xca,0xd3,0x00,0x05,0xd6,0x0b]
+
+v_mfma_f32_4x4x4f16 v[0:3], a[0:1], v[2:3], -2.0
+// GFX90A: v_mfma_f32_4x4x4f16 v[0:3], a[0:1], v[2:3], -2.0 ; encoding: [0x00,0x00,0xca,0xd3,0x00,0x05,0xd6,0x0b]
+
+v_mfma_f32_4x4x4f16 a[0:3], a[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_4x4x4f16 a[0:3], a[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xca,0xd3,0x00,0x05,0xd6,0xeb]
+
+v_mfma_f32_4x4x4f16 v[0:3], a[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_4x4x4f16 v[0:3], a[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xca,0xd3,0x00,0x05,0xd6,0xeb]
+
+v_mfma_f32_4x4x4f16 a[0:3], a[0:1], a[2:3], -2.0
+// GFX90A: v_mfma_f32_4x4x4f16 a[0:3], a[0:1], a[2:3], -2.0 ; encoding: [0x00,0x80,0xca,0xd3,0x00,0x05,0xd6,0x1b]
+
+v_mfma_f32_4x4x4f16 v[0:3], a[0:1], a[2:3], -2.0
+// GFX90A: v_mfma_f32_4x4x4f16 v[0:3], a[0:1], a[2:3], -2.0 ; encoding: [0x00,0x00,0xca,0xd3,0x00,0x05,0xd6,0x1b]
+
+v_mfma_f32_4x4x4f16 a[0:3], a[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_4x4x4f16 a[0:3], a[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xca,0xd3,0x00,0x05,0xd6,0xfb]
+
+v_mfma_f32_4x4x4f16 v[0:3], a[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_4x4x4f16 v[0:3], a[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xca,0xd3,0x00,0x05,0xd6,0xfb]
+
+v_mfma_f32_32x32x8f16 a[0:15], v[0:1], v[2:3], a[2:17]
+// GFX90A: v_mfma_f32_32x32x8f16 a[0:15], v[0:1], v[2:3], a[2:17] ; encoding: [0x00,0x80,0xcc,0xd3,0x00,0x05,0x0a,0x04]
+
+v_mfma_f32_32x32x8f16 a[0:15], v[0:1], v[2:3], a[2:17] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_32x32x8f16 a[0:15], v[0:1], v[2:3], a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xcc,0xd3,0x00,0x05,0x0a,0xe4]
+
+v_mfma_f32_32x32x8f16 a[0:15], v[0:1], a[2:3], a[2:17]
+// GFX90A: v_mfma_f32_32x32x8f16 a[0:15], v[0:1], a[2:3], a[2:17] ; encoding: [0x00,0x80,0xcc,0xd3,0x00,0x05,0x0a,0x14]
+
+v_mfma_f32_32x32x8f16 a[0:15], v[0:1], a[2:3], a[2:17] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_32x32x8f16 a[0:15], v[0:1], a[2:3], a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xcc,0xd3,0x00,0x05,0x0a,0xf4]
+
+v_mfma_f32_32x32x8f16 a[0:15], a[0:1], v[2:3], a[2:17]
+// GFX90A: v_mfma_f32_32x32x8f16 a[0:15], a[0:1], v[2:3], a[2:17] ; encoding: [0x00,0x80,0xcc,0xd3,0x00,0x05,0x0a,0x0c]
+
+v_mfma_f32_32x32x8f16 a[0:15], a[0:1], v[2:3], a[2:17] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_32x32x8f16 a[0:15], a[0:1], v[2:3], a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xcc,0xd3,0x00,0x05,0x0a,0xec]
+
+v_mfma_f32_32x32x8f16 a[0:15], a[0:1], a[2:3], a[2:17]
+// GFX90A: v_mfma_f32_32x32x8f16 a[0:15], a[0:1], a[2:3], a[2:17] ; encoding: [0x00,0x80,0xcc,0xd3,0x00,0x05,0x0a,0x1c]
+
+v_mfma_f32_32x32x8f16 a[0:15], a[0:1], a[2:3], a[2:17] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_32x32x8f16 a[0:15], a[0:1], a[2:3], a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xcc,0xd3,0x00,0x05,0x0a,0xfc]
+
+v_mfma_f32_32x32x8f16 v[0:15], v[0:1], v[2:3], v[2:17]
+// GFX90A: v_mfma_f32_32x32x8f16 v[0:15], v[0:1], v[2:3], v[2:17] ; encoding: [0x00,0x00,0xcc,0xd3,0x00,0x05,0x0a,0x04]
+
+v_mfma_f32_32x32x8f16 v[0:15], v[0:1], v[2:3], v[2:17] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_32x32x8f16 v[0:15], v[0:1], v[2:3], v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xcc,0xd3,0x00,0x05,0x0a,0xe4]
+
+v_mfma_f32_32x32x8f16 v[0:15], v[0:1], a[2:3], v[2:17]
+// GFX90A: v_mfma_f32_32x32x8f16 v[0:15], v[0:1], a[2:3], v[2:17] ; encoding: [0x00,0x00,0xcc,0xd3,0x00,0x05,0x0a,0x14]
+
+v_mfma_f32_32x32x8f16 v[0:15], v[0:1], a[2:3], v[2:17] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_32x32x8f16 v[0:15], v[0:1], a[2:3], v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xcc,0xd3,0x00,0x05,0x0a,0xf4]
+
+v_mfma_f32_32x32x8f16 v[0:15], a[0:1], v[2:3], v[2:17]
+// GFX90A: v_mfma_f32_32x32x8f16 v[0:15], a[0:1], v[2:3], v[2:17] ; encoding: [0x00,0x00,0xcc,0xd3,0x00,0x05,0x0a,0x0c]
+
+v_mfma_f32_32x32x8f16 v[0:15], a[0:1], v[2:3], v[2:17] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_32x32x8f16 v[0:15], a[0:1], v[2:3], v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xcc,0xd3,0x00,0x05,0x0a,0xec]
+
+v_mfma_f32_32x32x8f16 v[0:15], a[0:1], a[2:3], v[2:17]
+// GFX90A: v_mfma_f32_32x32x8f16 v[0:15], a[0:1], a[2:3], v[2:17] ; encoding: [0x00,0x00,0xcc,0xd3,0x00,0x05,0x0a,0x1c]
+
+v_mfma_f32_32x32x8f16 v[0:15], a[0:1], a[2:3], v[2:17] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_32x32x8f16 v[0:15], a[0:1], a[2:3], v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xcc,0xd3,0x00,0x05,0x0a,0xfc]
+
+v_mfma_f32_32x32x8f16 a[0:15], v[0:1], v[2:3], -2.0
+// GFX90A: v_mfma_f32_32x32x8f16 a[0:15], v[0:1], v[2:3], -2.0 ; encoding: [0x00,0x80,0xcc,0xd3,0x00,0x05,0xd6,0x03]
+
+v_mfma_f32_32x32x8f16 v[0:15], v[0:1], v[2:3], -2.0
+// GFX90A: v_mfma_f32_32x32x8f16 v[0:15], v[0:1], v[2:3], -2.0 ; encoding: [0x00,0x00,0xcc,0xd3,0x00,0x05,0xd6,0x03]
+
+v_mfma_f32_32x32x8f16 a[0:15], v[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_32x32x8f16 a[0:15], v[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xcc,0xd3,0x00,0x05,0xd6,0xe3]
+
+v_mfma_f32_32x32x8f16 v[0:15], v[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_32x32x8f16 v[0:15], v[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xcc,0xd3,0x00,0x05,0xd6,0xe3]
+
+v_mfma_f32_32x32x8f16 a[0:15], v[0:1], a[2:3], -2.0
+// GFX90A: v_mfma_f32_32x32x8f16 a[0:15], v[0:1], a[2:3], -2.0 ; encoding: [0x00,0x80,0xcc,0xd3,0x00,0x05,0xd6,0x13]
+
+v_mfma_f32_32x32x8f16 v[0:15], v[0:1], a[2:3], -2.0
+// GFX90A: v_mfma_f32_32x32x8f16 v[0:15], v[0:1], a[2:3], -2.0 ; encoding: [0x00,0x00,0xcc,0xd3,0x00,0x05,0xd6,0x13]
+
+v_mfma_f32_32x32x8f16 a[0:15], v[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_32x32x8f16 a[0:15], v[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xcc,0xd3,0x00,0x05,0xd6,0xf3]
+
+v_mfma_f32_32x32x8f16 v[0:15], v[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_32x32x8f16 v[0:15], v[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xcc,0xd3,0x00,0x05,0xd6,0xf3]
+
+v_mfma_f32_32x32x8f16 a[0:15], a[0:1], v[2:3], -2.0
+// GFX90A: v_mfma_f32_32x32x8f16 a[0:15], a[0:1], v[2:3], -2.0 ; encoding: [0x00,0x80,0xcc,0xd3,0x00,0x05,0xd6,0x0b]
+
+v_mfma_f32_32x32x8f16 v[0:15], a[0:1], v[2:3], -2.0
+// GFX90A: v_mfma_f32_32x32x8f16 v[0:15], a[0:1], v[2:3], -2.0 ; encoding: [0x00,0x00,0xcc,0xd3,0x00,0x05,0xd6,0x0b]
+
+v_mfma_f32_32x32x8f16 a[0:15], a[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_32x32x8f16 a[0:15], a[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xcc,0xd3,0x00,0x05,0xd6,0xeb]
+
+v_mfma_f32_32x32x8f16 v[0:15], a[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_32x32x8f16 v[0:15], a[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xcc,0xd3,0x00,0x05,0xd6,0xeb]
+
+v_mfma_f32_32x32x8f16 a[0:15], a[0:1], a[2:3], -2.0
+// GFX90A: v_mfma_f32_32x32x8f16 a[0:15], a[0:1], a[2:3], -2.0 ; encoding: [0x00,0x80,0xcc,0xd3,0x00,0x05,0xd6,0x1b]
+
+v_mfma_f32_32x32x8f16 v[0:15], a[0:1], a[2:3], -2.0
+// GFX90A: v_mfma_f32_32x32x8f16 v[0:15], a[0:1], a[2:3], -2.0 ; encoding: [0x00,0x00,0xcc,0xd3,0x00,0x05,0xd6,0x1b]
+
+v_mfma_f32_32x32x8f16 a[0:15], a[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_32x32x8f16 a[0:15], a[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xcc,0xd3,0x00,0x05,0xd6,0xfb]
+
+v_mfma_f32_32x32x8f16 v[0:15], a[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_32x32x8f16 v[0:15], a[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xcc,0xd3,0x00,0x05,0xd6,0xfb]
+
+v_mfma_f32_16x16x16f16 a[0:3], v[0:1], v[2:3], a[2:5]
+// GFX90A: v_mfma_f32_16x16x16f16 a[0:3], v[0:1], v[2:3], a[2:5] ; encoding: [0x00,0x80,0xcd,0xd3,0x00,0x05,0x0a,0x04]
+
+v_mfma_f32_16x16x16f16 a[0:3], v[0:1], v[2:3], a[2:5] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_16x16x16f16 a[0:3], v[0:1], v[2:3], a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xcd,0xd3,0x00,0x05,0x0a,0xe4]
+
+v_mfma_f32_16x16x16f16 a[0:3], v[0:1], a[2:3], a[2:5]
+// GFX90A: v_mfma_f32_16x16x16f16 a[0:3], v[0:1], a[2:3], a[2:5] ; encoding: [0x00,0x80,0xcd,0xd3,0x00,0x05,0x0a,0x14]
+
+v_mfma_f32_16x16x16f16 a[0:3], v[0:1], a[2:3], a[2:5] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_16x16x16f16 a[0:3], v[0:1], a[2:3], a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xcd,0xd3,0x00,0x05,0x0a,0xf4]
+
+v_mfma_f32_16x16x16f16 a[0:3], a[0:1], v[2:3], a[2:5]
+// GFX90A: v_mfma_f32_16x16x16f16 a[0:3], a[0:1], v[2:3], a[2:5] ; encoding: [0x00,0x80,0xcd,0xd3,0x00,0x05,0x0a,0x0c]
+
+v_mfma_f32_16x16x16f16 a[0:3], a[0:1], v[2:3], a[2:5] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_16x16x16f16 a[0:3], a[0:1], v[2:3], a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xcd,0xd3,0x00,0x05,0x0a,0xec]
+
+v_mfma_f32_16x16x16f16 a[0:3], a[0:1], a[2:3], a[2:5]
+// GFX90A: v_mfma_f32_16x16x16f16 a[0:3], a[0:1], a[2:3], a[2:5] ; encoding: [0x00,0x80,0xcd,0xd3,0x00,0x05,0x0a,0x1c]
+
+v_mfma_f32_16x16x16f16 a[0:3], a[0:1], a[2:3], a[2:5] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_16x16x16f16 a[0:3], a[0:1], a[2:3], a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xcd,0xd3,0x00,0x05,0x0a,0xfc]
+
+v_mfma_f32_16x16x16f16 v[0:3], v[0:1], v[2:3], v[2:5]
+// GFX90A: v_mfma_f32_16x16x16f16 v[0:3], v[0:1], v[2:3], v[2:5] ; encoding: [0x00,0x00,0xcd,0xd3,0x00,0x05,0x0a,0x04]
+
+v_mfma_f32_16x16x16f16 v[0:3], v[0:1], v[2:3], v[2:5] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_16x16x16f16 v[0:3], v[0:1], v[2:3], v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xcd,0xd3,0x00,0x05,0x0a,0xe4]
+
+v_mfma_f32_16x16x16f16 v[0:3], v[0:1], a[2:3], v[2:5]
+// GFX90A: v_mfma_f32_16x16x16f16 v[0:3], v[0:1], a[2:3], v[2:5] ; encoding: [0x00,0x00,0xcd,0xd3,0x00,0x05,0x0a,0x14]
+
+v_mfma_f32_16x16x16f16 v[0:3], v[0:1], a[2:3], v[2:5] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_16x16x16f16 v[0:3], v[0:1], a[2:3], v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xcd,0xd3,0x00,0x05,0x0a,0xf4]
+
+v_mfma_f32_16x16x16f16 v[0:3], a[0:1], v[2:3], v[2:5]
+// GFX90A: v_mfma_f32_16x16x16f16 v[0:3], a[0:1], v[2:3], v[2:5] ; encoding: [0x00,0x00,0xcd,0xd3,0x00,0x05,0x0a,0x0c]
+
+v_mfma_f32_16x16x16f16 v[0:3], a[0:1], v[2:3], v[2:5] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_16x16x16f16 v[0:3], a[0:1], v[2:3], v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xcd,0xd3,0x00,0x05,0x0a,0xec]
+
+v_mfma_f32_16x16x16f16 v[0:3], a[0:1], a[2:3], v[2:5]
+// GFX90A: v_mfma_f32_16x16x16f16 v[0:3], a[0:1], a[2:3], v[2:5] ; encoding: [0x00,0x00,0xcd,0xd3,0x00,0x05,0x0a,0x1c]
+
+v_mfma_f32_16x16x16f16 v[0:3], a[0:1], a[2:3], v[2:5] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_16x16x16f16 v[0:3], a[0:1], a[2:3], v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xcd,0xd3,0x00,0x05,0x0a,0xfc]
+
+v_mfma_f32_16x16x16f16 a[0:3], v[0:1], v[2:3], -2.0
+// GFX90A: v_mfma_f32_16x16x16f16 a[0:3], v[0:1], v[2:3], -2.0 ; encoding: [0x00,0x80,0xcd,0xd3,0x00,0x05,0xd6,0x03]
+
+v_mfma_f32_16x16x16f16 v[0:3], v[0:1], v[2:3], -2.0
+// GFX90A: v_mfma_f32_16x16x16f16 v[0:3], v[0:1], v[2:3], -2.0 ; encoding: [0x00,0x00,0xcd,0xd3,0x00,0x05,0xd6,0x03]
+
+v_mfma_f32_16x16x16f16 a[0:3], v[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_16x16x16f16 a[0:3], v[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xcd,0xd3,0x00,0x05,0xd6,0xe3]
+
+v_mfma_f32_16x16x16f16 v[0:3], v[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_16x16x16f16 v[0:3], v[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xcd,0xd3,0x00,0x05,0xd6,0xe3]
+
+v_mfma_f32_16x16x16f16 a[0:3], v[0:1], a[2:3], -2.0
+// GFX90A: v_mfma_f32_16x16x16f16 a[0:3], v[0:1], a[2:3], -2.0 ; encoding: [0x00,0x80,0xcd,0xd3,0x00,0x05,0xd6,0x13]
+
+v_mfma_f32_16x16x16f16 v[0:3], v[0:1], a[2:3], -2.0
+// GFX90A: v_mfma_f32_16x16x16f16 v[0:3], v[0:1], a[2:3], -2.0 ; encoding: [0x00,0x00,0xcd,0xd3,0x00,0x05,0xd6,0x13]
+
+v_mfma_f32_16x16x16f16 a[0:3], v[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_16x16x16f16 a[0:3], v[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xcd,0xd3,0x00,0x05,0xd6,0xf3]
+
+v_mfma_f32_16x16x16f16 v[0:3], v[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_16x16x16f16 v[0:3], v[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xcd,0xd3,0x00,0x05,0xd6,0xf3]
+
+v_mfma_f32_16x16x16f16 a[0:3], a[0:1], v[2:3], -2.0
+// GFX90A: v_mfma_f32_16x16x16f16 a[0:3], a[0:1], v[2:3], -2.0 ; encoding: [0x00,0x80,0xcd,0xd3,0x00,0x05,0xd6,0x0b]
+
+v_mfma_f32_16x16x16f16 v[0:3], a[0:1], v[2:3], -2.0
+// GFX90A: v_mfma_f32_16x16x16f16 v[0:3], a[0:1], v[2:3], -2.0 ; encoding: [0x00,0x00,0xcd,0xd3,0x00,0x05,0xd6,0x0b]
+
+v_mfma_f32_16x16x16f16 a[0:3], a[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_16x16x16f16 a[0:3], a[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xcd,0xd3,0x00,0x05,0xd6,0xeb]
+
+v_mfma_f32_16x16x16f16 v[0:3], a[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_16x16x16f16 v[0:3], a[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xcd,0xd3,0x00,0x05,0xd6,0xeb]
+
+v_mfma_f32_16x16x16f16 a[0:3], a[0:1], a[2:3], -2.0
+// GFX90A: v_mfma_f32_16x16x16f16 a[0:3], a[0:1], a[2:3], -2.0 ; encoding: [0x00,0x80,0xcd,0xd3,0x00,0x05,0xd6,0x1b]
+
+v_mfma_f32_16x16x16f16 v[0:3], a[0:1], a[2:3], -2.0
+// GFX90A: v_mfma_f32_16x16x16f16 v[0:3], a[0:1], a[2:3], -2.0 ; encoding: [0x00,0x00,0xcd,0xd3,0x00,0x05,0xd6,0x1b]
+
+v_mfma_f32_16x16x16f16 a[0:3], a[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_16x16x16f16 a[0:3], a[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xcd,0xd3,0x00,0x05,0xd6,0xfb]
+
+v_mfma_f32_16x16x16f16 v[0:3], a[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_16x16x16f16 v[0:3], a[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xcd,0xd3,0x00,0x05,0xd6,0xfb]
+
+v_mfma_i32_32x32x4i8 a[0:31], v0, v1, a[2:33]
+// GFX90A: v_mfma_i32_32x32x4i8 a[0:31], v0, v1, a[2:33] ; encoding: [0x00,0x80,0xd0,0xd3,0x00,0x03,0x0a,0x04]
+
+v_mfma_i32_32x32x4i8 a[0:31], v0, v1, a[2:33] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_i32_32x32x4i8 a[0:31], v0, v1, a[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd0,0xd3,0x00,0x03,0x0a,0xe4]
+
+v_mfma_i32_32x32x4i8 a[0:31], v0, a1, a[2:33]
+// GFX90A: v_mfma_i32_32x32x4i8 a[0:31], v0, a1, a[2:33] ; encoding: [0x00,0x80,0xd0,0xd3,0x00,0x03,0x0a,0x14]
+
+v_mfma_i32_32x32x4i8 a[0:31], v0, a1, a[2:33] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_i32_32x32x4i8 a[0:31], v0, a1, a[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd0,0xd3,0x00,0x03,0x0a,0xf4]
+
+v_mfma_i32_32x32x4i8 a[0:31], a0, v1, a[2:33]
+// GFX90A: v_mfma_i32_32x32x4i8 a[0:31], a0, v1, a[2:33] ; encoding: [0x00,0x80,0xd0,0xd3,0x00,0x03,0x0a,0x0c]
+
+v_mfma_i32_32x32x4i8 a[0:31], a0, v1, a[2:33] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_i32_32x32x4i8 a[0:31], a0, v1, a[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd0,0xd3,0x00,0x03,0x0a,0xec]
+
+v_mfma_i32_32x32x4i8 a[0:31], a0, a1, a[2:33]
+// GFX90A: v_mfma_i32_32x32x4i8 a[0:31], a0, a1, a[2:33] ; encoding: [0x00,0x80,0xd0,0xd3,0x00,0x03,0x0a,0x1c]
+
+v_mfma_i32_32x32x4i8 a[0:31], a0, a1, a[2:33] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_i32_32x32x4i8 a[0:31], a0, a1, a[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd0,0xd3,0x00,0x03,0x0a,0xfc]
+
+v_mfma_i32_32x32x4i8 v[0:31], v0, v1, v[2:33]
+// GFX90A: v_mfma_i32_32x32x4i8 v[0:31], v0, v1, v[2:33] ; encoding: [0x00,0x00,0xd0,0xd3,0x00,0x03,0x0a,0x04]
+
+v_mfma_i32_32x32x4i8 v[0:31], v0, v1, v[2:33] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_i32_32x32x4i8 v[0:31], v0, v1, v[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd0,0xd3,0x00,0x03,0x0a,0xe4]
+
+v_mfma_i32_32x32x4i8 v[0:31], v0, a1, v[2:33]
+// GFX90A: v_mfma_i32_32x32x4i8 v[0:31], v0, a1, v[2:33] ; encoding: [0x00,0x00,0xd0,0xd3,0x00,0x03,0x0a,0x14]
+
+v_mfma_i32_32x32x4i8 v[0:31], v0, a1, v[2:33] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_i32_32x32x4i8 v[0:31], v0, a1, v[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd0,0xd3,0x00,0x03,0x0a,0xf4]
+
+v_mfma_i32_32x32x4i8 v[0:31], a0, v1, v[2:33]
+// GFX90A: v_mfma_i32_32x32x4i8 v[0:31], a0, v1, v[2:33] ; encoding: [0x00,0x00,0xd0,0xd3,0x00,0x03,0x0a,0x0c]
+
+v_mfma_i32_32x32x4i8 v[0:31], a0, v1, v[2:33] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_i32_32x32x4i8 v[0:31], a0, v1, v[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd0,0xd3,0x00,0x03,0x0a,0xec]
+
+v_mfma_i32_32x32x4i8 v[0:31], a0, a1, v[2:33]
+// GFX90A: v_mfma_i32_32x32x4i8 v[0:31], a0, a1, v[2:33] ; encoding: [0x00,0x00,0xd0,0xd3,0x00,0x03,0x0a,0x1c]
+
+v_mfma_i32_32x32x4i8 v[0:31], a0, a1, v[2:33] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_i32_32x32x4i8 v[0:31], a0, a1, v[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd0,0xd3,0x00,0x03,0x0a,0xfc]
+
+v_mfma_i32_32x32x4i8 a[0:31], v0, v1, 2
+// GFX90A: v_mfma_i32_32x32x4i8 a[0:31], v0, v1, 2 ; encoding: [0x00,0x80,0xd0,0xd3,0x00,0x03,0x0a,0x02]
+
+v_mfma_i32_32x32x4i8 v[0:31], v0, v1, 2
+// GFX90A: v_mfma_i32_32x32x4i8 v[0:31], v0, v1, 2 ; encoding: [0x00,0x00,0xd0,0xd3,0x00,0x03,0x0a,0x02]
+
+v_mfma_i32_32x32x4i8 a[0:31], v0, v1, 2 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_i32_32x32x4i8 a[0:31], v0, v1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd0,0xd3,0x00,0x03,0x0a,0xe2]
+
+v_mfma_i32_32x32x4i8 v[0:31], v0, v1, 2 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_i32_32x32x4i8 v[0:31], v0, v1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd0,0xd3,0x00,0x03,0x0a,0xe2]
+
+v_mfma_i32_32x32x4i8 a[0:31], v0, a1, 2
+// GFX90A: v_mfma_i32_32x32x4i8 a[0:31], v0, a1, 2 ; encoding: [0x00,0x80,0xd0,0xd3,0x00,0x03,0x0a,0x12]
+
+v_mfma_i32_32x32x4i8 v[0:31], v0, a1, 2
+// GFX90A: v_mfma_i32_32x32x4i8 v[0:31], v0, a1, 2 ; encoding: [0x00,0x00,0xd0,0xd3,0x00,0x03,0x0a,0x12]
+
+v_mfma_i32_32x32x4i8 a[0:31], v0, a1, 2 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_i32_32x32x4i8 a[0:31], v0, a1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd0,0xd3,0x00,0x03,0x0a,0xf2]
+
+v_mfma_i32_32x32x4i8 v[0:31], v0, a1, 2 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_i32_32x32x4i8 v[0:31], v0, a1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd0,0xd3,0x00,0x03,0x0a,0xf2]
+
+v_mfma_i32_32x32x4i8 a[0:31], a0, v1, 2
+// GFX90A: v_mfma_i32_32x32x4i8 a[0:31], a0, v1, 2 ; encoding: [0x00,0x80,0xd0,0xd3,0x00,0x03,0x0a,0x0a]
+
+v_mfma_i32_32x32x4i8 v[0:31], a0, v1, 2
+// GFX90A: v_mfma_i32_32x32x4i8 v[0:31], a0, v1, 2 ; encoding: [0x00,0x00,0xd0,0xd3,0x00,0x03,0x0a,0x0a]
+
+v_mfma_i32_32x32x4i8 a[0:31], a0, v1, 2 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_i32_32x32x4i8 a[0:31], a0, v1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd0,0xd3,0x00,0x03,0x0a,0xea]
+
+v_mfma_i32_32x32x4i8 v[0:31], a0, v1, 2 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_i32_32x32x4i8 v[0:31], a0, v1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd0,0xd3,0x00,0x03,0x0a,0xea]
+
+v_mfma_i32_32x32x4i8 a[0:31], a0, a1, 2
+// GFX90A: v_mfma_i32_32x32x4i8 a[0:31], a0, a1, 2 ; encoding: [0x00,0x80,0xd0,0xd3,0x00,0x03,0x0a,0x1a]
+
+v_mfma_i32_32x32x4i8 v[0:31], a0, a1, 2
+// GFX90A: v_mfma_i32_32x32x4i8 v[0:31], a0, a1, 2 ; encoding: [0x00,0x00,0xd0,0xd3,0x00,0x03,0x0a,0x1a]
+
+v_mfma_i32_32x32x4i8 a[0:31], a0, a1, 2 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_i32_32x32x4i8 a[0:31], a0, a1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd0,0xd3,0x00,0x03,0x0a,0xfa]
+
+v_mfma_i32_32x32x4i8 v[0:31], a0, a1, 2 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_i32_32x32x4i8 v[0:31], a0, a1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd0,0xd3,0x00,0x03,0x0a,0xfa]
+
+v_mfma_i32_16x16x4i8 a[0:15], v0, v1, a[2:17]
+// GFX90A: v_mfma_i32_16x16x4i8 a[0:15], v0, v1, a[2:17] ; encoding: [0x00,0x80,0xd1,0xd3,0x00,0x03,0x0a,0x04]
+
+v_mfma_i32_16x16x4i8 a[0:15], v0, v1, a[2:17] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_i32_16x16x4i8 a[0:15], v0, v1, a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd1,0xd3,0x00,0x03,0x0a,0xe4]
+
+v_mfma_i32_16x16x4i8 a[0:15], v0, a1, a[2:17]
+// GFX90A: v_mfma_i32_16x16x4i8 a[0:15], v0, a1, a[2:17] ; encoding: [0x00,0x80,0xd1,0xd3,0x00,0x03,0x0a,0x14]
+
+v_mfma_i32_16x16x4i8 a[0:15], v0, a1, a[2:17] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_i32_16x16x4i8 a[0:15], v0, a1, a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd1,0xd3,0x00,0x03,0x0a,0xf4]
+
+v_mfma_i32_16x16x4i8 a[0:15], a0, v1, a[2:17]
+// GFX90A: v_mfma_i32_16x16x4i8 a[0:15], a0, v1, a[2:17] ; encoding: [0x00,0x80,0xd1,0xd3,0x00,0x03,0x0a,0x0c]
+
+v_mfma_i32_16x16x4i8 a[0:15], a0, v1, a[2:17] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_i32_16x16x4i8 a[0:15], a0, v1, a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd1,0xd3,0x00,0x03,0x0a,0xec]
+
+v_mfma_i32_16x16x4i8 a[0:15], a0, a1, a[2:17]
+// GFX90A: v_mfma_i32_16x16x4i8 a[0:15], a0, a1, a[2:17] ; encoding: [0x00,0x80,0xd1,0xd3,0x00,0x03,0x0a,0x1c]
+
+v_mfma_i32_16x16x4i8 a[0:15], a0, a1, a[2:17] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_i32_16x16x4i8 a[0:15], a0, a1, a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd1,0xd3,0x00,0x03,0x0a,0xfc]
+
+v_mfma_i32_16x16x4i8 v[0:15], v0, v1, v[2:17]
+// GFX90A: v_mfma_i32_16x16x4i8 v[0:15], v0, v1, v[2:17] ; encoding: [0x00,0x00,0xd1,0xd3,0x00,0x03,0x0a,0x04]
+
+v_mfma_i32_16x16x4i8 v[0:15], v0, v1, v[2:17] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_i32_16x16x4i8 v[0:15], v0, v1, v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd1,0xd3,0x00,0x03,0x0a,0xe4]
+
+v_mfma_i32_16x16x4i8 v[0:15], v0, a1, v[2:17]
+// GFX90A: v_mfma_i32_16x16x4i8 v[0:15], v0, a1, v[2:17] ; encoding: [0x00,0x00,0xd1,0xd3,0x00,0x03,0x0a,0x14]
+
+v_mfma_i32_16x16x4i8 v[0:15], v0, a1, v[2:17] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_i32_16x16x4i8 v[0:15], v0, a1, v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd1,0xd3,0x00,0x03,0x0a,0xf4]
+
+v_mfma_i32_16x16x4i8 v[0:15], a0, v1, v[2:17]
+// GFX90A: v_mfma_i32_16x16x4i8 v[0:15], a0, v1, v[2:17] ; encoding: [0x00,0x00,0xd1,0xd3,0x00,0x03,0x0a,0x0c]
+
+v_mfma_i32_16x16x4i8 v[0:15], a0, v1, v[2:17] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_i32_16x16x4i8 v[0:15], a0, v1, v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd1,0xd3,0x00,0x03,0x0a,0xec]
+
+v_mfma_i32_16x16x4i8 v[0:15], a0, a1, v[2:17]
+// GFX90A: v_mfma_i32_16x16x4i8 v[0:15], a0, a1, v[2:17] ; encoding: [0x00,0x00,0xd1,0xd3,0x00,0x03,0x0a,0x1c]
+
+v_mfma_i32_16x16x4i8 v[0:15], a0, a1, v[2:17] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_i32_16x16x4i8 v[0:15], a0, a1, v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd1,0xd3,0x00,0x03,0x0a,0xfc]
+
+v_mfma_i32_16x16x4i8 a[0:15], v0, v1, 2
+// GFX90A: v_mfma_i32_16x16x4i8 a[0:15], v0, v1, 2 ; encoding: [0x00,0x80,0xd1,0xd3,0x00,0x03,0x0a,0x02]
+
+v_mfma_i32_16x16x4i8 v[0:15], v0, v1, 2
+// GFX90A: v_mfma_i32_16x16x4i8 v[0:15], v0, v1, 2 ; encoding: [0x00,0x00,0xd1,0xd3,0x00,0x03,0x0a,0x02]
+
+v_mfma_i32_16x16x4i8 a[0:15], v0, v1, 2 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_i32_16x16x4i8 a[0:15], v0, v1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd1,0xd3,0x00,0x03,0x0a,0xe2]
+
+v_mfma_i32_16x16x4i8 v[0:15], v0, v1, 2 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_i32_16x16x4i8 v[0:15], v0, v1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd1,0xd3,0x00,0x03,0x0a,0xe2]
+
+v_mfma_i32_16x16x4i8 a[0:15], v0, a1, 2
+// GFX90A: v_mfma_i32_16x16x4i8 a[0:15], v0, a1, 2 ; encoding: [0x00,0x80,0xd1,0xd3,0x00,0x03,0x0a,0x12]
+
+v_mfma_i32_16x16x4i8 v[0:15], v0, a1, 2
+// GFX90A: v_mfma_i32_16x16x4i8 v[0:15], v0, a1, 2 ; encoding: [0x00,0x00,0xd1,0xd3,0x00,0x03,0x0a,0x12]
+
+v_mfma_i32_16x16x4i8 a[0:15], v0, a1, 2 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_i32_16x16x4i8 a[0:15], v0, a1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd1,0xd3,0x00,0x03,0x0a,0xf2]
+
+v_mfma_i32_16x16x4i8 v[0:15], v0, a1, 2 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_i32_16x16x4i8 v[0:15], v0, a1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd1,0xd3,0x00,0x03,0x0a,0xf2]
+
+v_mfma_i32_16x16x4i8 a[0:15], a0, v1, 2
+// GFX90A: v_mfma_i32_16x16x4i8 a[0:15], a0, v1, 2 ; encoding: [0x00,0x80,0xd1,0xd3,0x00,0x03,0x0a,0x0a]
+
+v_mfma_i32_16x16x4i8 v[0:15], a0, v1, 2
+// GFX90A: v_mfma_i32_16x16x4i8 v[0:15], a0, v1, 2 ; encoding: [0x00,0x00,0xd1,0xd3,0x00,0x03,0x0a,0x0a]
+
+v_mfma_i32_16x16x4i8 a[0:15], a0, v1, 2 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_i32_16x16x4i8 a[0:15], a0, v1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd1,0xd3,0x00,0x03,0x0a,0xea]
+
+v_mfma_i32_16x16x4i8 v[0:15], a0, v1, 2 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_i32_16x16x4i8 v[0:15], a0, v1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd1,0xd3,0x00,0x03,0x0a,0xea]
+
+v_mfma_i32_16x16x4i8 a[0:15], a0, a1, 2
+// GFX90A: v_mfma_i32_16x16x4i8 a[0:15], a0, a1, 2 ; encoding: [0x00,0x80,0xd1,0xd3,0x00,0x03,0x0a,0x1a]
+
+v_mfma_i32_16x16x4i8 v[0:15], a0, a1, 2
+// GFX90A: v_mfma_i32_16x16x4i8 v[0:15], a0, a1, 2 ; encoding: [0x00,0x00,0xd1,0xd3,0x00,0x03,0x0a,0x1a]
+
+v_mfma_i32_16x16x4i8 a[0:15], a0, a1, 2 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_i32_16x16x4i8 a[0:15], a0, a1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd1,0xd3,0x00,0x03,0x0a,0xfa]
+
+v_mfma_i32_16x16x4i8 v[0:15], a0, a1, 2 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_i32_16x16x4i8 v[0:15], a0, a1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd1,0xd3,0x00,0x03,0x0a,0xfa]
+
+v_mfma_i32_4x4x4i8 a[0:3], v0, v1, a[2:5]
+// GFX90A: v_mfma_i32_4x4x4i8 a[0:3], v0, v1, a[2:5] ; encoding: [0x00,0x80,0xd2,0xd3,0x00,0x03,0x0a,0x04]
+
+v_mfma_i32_4x4x4i8 a[0:3], v0, v1, a[2:5] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_i32_4x4x4i8 a[0:3], v0, v1, a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd2,0xd3,0x00,0x03,0x0a,0xe4]
+
+v_mfma_i32_4x4x4i8 a[0:3], v0, a1, a[2:5]
+// GFX90A: v_mfma_i32_4x4x4i8 a[0:3], v0, a1, a[2:5] ; encoding: [0x00,0x80,0xd2,0xd3,0x00,0x03,0x0a,0x14]
+
+v_mfma_i32_4x4x4i8 a[0:3], v0, a1, a[2:5] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_i32_4x4x4i8 a[0:3], v0, a1, a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd2,0xd3,0x00,0x03,0x0a,0xf4]
+
+v_mfma_i32_4x4x4i8 a[0:3], a0, v1, a[2:5]
+// GFX90A: v_mfma_i32_4x4x4i8 a[0:3], a0, v1, a[2:5] ; encoding: [0x00,0x80,0xd2,0xd3,0x00,0x03,0x0a,0x0c]
+
+v_mfma_i32_4x4x4i8 a[0:3], a0, v1, a[2:5] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_i32_4x4x4i8 a[0:3], a0, v1, a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd2,0xd3,0x00,0x03,0x0a,0xec]
+
+v_mfma_i32_4x4x4i8 a[0:3], a0, a1, a[2:5]
+// GFX90A: v_mfma_i32_4x4x4i8 a[0:3], a0, a1, a[2:5] ; encoding: [0x00,0x80,0xd2,0xd3,0x00,0x03,0x0a,0x1c]
+
+v_mfma_i32_4x4x4i8 a[0:3], a0, a1, a[2:5] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_i32_4x4x4i8 a[0:3], a0, a1, a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd2,0xd3,0x00,0x03,0x0a,0xfc]
+
+v_mfma_i32_4x4x4i8 v[0:3], v0, v1, v[2:5]
+// GFX90A: v_mfma_i32_4x4x4i8 v[0:3], v0, v1, v[2:5] ; encoding: [0x00,0x00,0xd2,0xd3,0x00,0x03,0x0a,0x04]
+
+v_mfma_i32_4x4x4i8 v[0:3], v0, v1, v[2:5] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_i32_4x4x4i8 v[0:3], v0, v1, v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd2,0xd3,0x00,0x03,0x0a,0xe4]
+
+v_mfma_i32_4x4x4i8 v[0:3], v0, a1, v[2:5]
+// GFX90A: v_mfma_i32_4x4x4i8 v[0:3], v0, a1, v[2:5] ; encoding: [0x00,0x00,0xd2,0xd3,0x00,0x03,0x0a,0x14]
+
+v_mfma_i32_4x4x4i8 v[0:3], v0, a1, v[2:5] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_i32_4x4x4i8 v[0:3], v0, a1, v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd2,0xd3,0x00,0x03,0x0a,0xf4]
+
+v_mfma_i32_4x4x4i8 v[0:3], a0, v1, v[2:5]
+// GFX90A: v_mfma_i32_4x4x4i8 v[0:3], a0, v1, v[2:5] ; encoding: [0x00,0x00,0xd2,0xd3,0x00,0x03,0x0a,0x0c]
+
+v_mfma_i32_4x4x4i8 v[0:3], a0, v1, v[2:5] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_i32_4x4x4i8 v[0:3], a0, v1, v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd2,0xd3,0x00,0x03,0x0a,0xec]
+
+v_mfma_i32_4x4x4i8 v[0:3], a0, a1, v[2:5]
+// GFX90A: v_mfma_i32_4x4x4i8 v[0:3], a0, a1, v[2:5] ; encoding: [0x00,0x00,0xd2,0xd3,0x00,0x03,0x0a,0x1c]
+
+v_mfma_i32_4x4x4i8 v[0:3], a0, a1, v[2:5] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_i32_4x4x4i8 v[0:3], a0, a1, v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd2,0xd3,0x00,0x03,0x0a,0xfc]
+
+v_mfma_i32_4x4x4i8 a[0:3], v0, v1, 2
+// GFX90A: v_mfma_i32_4x4x4i8 a[0:3], v0, v1, 2 ; encoding: [0x00,0x80,0xd2,0xd3,0x00,0x03,0x0a,0x02]
+
+v_mfma_i32_4x4x4i8 v[0:3], v0, v1, 2
+// GFX90A: v_mfma_i32_4x4x4i8 v[0:3], v0, v1, 2 ; encoding: [0x00,0x00,0xd2,0xd3,0x00,0x03,0x0a,0x02]
+
+v_mfma_i32_4x4x4i8 a[0:3], v0, v1, 2 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_i32_4x4x4i8 a[0:3], v0, v1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd2,0xd3,0x00,0x03,0x0a,0xe2]
+
+v_mfma_i32_4x4x4i8 v[0:3], v0, v1, 2 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_i32_4x4x4i8 v[0:3], v0, v1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd2,0xd3,0x00,0x03,0x0a,0xe2]
+
+v_mfma_i32_4x4x4i8 a[0:3], v0, a1, 2
+// GFX90A: v_mfma_i32_4x4x4i8 a[0:3], v0, a1, 2 ; encoding: [0x00,0x80,0xd2,0xd3,0x00,0x03,0x0a,0x12]
+
+v_mfma_i32_4x4x4i8 v[0:3], v0, a1, 2
+// GFX90A: v_mfma_i32_4x4x4i8 v[0:3], v0, a1, 2 ; encoding: [0x00,0x00,0xd2,0xd3,0x00,0x03,0x0a,0x12]
+
+v_mfma_i32_4x4x4i8 a[0:3], v0, a1, 2 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_i32_4x4x4i8 a[0:3], v0, a1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd2,0xd3,0x00,0x03,0x0a,0xf2]
+
+v_mfma_i32_4x4x4i8 v[0:3], v0, a1, 2 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_i32_4x4x4i8 v[0:3], v0, a1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd2,0xd3,0x00,0x03,0x0a,0xf2]
+
+v_mfma_i32_4x4x4i8 a[0:3], a0, v1, 2
+// GFX90A: v_mfma_i32_4x4x4i8 a[0:3], a0, v1, 2 ; encoding: [0x00,0x80,0xd2,0xd3,0x00,0x03,0x0a,0x0a]
+
+v_mfma_i32_4x4x4i8 v[0:3], a0, v1, 2
+// GFX90A: v_mfma_i32_4x4x4i8 v[0:3], a0, v1, 2 ; encoding: [0x00,0x00,0xd2,0xd3,0x00,0x03,0x0a,0x0a]
+
+v_mfma_i32_4x4x4i8 a[0:3], a0, v1, 2 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_i32_4x4x4i8 a[0:3], a0, v1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd2,0xd3,0x00,0x03,0x0a,0xea]
+
+v_mfma_i32_4x4x4i8 v[0:3], a0, v1, 2 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_i32_4x4x4i8 v[0:3], a0, v1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd2,0xd3,0x00,0x03,0x0a,0xea]
+
+v_mfma_i32_4x4x4i8 a[0:3], a0, a1, 2
+// GFX90A: v_mfma_i32_4x4x4i8 a[0:3], a0, a1, 2 ; encoding: [0x00,0x80,0xd2,0xd3,0x00,0x03,0x0a,0x1a]
+
+v_mfma_i32_4x4x4i8 v[0:3], a0, a1, 2
+// GFX90A: v_mfma_i32_4x4x4i8 v[0:3], a0, a1, 2 ; encoding: [0x00,0x00,0xd2,0xd3,0x00,0x03,0x0a,0x1a]
+
+v_mfma_i32_4x4x4i8 a[0:3], a0, a1, 2 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_i32_4x4x4i8 a[0:3], a0, a1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd2,0xd3,0x00,0x03,0x0a,0xfa]
+
+v_mfma_i32_4x4x4i8 v[0:3], a0, a1, 2 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_i32_4x4x4i8 v[0:3], a0, a1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd2,0xd3,0x00,0x03,0x0a,0xfa]
+
+v_mfma_i32_32x32x8i8 a[0:15], v0, v1, a[2:17]
+// GFX90A: v_mfma_i32_32x32x8i8 a[0:15], v0, v1, a[2:17] ; encoding: [0x00,0x80,0xd4,0xd3,0x00,0x03,0x0a,0x04]
+
+v_mfma_i32_32x32x8i8 a[0:15], v0, v1, a[2:17] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_i32_32x32x8i8 a[0:15], v0, v1, a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd4,0xd3,0x00,0x03,0x0a,0xe4]
+
+v_mfma_i32_32x32x8i8 a[0:15], v0, a1, a[2:17]
+// GFX90A: v_mfma_i32_32x32x8i8 a[0:15], v0, a1, a[2:17] ; encoding: [0x00,0x80,0xd4,0xd3,0x00,0x03,0x0a,0x14]
+
+v_mfma_i32_32x32x8i8 a[0:15], v0, a1, a[2:17] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_i32_32x32x8i8 a[0:15], v0, a1, a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd4,0xd3,0x00,0x03,0x0a,0xf4]
+
+v_mfma_i32_32x32x8i8 a[0:15], a0, v1, a[2:17]
+// GFX90A: v_mfma_i32_32x32x8i8 a[0:15], a0, v1, a[2:17] ; encoding: [0x00,0x80,0xd4,0xd3,0x00,0x03,0x0a,0x0c]
+
+v_mfma_i32_32x32x8i8 a[0:15], a0, v1, a[2:17] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_i32_32x32x8i8 a[0:15], a0, v1, a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd4,0xd3,0x00,0x03,0x0a,0xec]
+
+v_mfma_i32_32x32x8i8 a[0:15], a0, a1, a[2:17]
+// GFX90A: v_mfma_i32_32x32x8i8 a[0:15], a0, a1, a[2:17] ; encoding: [0x00,0x80,0xd4,0xd3,0x00,0x03,0x0a,0x1c]
+
+v_mfma_i32_32x32x8i8 a[0:15], a0, a1, a[2:17] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_i32_32x32x8i8 a[0:15], a0, a1, a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd4,0xd3,0x00,0x03,0x0a,0xfc]
+
+v_mfma_i32_32x32x8i8 v[0:15], v0, v1, v[2:17]
+// GFX90A: v_mfma_i32_32x32x8i8 v[0:15], v0, v1, v[2:17] ; encoding: [0x00,0x00,0xd4,0xd3,0x00,0x03,0x0a,0x04]
+
+v_mfma_i32_32x32x8i8 v[0:15], v0, v1, v[2:17] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_i32_32x32x8i8 v[0:15], v0, v1, v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd4,0xd3,0x00,0x03,0x0a,0xe4]
+
+v_mfma_i32_32x32x8i8 v[0:15], v0, a1, v[2:17]
+// GFX90A: v_mfma_i32_32x32x8i8 v[0:15], v0, a1, v[2:17] ; encoding: [0x00,0x00,0xd4,0xd3,0x00,0x03,0x0a,0x14]
+
+v_mfma_i32_32x32x8i8 v[0:15], v0, a1, v[2:17] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_i32_32x32x8i8 v[0:15], v0, a1, v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd4,0xd3,0x00,0x03,0x0a,0xf4]
+
+v_mfma_i32_32x32x8i8 v[0:15], a0, v1, v[2:17]
+// GFX90A: v_mfma_i32_32x32x8i8 v[0:15], a0, v1, v[2:17] ; encoding: [0x00,0x00,0xd4,0xd3,0x00,0x03,0x0a,0x0c]
+
+v_mfma_i32_32x32x8i8 v[0:15], a0, v1, v[2:17] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_i32_32x32x8i8 v[0:15], a0, v1, v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd4,0xd3,0x00,0x03,0x0a,0xec]
+
+v_mfma_i32_32x32x8i8 v[0:15], a0, a1, v[2:17]
+// GFX90A: v_mfma_i32_32x32x8i8 v[0:15], a0, a1, v[2:17] ; encoding: [0x00,0x00,0xd4,0xd3,0x00,0x03,0x0a,0x1c]
+
+v_mfma_i32_32x32x8i8 v[0:15], a0, a1, v[2:17] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_i32_32x32x8i8 v[0:15], a0, a1, v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd4,0xd3,0x00,0x03,0x0a,0xfc]
+
+v_mfma_i32_32x32x8i8 a[0:15], v0, v1, 2
+// GFX90A: v_mfma_i32_32x32x8i8 a[0:15], v0, v1, 2 ; encoding: [0x00,0x80,0xd4,0xd3,0x00,0x03,0x0a,0x02]
+
+v_mfma_i32_32x32x8i8 v[0:15], v0, v1, 2
+// GFX90A: v_mfma_i32_32x32x8i8 v[0:15], v0, v1, 2 ; encoding: [0x00,0x00,0xd4,0xd3,0x00,0x03,0x0a,0x02]
+
+v_mfma_i32_32x32x8i8 a[0:15], v0, v1, 2 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_i32_32x32x8i8 a[0:15], v0, v1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd4,0xd3,0x00,0x03,0x0a,0xe2]
+
+v_mfma_i32_32x32x8i8 v[0:15], v0, v1, 2 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_i32_32x32x8i8 v[0:15], v0, v1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd4,0xd3,0x00,0x03,0x0a,0xe2]
+
+v_mfma_i32_32x32x8i8 a[0:15], v0, a1, 2
+// GFX90A: v_mfma_i32_32x32x8i8 a[0:15], v0, a1, 2 ; encoding: [0x00,0x80,0xd4,0xd3,0x00,0x03,0x0a,0x12]
+
+v_mfma_i32_32x32x8i8 v[0:15], v0, a1, 2
+// GFX90A: v_mfma_i32_32x32x8i8 v[0:15], v0, a1, 2 ; encoding: [0x00,0x00,0xd4,0xd3,0x00,0x03,0x0a,0x12]
+
+v_mfma_i32_32x32x8i8 a[0:15], v0, a1, 2 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_i32_32x32x8i8 a[0:15], v0, a1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd4,0xd3,0x00,0x03,0x0a,0xf2]
+
+v_mfma_i32_32x32x8i8 v[0:15], v0, a1, 2 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_i32_32x32x8i8 v[0:15], v0, a1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd4,0xd3,0x00,0x03,0x0a,0xf2]
+
+v_mfma_i32_32x32x8i8 a[0:15], a0, v1, 2
+// GFX90A: v_mfma_i32_32x32x8i8 a[0:15], a0, v1, 2 ; encoding: [0x00,0x80,0xd4,0xd3,0x00,0x03,0x0a,0x0a]
+
+v_mfma_i32_32x32x8i8 v[0:15], a0, v1, 2
+// GFX90A: v_mfma_i32_32x32x8i8 v[0:15], a0, v1, 2 ; encoding: [0x00,0x00,0xd4,0xd3,0x00,0x03,0x0a,0x0a]
+
+v_mfma_i32_32x32x8i8 a[0:15], a0, v1, 2 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_i32_32x32x8i8 a[0:15], a0, v1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd4,0xd3,0x00,0x03,0x0a,0xea]
+
+v_mfma_i32_32x32x8i8 v[0:15], a0, v1, 2 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_i32_32x32x8i8 v[0:15], a0, v1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd4,0xd3,0x00,0x03,0x0a,0xea]
+
+v_mfma_i32_32x32x8i8 a[0:15], a0, a1, 2
+// GFX90A: v_mfma_i32_32x32x8i8 a[0:15], a0, a1, 2 ; encoding: [0x00,0x80,0xd4,0xd3,0x00,0x03,0x0a,0x1a]
+
+v_mfma_i32_32x32x8i8 v[0:15], a0, a1, 2
+// GFX90A: v_mfma_i32_32x32x8i8 v[0:15], a0, a1, 2 ; encoding: [0x00,0x00,0xd4,0xd3,0x00,0x03,0x0a,0x1a]
+
+v_mfma_i32_32x32x8i8 a[0:15], a0, a1, 2 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_i32_32x32x8i8 a[0:15], a0, a1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd4,0xd3,0x00,0x03,0x0a,0xfa]
+
+v_mfma_i32_32x32x8i8 v[0:15], a0, a1, 2 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_i32_32x32x8i8 v[0:15], a0, a1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd4,0xd3,0x00,0x03,0x0a,0xfa]
+
+v_mfma_i32_16x16x16i8 a[0:3], v0, v1, a[2:5]
+// GFX90A: v_mfma_i32_16x16x16i8 a[0:3], v0, v1, a[2:5] ; encoding: [0x00,0x80,0xd5,0xd3,0x00,0x03,0x0a,0x04]
+
+v_mfma_i32_16x16x16i8 a[0:3], v0, v1, a[2:5] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_i32_16x16x16i8 a[0:3], v0, v1, a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd5,0xd3,0x00,0x03,0x0a,0xe4]
+
+v_mfma_i32_16x16x16i8 a[0:3], v0, a1, a[2:5]
+// GFX90A: v_mfma_i32_16x16x16i8 a[0:3], v0, a1, a[2:5] ; encoding: [0x00,0x80,0xd5,0xd3,0x00,0x03,0x0a,0x14]
+
+v_mfma_i32_16x16x16i8 a[0:3], v0, a1, a[2:5] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_i32_16x16x16i8 a[0:3], v0, a1, a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd5,0xd3,0x00,0x03,0x0a,0xf4]
+
+v_mfma_i32_16x16x16i8 a[0:3], a0, v1, a[2:5]
+// GFX90A: v_mfma_i32_16x16x16i8 a[0:3], a0, v1, a[2:5] ; encoding: [0x00,0x80,0xd5,0xd3,0x00,0x03,0x0a,0x0c]
+
+v_mfma_i32_16x16x16i8 a[0:3], a0, v1, a[2:5] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_i32_16x16x16i8 a[0:3], a0, v1, a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd5,0xd3,0x00,0x03,0x0a,0xec]
+
+v_mfma_i32_16x16x16i8 a[0:3], a0, a1, a[2:5]
+// GFX90A: v_mfma_i32_16x16x16i8 a[0:3], a0, a1, a[2:5] ; encoding: [0x00,0x80,0xd5,0xd3,0x00,0x03,0x0a,0x1c]
+
+v_mfma_i32_16x16x16i8 a[0:3], a0, a1, a[2:5] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_i32_16x16x16i8 a[0:3], a0, a1, a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd5,0xd3,0x00,0x03,0x0a,0xfc]
+
+v_mfma_i32_16x16x16i8 v[0:3], v0, v1, v[2:5]
+// GFX90A: v_mfma_i32_16x16x16i8 v[0:3], v0, v1, v[2:5] ; encoding: [0x00,0x00,0xd5,0xd3,0x00,0x03,0x0a,0x04]
+
+v_mfma_i32_16x16x16i8 v[0:3], v0, v1, v[2:5] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_i32_16x16x16i8 v[0:3], v0, v1, v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd5,0xd3,0x00,0x03,0x0a,0xe4]
+
+v_mfma_i32_16x16x16i8 v[0:3], v0, a1, v[2:5]
+// GFX90A: v_mfma_i32_16x16x16i8 v[0:3], v0, a1, v[2:5] ; encoding: [0x00,0x00,0xd5,0xd3,0x00,0x03,0x0a,0x14]
+
+v_mfma_i32_16x16x16i8 v[0:3], v0, a1, v[2:5] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_i32_16x16x16i8 v[0:3], v0, a1, v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd5,0xd3,0x00,0x03,0x0a,0xf4]
+
+v_mfma_i32_16x16x16i8 v[0:3], a0, v1, v[2:5]
+// GFX90A: v_mfma_i32_16x16x16i8 v[0:3], a0, v1, v[2:5] ; encoding: [0x00,0x00,0xd5,0xd3,0x00,0x03,0x0a,0x0c]
+
+v_mfma_i32_16x16x16i8 v[0:3], a0, v1, v[2:5] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_i32_16x16x16i8 v[0:3], a0, v1, v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd5,0xd3,0x00,0x03,0x0a,0xec]
+
+v_mfma_i32_16x16x16i8 v[0:3], a0, a1, v[2:5]
+// GFX90A: v_mfma_i32_16x16x16i8 v[0:3], a0, a1, v[2:5] ; encoding: [0x00,0x00,0xd5,0xd3,0x00,0x03,0x0a,0x1c]
+
+v_mfma_i32_16x16x16i8 v[0:3], a0, a1, v[2:5] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_i32_16x16x16i8 v[0:3], a0, a1, v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd5,0xd3,0x00,0x03,0x0a,0xfc]
+
+v_mfma_i32_16x16x16i8 a[0:3], v0, v1, 2
+// GFX90A: v_mfma_i32_16x16x16i8 a[0:3], v0, v1, 2 ; encoding: [0x00,0x80,0xd5,0xd3,0x00,0x03,0x0a,0x02]
+
+v_mfma_i32_16x16x16i8 v[0:3], v0, v1, 2
+// GFX90A: v_mfma_i32_16x16x16i8 v[0:3], v0, v1, 2 ; encoding: [0x00,0x00,0xd5,0xd3,0x00,0x03,0x0a,0x02]
+
+v_mfma_i32_16x16x16i8 a[0:3], v0, v1, 2 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_i32_16x16x16i8 a[0:3], v0, v1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd5,0xd3,0x00,0x03,0x0a,0xe2]
+
+v_mfma_i32_16x16x16i8 v[0:3], v0, v1, 2 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_i32_16x16x16i8 v[0:3], v0, v1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd5,0xd3,0x00,0x03,0x0a,0xe2]
+
+v_mfma_i32_16x16x16i8 a[0:3], v0, a1, 2
+// GFX90A: v_mfma_i32_16x16x16i8 a[0:3], v0, a1, 2 ; encoding: [0x00,0x80,0xd5,0xd3,0x00,0x03,0x0a,0x12]
+
+v_mfma_i32_16x16x16i8 v[0:3], v0, a1, 2
+// GFX90A: v_mfma_i32_16x16x16i8 v[0:3], v0, a1, 2 ; encoding: [0x00,0x00,0xd5,0xd3,0x00,0x03,0x0a,0x12]
+
+v_mfma_i32_16x16x16i8 a[0:3], v0, a1, 2 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_i32_16x16x16i8 a[0:3], v0, a1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd5,0xd3,0x00,0x03,0x0a,0xf2]
+
+v_mfma_i32_16x16x16i8 v[0:3], v0, a1, 2 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_i32_16x16x16i8 v[0:3], v0, a1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd5,0xd3,0x00,0x03,0x0a,0xf2]
+
+v_mfma_i32_16x16x16i8 a[0:3], a0, v1, 2
+// GFX90A: v_mfma_i32_16x16x16i8 a[0:3], a0, v1, 2 ; encoding: [0x00,0x80,0xd5,0xd3,0x00,0x03,0x0a,0x0a]
+
+v_mfma_i32_16x16x16i8 v[0:3], a0, v1, 2
+// GFX90A: v_mfma_i32_16x16x16i8 v[0:3], a0, v1, 2 ; encoding: [0x00,0x00,0xd5,0xd3,0x00,0x03,0x0a,0x0a]
+
+v_mfma_i32_16x16x16i8 a[0:3], a0, v1, 2 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_i32_16x16x16i8 a[0:3], a0, v1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd5,0xd3,0x00,0x03,0x0a,0xea]
+
+v_mfma_i32_16x16x16i8 v[0:3], a0, v1, 2 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_i32_16x16x16i8 v[0:3], a0, v1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd5,0xd3,0x00,0x03,0x0a,0xea]
+
+v_mfma_i32_16x16x16i8 a[0:3], a0, a1, 2
+// GFX90A: v_mfma_i32_16x16x16i8 a[0:3], a0, a1, 2 ; encoding: [0x00,0x80,0xd5,0xd3,0x00,0x03,0x0a,0x1a]
+
+v_mfma_i32_16x16x16i8 v[0:3], a0, a1, 2
+// GFX90A: v_mfma_i32_16x16x16i8 v[0:3], a0, a1, 2 ; encoding: [0x00,0x00,0xd5,0xd3,0x00,0x03,0x0a,0x1a]
+
+v_mfma_i32_16x16x16i8 a[0:3], a0, a1, 2 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_i32_16x16x16i8 a[0:3], a0, a1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd5,0xd3,0x00,0x03,0x0a,0xfa]
+
+v_mfma_i32_16x16x16i8 v[0:3], a0, a1, 2 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_i32_16x16x16i8 v[0:3], a0, a1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd5,0xd3,0x00,0x03,0x0a,0xfa]
+
+v_mfma_f32_32x32x2bf16 a[0:31], v0, v1, a[2:33]
+// GFX90A: v_mfma_f32_32x32x2bf16 a[0:31], v0, v1, a[2:33] ; encoding: [0x00,0x80,0xe8,0xd3,0x00,0x03,0x0a,0x04]
+
+v_mfma_f32_32x32x2bf16 a[0:31], v0, v1, a[2:33] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_32x32x2bf16 a[0:31], v0, v1, a[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe8,0xd3,0x00,0x03,0x0a,0xe4]
+
+v_mfma_f32_32x32x2bf16 a[0:31], v0, a1, a[2:33]
+// GFX90A: v_mfma_f32_32x32x2bf16 a[0:31], v0, a1, a[2:33] ; encoding: [0x00,0x80,0xe8,0xd3,0x00,0x03,0x0a,0x14]
+
+v_mfma_f32_32x32x2bf16 a[0:31], v0, a1, a[2:33] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_32x32x2bf16 a[0:31], v0, a1, a[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe8,0xd3,0x00,0x03,0x0a,0xf4]
+
+v_mfma_f32_32x32x2bf16 a[0:31], a0, v1, a[2:33]
+// GFX90A: v_mfma_f32_32x32x2bf16 a[0:31], a0, v1, a[2:33] ; encoding: [0x00,0x80,0xe8,0xd3,0x00,0x03,0x0a,0x0c]
+
+v_mfma_f32_32x32x2bf16 a[0:31], a0, v1, a[2:33] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_32x32x2bf16 a[0:31], a0, v1, a[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe8,0xd3,0x00,0x03,0x0a,0xec]
+
+v_mfma_f32_32x32x2bf16 a[0:31], a0, a1, a[2:33]
+// GFX90A: v_mfma_f32_32x32x2bf16 a[0:31], a0, a1, a[2:33] ; encoding: [0x00,0x80,0xe8,0xd3,0x00,0x03,0x0a,0x1c]
+
+v_mfma_f32_32x32x2bf16 a[0:31], a0, a1, a[2:33] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_32x32x2bf16 a[0:31], a0, a1, a[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe8,0xd3,0x00,0x03,0x0a,0xfc]
+
+v_mfma_f32_32x32x2bf16 v[0:31], v0, v1, v[2:33]
+// GFX90A: v_mfma_f32_32x32x2bf16 v[0:31], v0, v1, v[2:33] ; encoding: [0x00,0x00,0xe8,0xd3,0x00,0x03,0x0a,0x04]
+
+v_mfma_f32_32x32x2bf16 v[0:31], v0, v1, v[2:33] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_32x32x2bf16 v[0:31], v0, v1, v[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe8,0xd3,0x00,0x03,0x0a,0xe4]
+
+v_mfma_f32_32x32x2bf16 v[0:31], v0, a1, v[2:33]
+// GFX90A: v_mfma_f32_32x32x2bf16 v[0:31], v0, a1, v[2:33] ; encoding: [0x00,0x00,0xe8,0xd3,0x00,0x03,0x0a,0x14]
+
+v_mfma_f32_32x32x2bf16 v[0:31], v0, a1, v[2:33] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_32x32x2bf16 v[0:31], v0, a1, v[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe8,0xd3,0x00,0x03,0x0a,0xf4]
+
+v_mfma_f32_32x32x2bf16 v[0:31], a0, v1, v[2:33]
+// GFX90A: v_mfma_f32_32x32x2bf16 v[0:31], a0, v1, v[2:33] ; encoding: [0x00,0x00,0xe8,0xd3,0x00,0x03,0x0a,0x0c]
+
+v_mfma_f32_32x32x2bf16 v[0:31], a0, v1, v[2:33] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_32x32x2bf16 v[0:31], a0, v1, v[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe8,0xd3,0x00,0x03,0x0a,0xec]
+
+v_mfma_f32_32x32x2bf16 v[0:31], a0, a1, v[2:33]
+// GFX90A: v_mfma_f32_32x32x2bf16 v[0:31], a0, a1, v[2:33] ; encoding: [0x00,0x00,0xe8,0xd3,0x00,0x03,0x0a,0x1c]
+
+v_mfma_f32_32x32x2bf16 v[0:31], a0, a1, v[2:33] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_32x32x2bf16 v[0:31], a0, a1, v[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe8,0xd3,0x00,0x03,0x0a,0xfc]
+
+v_mfma_f32_32x32x2bf16 a[0:31], v0, v1, -2.0
+// GFX90A: v_mfma_f32_32x32x2bf16 a[0:31], v0, v1, -2.0 ; encoding: [0x00,0x80,0xe8,0xd3,0x00,0x03,0xd6,0x03]
+
+v_mfma_f32_32x32x2bf16 v[0:31], v0, v1, -2.0
+// GFX90A: v_mfma_f32_32x32x2bf16 v[0:31], v0, v1, -2.0 ; encoding: [0x00,0x00,0xe8,0xd3,0x00,0x03,0xd6,0x03]
+
+v_mfma_f32_32x32x2bf16 a[0:31], v0, v1, -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_32x32x2bf16 a[0:31], v0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe8,0xd3,0x00,0x03,0xd6,0xe3]
+
+v_mfma_f32_32x32x2bf16 v[0:31], v0, v1, -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_32x32x2bf16 v[0:31], v0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe8,0xd3,0x00,0x03,0xd6,0xe3]
+
+v_mfma_f32_32x32x2bf16 a[0:31], v0, a1, -2.0
+// GFX90A: v_mfma_f32_32x32x2bf16 a[0:31], v0, a1, -2.0 ; encoding: [0x00,0x80,0xe8,0xd3,0x00,0x03,0xd6,0x13]
+
+v_mfma_f32_32x32x2bf16 v[0:31], v0, a1, -2.0
+// GFX90A: v_mfma_f32_32x32x2bf16 v[0:31], v0, a1, -2.0 ; encoding: [0x00,0x00,0xe8,0xd3,0x00,0x03,0xd6,0x13]
+
+v_mfma_f32_32x32x2bf16 a[0:31], v0, a1, -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_32x32x2bf16 a[0:31], v0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe8,0xd3,0x00,0x03,0xd6,0xf3]
+
+v_mfma_f32_32x32x2bf16 v[0:31], v0, a1, -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_32x32x2bf16 v[0:31], v0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe8,0xd3,0x00,0x03,0xd6,0xf3]
+
+v_mfma_f32_32x32x2bf16 a[0:31], a0, v1, -2.0
+// GFX90A: v_mfma_f32_32x32x2bf16 a[0:31], a0, v1, -2.0 ; encoding: [0x00,0x80,0xe8,0xd3,0x00,0x03,0xd6,0x0b]
+
+v_mfma_f32_32x32x2bf16 v[0:31], a0, v1, -2.0
+// GFX90A: v_mfma_f32_32x32x2bf16 v[0:31], a0, v1, -2.0 ; encoding: [0x00,0x00,0xe8,0xd3,0x00,0x03,0xd6,0x0b]
+
+v_mfma_f32_32x32x2bf16 a[0:31], a0, v1, -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_32x32x2bf16 a[0:31], a0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe8,0xd3,0x00,0x03,0xd6,0xeb]
+
+v_mfma_f32_32x32x2bf16 v[0:31], a0, v1, -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_32x32x2bf16 v[0:31], a0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe8,0xd3,0x00,0x03,0xd6,0xeb]
+
+v_mfma_f32_32x32x2bf16 a[0:31], a0, a1, -2.0
+// GFX90A: v_mfma_f32_32x32x2bf16 a[0:31], a0, a1, -2.0 ; encoding: [0x00,0x80,0xe8,0xd3,0x00,0x03,0xd6,0x1b]
+
+v_mfma_f32_32x32x2bf16 v[0:31], a0, a1, -2.0
+// GFX90A: v_mfma_f32_32x32x2bf16 v[0:31], a0, a1, -2.0 ; encoding: [0x00,0x00,0xe8,0xd3,0x00,0x03,0xd6,0x1b]
+
+v_mfma_f32_32x32x2bf16 a[0:31], a0, a1, -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_32x32x2bf16 a[0:31], a0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe8,0xd3,0x00,0x03,0xd6,0xfb]
+
+v_mfma_f32_32x32x2bf16 v[0:31], a0, a1, -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_32x32x2bf16 v[0:31], a0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe8,0xd3,0x00,0x03,0xd6,0xfb]
+
+v_mfma_f32_16x16x2bf16 a[0:15], v0, v1, a[2:17]
+// GFX90A: v_mfma_f32_16x16x2bf16 a[0:15], v0, v1, a[2:17] ; encoding: [0x00,0x80,0xe9,0xd3,0x00,0x03,0x0a,0x04]
+
+v_mfma_f32_16x16x2bf16 a[0:15], v0, v1, a[2:17] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_16x16x2bf16 a[0:15], v0, v1, a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe9,0xd3,0x00,0x03,0x0a,0xe4]
+
+v_mfma_f32_16x16x2bf16 a[0:15], v0, a1, a[2:17]
+// GFX90A: v_mfma_f32_16x16x2bf16 a[0:15], v0, a1, a[2:17] ; encoding: [0x00,0x80,0xe9,0xd3,0x00,0x03,0x0a,0x14]
+
+v_mfma_f32_16x16x2bf16 a[0:15], v0, a1, a[2:17] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_16x16x2bf16 a[0:15], v0, a1, a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe9,0xd3,0x00,0x03,0x0a,0xf4]
+
+v_mfma_f32_16x16x2bf16 a[0:15], a0, v1, a[2:17]
+// GFX90A: v_mfma_f32_16x16x2bf16 a[0:15], a0, v1, a[2:17] ; encoding: [0x00,0x80,0xe9,0xd3,0x00,0x03,0x0a,0x0c]
+
+v_mfma_f32_16x16x2bf16 a[0:15], a0, v1, a[2:17] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_16x16x2bf16 a[0:15], a0, v1, a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe9,0xd3,0x00,0x03,0x0a,0xec]
+
+v_mfma_f32_16x16x2bf16 a[0:15], a0, a1, a[2:17]
+// GFX90A: v_mfma_f32_16x16x2bf16 a[0:15], a0, a1, a[2:17] ; encoding: [0x00,0x80,0xe9,0xd3,0x00,0x03,0x0a,0x1c]
+
+v_mfma_f32_16x16x2bf16 a[0:15], a0, a1, a[2:17] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_16x16x2bf16 a[0:15], a0, a1, a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe9,0xd3,0x00,0x03,0x0a,0xfc]
+
+v_mfma_f32_16x16x2bf16 v[0:15], v0, v1, v[2:17]
+// GFX90A: v_mfma_f32_16x16x2bf16 v[0:15], v0, v1, v[2:17] ; encoding: [0x00,0x00,0xe9,0xd3,0x00,0x03,0x0a,0x04]
+
+v_mfma_f32_16x16x2bf16 v[0:15], v0, v1, v[2:17] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_16x16x2bf16 v[0:15], v0, v1, v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe9,0xd3,0x00,0x03,0x0a,0xe4]
+
+v_mfma_f32_16x16x2bf16 v[0:15], v0, a1, v[2:17]
+// GFX90A: v_mfma_f32_16x16x2bf16 v[0:15], v0, a1, v[2:17] ; encoding: [0x00,0x00,0xe9,0xd3,0x00,0x03,0x0a,0x14]
+
+v_mfma_f32_16x16x2bf16 v[0:15], v0, a1, v[2:17] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_16x16x2bf16 v[0:15], v0, a1, v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe9,0xd3,0x00,0x03,0x0a,0xf4]
+
+v_mfma_f32_16x16x2bf16 v[0:15], a0, v1, v[2:17]
+// GFX90A: v_mfma_f32_16x16x2bf16 v[0:15], a0, v1, v[2:17] ; encoding: [0x00,0x00,0xe9,0xd3,0x00,0x03,0x0a,0x0c]
+
+v_mfma_f32_16x16x2bf16 v[0:15], a0, v1, v[2:17] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_16x16x2bf16 v[0:15], a0, v1, v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe9,0xd3,0x00,0x03,0x0a,0xec]
+
+v_mfma_f32_16x16x2bf16 v[0:15], a0, a1, v[2:17]
+// GFX90A: v_mfma_f32_16x16x2bf16 v[0:15], a0, a1, v[2:17] ; encoding: [0x00,0x00,0xe9,0xd3,0x00,0x03,0x0a,0x1c]
+
+v_mfma_f32_16x16x2bf16 v[0:15], a0, a1, v[2:17] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_16x16x2bf16 v[0:15], a0, a1, v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe9,0xd3,0x00,0x03,0x0a,0xfc]
+
+v_mfma_f32_16x16x2bf16 a[0:15], v0, v1, -2.0
+// GFX90A: v_mfma_f32_16x16x2bf16 a[0:15], v0, v1, -2.0 ; encoding: [0x00,0x80,0xe9,0xd3,0x00,0x03,0xd6,0x03]
+
+v_mfma_f32_16x16x2bf16 v[0:15], v0, v1, -2.0
+// GFX90A: v_mfma_f32_16x16x2bf16 v[0:15], v0, v1, -2.0 ; encoding: [0x00,0x00,0xe9,0xd3,0x00,0x03,0xd6,0x03]
+
+v_mfma_f32_16x16x2bf16 a[0:15], v0, v1, -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_16x16x2bf16 a[0:15], v0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe9,0xd3,0x00,0x03,0xd6,0xe3]
+
+v_mfma_f32_16x16x2bf16 v[0:15], v0, v1, -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_16x16x2bf16 v[0:15], v0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe9,0xd3,0x00,0x03,0xd6,0xe3]
+
+v_mfma_f32_16x16x2bf16 a[0:15], v0, a1, -2.0
+// GFX90A: v_mfma_f32_16x16x2bf16 a[0:15], v0, a1, -2.0 ; encoding: [0x00,0x80,0xe9,0xd3,0x00,0x03,0xd6,0x13]
+
+v_mfma_f32_16x16x2bf16 v[0:15], v0, a1, -2.0
+// GFX90A: v_mfma_f32_16x16x2bf16 v[0:15], v0, a1, -2.0 ; encoding: [0x00,0x00,0xe9,0xd3,0x00,0x03,0xd6,0x13]
+
+v_mfma_f32_16x16x2bf16 a[0:15], v0, a1, -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_16x16x2bf16 a[0:15], v0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe9,0xd3,0x00,0x03,0xd6,0xf3]
+
+v_mfma_f32_16x16x2bf16 v[0:15], v0, a1, -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_16x16x2bf16 v[0:15], v0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe9,0xd3,0x00,0x03,0xd6,0xf3]
+
+v_mfma_f32_16x16x2bf16 a[0:15], a0, v1, -2.0
+// GFX90A: v_mfma_f32_16x16x2bf16 a[0:15], a0, v1, -2.0 ; encoding: [0x00,0x80,0xe9,0xd3,0x00,0x03,0xd6,0x0b]
+
+v_mfma_f32_16x16x2bf16 v[0:15], a0, v1, -2.0
+// GFX90A: v_mfma_f32_16x16x2bf16 v[0:15], a0, v1, -2.0 ; encoding: [0x00,0x00,0xe9,0xd3,0x00,0x03,0xd6,0x0b]
+
+v_mfma_f32_16x16x2bf16 a[0:15], a0, v1, -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_16x16x2bf16 a[0:15], a0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe9,0xd3,0x00,0x03,0xd6,0xeb]
+
+v_mfma_f32_16x16x2bf16 v[0:15], a0, v1, -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_16x16x2bf16 v[0:15], a0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe9,0xd3,0x00,0x03,0xd6,0xeb]
+
+v_mfma_f32_16x16x2bf16 a[0:15], a0, a1, -2.0
+// GFX90A: v_mfma_f32_16x16x2bf16 a[0:15], a0, a1, -2.0 ; encoding: [0x00,0x80,0xe9,0xd3,0x00,0x03,0xd6,0x1b]
+
+v_mfma_f32_16x16x2bf16 v[0:15], a0, a1, -2.0
+// GFX90A: v_mfma_f32_16x16x2bf16 v[0:15], a0, a1, -2.0 ; encoding: [0x00,0x00,0xe9,0xd3,0x00,0x03,0xd6,0x1b]
+
+v_mfma_f32_16x16x2bf16 a[0:15], a0, a1, -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_16x16x2bf16 a[0:15], a0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe9,0xd3,0x00,0x03,0xd6,0xfb]
+
+v_mfma_f32_16x16x2bf16 v[0:15], a0, a1, -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_16x16x2bf16 v[0:15], a0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe9,0xd3,0x00,0x03,0xd6,0xfb]
+
+v_mfma_f32_4x4x2bf16 a[0:3], v0, v1, a[2:5]
+// GFX90A: v_mfma_f32_4x4x2bf16 a[0:3], v0, v1, a[2:5] ; encoding: [0x00,0x80,0xeb,0xd3,0x00,0x03,0x0a,0x04]
+
+v_mfma_f32_4x4x2bf16 a[0:3], v0, v1, a[2:5] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_4x4x2bf16 a[0:3], v0, v1, a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xeb,0xd3,0x00,0x03,0x0a,0xe4]
+
+v_mfma_f32_4x4x2bf16 a[0:3], v0, a1, a[2:5]
+// GFX90A: v_mfma_f32_4x4x2bf16 a[0:3], v0, a1, a[2:5] ; encoding: [0x00,0x80,0xeb,0xd3,0x00,0x03,0x0a,0x14]
+
+v_mfma_f32_4x4x2bf16 a[0:3], v0, a1, a[2:5] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_4x4x2bf16 a[0:3], v0, a1, a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xeb,0xd3,0x00,0x03,0x0a,0xf4]
+
+v_mfma_f32_4x4x2bf16 a[0:3], a0, v1, a[2:5]
+// GFX90A: v_mfma_f32_4x4x2bf16 a[0:3], a0, v1, a[2:5] ; encoding: [0x00,0x80,0xeb,0xd3,0x00,0x03,0x0a,0x0c]
+
+v_mfma_f32_4x4x2bf16 a[0:3], a0, v1, a[2:5] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_4x4x2bf16 a[0:3], a0, v1, a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xeb,0xd3,0x00,0x03,0x0a,0xec]
+
+v_mfma_f32_4x4x2bf16 a[0:3], a0, a1, a[2:5]
+// GFX90A: v_mfma_f32_4x4x2bf16 a[0:3], a0, a1, a[2:5] ; encoding: [0x00,0x80,0xeb,0xd3,0x00,0x03,0x0a,0x1c]
+
+v_mfma_f32_4x4x2bf16 a[0:3], a0, a1, a[2:5] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_4x4x2bf16 a[0:3], a0, a1, a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xeb,0xd3,0x00,0x03,0x0a,0xfc]
+
+v_mfma_f32_4x4x2bf16 v[0:3], v0, v1, v[2:5]
+// GFX90A: v_mfma_f32_4x4x2bf16 v[0:3], v0, v1, v[2:5] ; encoding: [0x00,0x00,0xeb,0xd3,0x00,0x03,0x0a,0x04]
+
+v_mfma_f32_4x4x2bf16 v[0:3], v0, v1, v[2:5] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_4x4x2bf16 v[0:3], v0, v1, v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xeb,0xd3,0x00,0x03,0x0a,0xe4]
+
+v_mfma_f32_4x4x2bf16 v[0:3], v0, a1, v[2:5]
+// GFX90A: v_mfma_f32_4x4x2bf16 v[0:3], v0, a1, v[2:5] ; encoding: [0x00,0x00,0xeb,0xd3,0x00,0x03,0x0a,0x14]
+
+v_mfma_f32_4x4x2bf16 v[0:3], v0, a1, v[2:5] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_4x4x2bf16 v[0:3], v0, a1, v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xeb,0xd3,0x00,0x03,0x0a,0xf4]
+
+v_mfma_f32_4x4x2bf16 v[0:3], a0, v1, v[2:5]
+// GFX90A: v_mfma_f32_4x4x2bf16 v[0:3], a0, v1, v[2:5] ; encoding: [0x00,0x00,0xeb,0xd3,0x00,0x03,0x0a,0x0c]
+
+v_mfma_f32_4x4x2bf16 v[0:3], a0, v1, v[2:5] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_4x4x2bf16 v[0:3], a0, v1, v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xeb,0xd3,0x00,0x03,0x0a,0xec]
+
+v_mfma_f32_4x4x2bf16 v[0:3], a0, a1, v[2:5]
+// GFX90A: v_mfma_f32_4x4x2bf16 v[0:3], a0, a1, v[2:5] ; encoding: [0x00,0x00,0xeb,0xd3,0x00,0x03,0x0a,0x1c]
+
+v_mfma_f32_4x4x2bf16 v[0:3], a0, a1, v[2:5] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_4x4x2bf16 v[0:3], a0, a1, v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xeb,0xd3,0x00,0x03,0x0a,0xfc]
+
+v_mfma_f32_4x4x2bf16 a[0:3], v0, v1, -2.0
+// GFX90A: v_mfma_f32_4x4x2bf16 a[0:3], v0, v1, -2.0 ; encoding: [0x00,0x80,0xeb,0xd3,0x00,0x03,0xd6,0x03]
+
+v_mfma_f32_4x4x2bf16 v[0:3], v0, v1, -2.0
+// GFX90A: v_mfma_f32_4x4x2bf16 v[0:3], v0, v1, -2.0 ; encoding: [0x00,0x00,0xeb,0xd3,0x00,0x03,0xd6,0x03]
+
+v_mfma_f32_4x4x2bf16 a[0:3], v0, v1, -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_4x4x2bf16 a[0:3], v0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xeb,0xd3,0x00,0x03,0xd6,0xe3]
+
+v_mfma_f32_4x4x2bf16 v[0:3], v0, v1, -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_4x4x2bf16 v[0:3], v0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xeb,0xd3,0x00,0x03,0xd6,0xe3]
+
+v_mfma_f32_4x4x2bf16 a[0:3], v0, a1, -2.0
+// GFX90A: v_mfma_f32_4x4x2bf16 a[0:3], v0, a1, -2.0 ; encoding: [0x00,0x80,0xeb,0xd3,0x00,0x03,0xd6,0x13]
+
+v_mfma_f32_4x4x2bf16 v[0:3], v0, a1, -2.0
+// GFX90A: v_mfma_f32_4x4x2bf16 v[0:3], v0, a1, -2.0 ; encoding: [0x00,0x00,0xeb,0xd3,0x00,0x03,0xd6,0x13]
+
+v_mfma_f32_4x4x2bf16 a[0:3], v0, a1, -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_4x4x2bf16 a[0:3], v0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xeb,0xd3,0x00,0x03,0xd6,0xf3]
+
+v_mfma_f32_4x4x2bf16 v[0:3], v0, a1, -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_4x4x2bf16 v[0:3], v0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xeb,0xd3,0x00,0x03,0xd6,0xf3]
+
+v_mfma_f32_4x4x2bf16 a[0:3], a0, v1, -2.0
+// GFX90A: v_mfma_f32_4x4x2bf16 a[0:3], a0, v1, -2.0 ; encoding: [0x00,0x80,0xeb,0xd3,0x00,0x03,0xd6,0x0b]
+
+v_mfma_f32_4x4x2bf16 v[0:3], a0, v1, -2.0
+// GFX90A: v_mfma_f32_4x4x2bf16 v[0:3], a0, v1, -2.0 ; encoding: [0x00,0x00,0xeb,0xd3,0x00,0x03,0xd6,0x0b]
+
+v_mfma_f32_4x4x2bf16 a[0:3], a0, v1, -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_4x4x2bf16 a[0:3], a0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xeb,0xd3,0x00,0x03,0xd6,0xeb]
+
+v_mfma_f32_4x4x2bf16 v[0:3], a0, v1, -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_4x4x2bf16 v[0:3], a0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xeb,0xd3,0x00,0x03,0xd6,0xeb]
+
+v_mfma_f32_4x4x2bf16 a[0:3], a0, a1, -2.0
+// GFX90A: v_mfma_f32_4x4x2bf16 a[0:3], a0, a1, -2.0 ; encoding: [0x00,0x80,0xeb,0xd3,0x00,0x03,0xd6,0x1b]
+
+v_mfma_f32_4x4x2bf16 v[0:3], a0, a1, -2.0
+// GFX90A: v_mfma_f32_4x4x2bf16 v[0:3], a0, a1, -2.0 ; encoding: [0x00,0x00,0xeb,0xd3,0x00,0x03,0xd6,0x1b]
+
+v_mfma_f32_4x4x2bf16 a[0:3], a0, a1, -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_4x4x2bf16 a[0:3], a0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xeb,0xd3,0x00,0x03,0xd6,0xfb]
+
+v_mfma_f32_4x4x2bf16 v[0:3], a0, a1, -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_4x4x2bf16 v[0:3], a0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xeb,0xd3,0x00,0x03,0xd6,0xfb]
+
+v_mfma_f32_32x32x4bf16 a[0:15], v0, v1, a[2:17]
+// GFX90A: v_mfma_f32_32x32x4bf16 a[0:15], v0, v1, a[2:17] ; encoding: [0x00,0x80,0xec,0xd3,0x00,0x03,0x0a,0x04]
+
+v_mfma_f32_32x32x4bf16 a[0:15], v0, v1, a[2:17] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_32x32x4bf16 a[0:15], v0, v1, a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xec,0xd3,0x00,0x03,0x0a,0xe4]
+
+v_mfma_f32_32x32x4bf16 a[0:15], v0, a1, a[2:17]
+// GFX90A: v_mfma_f32_32x32x4bf16 a[0:15], v0, a1, a[2:17] ; encoding: [0x00,0x80,0xec,0xd3,0x00,0x03,0x0a,0x14]
+
+v_mfma_f32_32x32x4bf16 a[0:15], v0, a1, a[2:17] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_32x32x4bf16 a[0:15], v0, a1, a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xec,0xd3,0x00,0x03,0x0a,0xf4]
+
+v_mfma_f32_32x32x4bf16 a[0:15], a0, v1, a[2:17]
+// GFX90A: v_mfma_f32_32x32x4bf16 a[0:15], a0, v1, a[2:17] ; encoding: [0x00,0x80,0xec,0xd3,0x00,0x03,0x0a,0x0c]
+
+v_mfma_f32_32x32x4bf16 a[0:15], a0, v1, a[2:17] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_32x32x4bf16 a[0:15], a0, v1, a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xec,0xd3,0x00,0x03,0x0a,0xec]
+
+v_mfma_f32_32x32x4bf16 a[0:15], a0, a1, a[2:17]
+// GFX90A: v_mfma_f32_32x32x4bf16 a[0:15], a0, a1, a[2:17] ; encoding: [0x00,0x80,0xec,0xd3,0x00,0x03,0x0a,0x1c]
+
+v_mfma_f32_32x32x4bf16 a[0:15], a0, a1, a[2:17] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_32x32x4bf16 a[0:15], a0, a1, a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xec,0xd3,0x00,0x03,0x0a,0xfc]
+
+v_mfma_f32_32x32x4bf16 v[0:15], v0, v1, v[2:17]
+// GFX90A: v_mfma_f32_32x32x4bf16 v[0:15], v0, v1, v[2:17] ; encoding: [0x00,0x00,0xec,0xd3,0x00,0x03,0x0a,0x04]
+
+v_mfma_f32_32x32x4bf16 v[0:15], v0, v1, v[2:17] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_32x32x4bf16 v[0:15], v0, v1, v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xec,0xd3,0x00,0x03,0x0a,0xe4]
+
+v_mfma_f32_32x32x4bf16 v[0:15], v0, a1, v[2:17]
+// GFX90A: v_mfma_f32_32x32x4bf16 v[0:15], v0, a1, v[2:17] ; encoding: [0x00,0x00,0xec,0xd3,0x00,0x03,0x0a,0x14]
+
+v_mfma_f32_32x32x4bf16 v[0:15], v0, a1, v[2:17] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_32x32x4bf16 v[0:15], v0, a1, v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xec,0xd3,0x00,0x03,0x0a,0xf4]
+
+v_mfma_f32_32x32x4bf16 v[0:15], a0, v1, v[2:17]
+// GFX90A: v_mfma_f32_32x32x4bf16 v[0:15], a0, v1, v[2:17] ; encoding: [0x00,0x00,0xec,0xd3,0x00,0x03,0x0a,0x0c]
+
+v_mfma_f32_32x32x4bf16 v[0:15], a0, v1, v[2:17] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_32x32x4bf16 v[0:15], a0, v1, v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xec,0xd3,0x00,0x03,0x0a,0xec]
+
+v_mfma_f32_32x32x4bf16 v[0:15], a0, a1, v[2:17]
+// GFX90A: v_mfma_f32_32x32x4bf16 v[0:15], a0, a1, v[2:17] ; encoding: [0x00,0x00,0xec,0xd3,0x00,0x03,0x0a,0x1c]
+
+v_mfma_f32_32x32x4bf16 v[0:15], a0, a1, v[2:17] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_32x32x4bf16 v[0:15], a0, a1, v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xec,0xd3,0x00,0x03,0x0a,0xfc]
+
+v_mfma_f32_32x32x4bf16 a[0:15], v0, v1, -2.0
+// GFX90A: v_mfma_f32_32x32x4bf16 a[0:15], v0, v1, -2.0 ; encoding: [0x00,0x80,0xec,0xd3,0x00,0x03,0xd6,0x03]
+
+v_mfma_f32_32x32x4bf16 v[0:15], v0, v1, -2.0
+// GFX90A: v_mfma_f32_32x32x4bf16 v[0:15], v0, v1, -2.0 ; encoding: [0x00,0x00,0xec,0xd3,0x00,0x03,0xd6,0x03]
+
+v_mfma_f32_32x32x4bf16 a[0:15], v0, v1, -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_32x32x4bf16 a[0:15], v0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xec,0xd3,0x00,0x03,0xd6,0xe3]
+
+v_mfma_f32_32x32x4bf16 v[0:15], v0, v1, -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_32x32x4bf16 v[0:15], v0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xec,0xd3,0x00,0x03,0xd6,0xe3]
+
+v_mfma_f32_32x32x4bf16 a[0:15], v0, a1, -2.0
+// GFX90A: v_mfma_f32_32x32x4bf16 a[0:15], v0, a1, -2.0 ; encoding: [0x00,0x80,0xec,0xd3,0x00,0x03,0xd6,0x13]
+
+v_mfma_f32_32x32x4bf16 v[0:15], v0, a1, -2.0
+// GFX90A: v_mfma_f32_32x32x4bf16 v[0:15], v0, a1, -2.0 ; encoding: [0x00,0x00,0xec,0xd3,0x00,0x03,0xd6,0x13]
+
+v_mfma_f32_32x32x4bf16 a[0:15], v0, a1, -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_32x32x4bf16 a[0:15], v0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xec,0xd3,0x00,0x03,0xd6,0xf3]
+
+v_mfma_f32_32x32x4bf16 v[0:15], v0, a1, -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_32x32x4bf16 v[0:15], v0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xec,0xd3,0x00,0x03,0xd6,0xf3]
+
+v_mfma_f32_32x32x4bf16 a[0:15], a0, v1, -2.0
+// GFX90A: v_mfma_f32_32x32x4bf16 a[0:15], a0, v1, -2.0 ; encoding: [0x00,0x80,0xec,0xd3,0x00,0x03,0xd6,0x0b]
+
+v_mfma_f32_32x32x4bf16 v[0:15], a0, v1, -2.0
+// GFX90A: v_mfma_f32_32x32x4bf16 v[0:15], a0, v1, -2.0 ; encoding: [0x00,0x00,0xec,0xd3,0x00,0x03,0xd6,0x0b]
+
+v_mfma_f32_32x32x4bf16 a[0:15], a0, v1, -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_32x32x4bf16 a[0:15], a0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xec,0xd3,0x00,0x03,0xd6,0xeb]
+
+v_mfma_f32_32x32x4bf16 v[0:15], a0, v1, -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_32x32x4bf16 v[0:15], a0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xec,0xd3,0x00,0x03,0xd6,0xeb]
+
+v_mfma_f32_32x32x4bf16 a[0:15], a0, a1, -2.0
+// GFX90A: v_mfma_f32_32x32x4bf16 a[0:15], a0, a1, -2.0 ; encoding: [0x00,0x80,0xec,0xd3,0x00,0x03,0xd6,0x1b]
+
+v_mfma_f32_32x32x4bf16 v[0:15], a0, a1, -2.0
+// GFX90A: v_mfma_f32_32x32x4bf16 v[0:15], a0, a1, -2.0 ; encoding: [0x00,0x00,0xec,0xd3,0x00,0x03,0xd6,0x1b]
+
+v_mfma_f32_32x32x4bf16 a[0:15], a0, a1, -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_32x32x4bf16 a[0:15], a0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xec,0xd3,0x00,0x03,0xd6,0xfb]
+
+v_mfma_f32_32x32x4bf16 v[0:15], a0, a1, -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_32x32x4bf16 v[0:15], a0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xec,0xd3,0x00,0x03,0xd6,0xfb]
+
+v_mfma_f32_16x16x8bf16 a[0:3], v0, v1, a[2:5]
+// GFX90A: v_mfma_f32_16x16x8bf16 a[0:3], v0, v1, a[2:5] ; encoding: [0x00,0x80,0xed,0xd3,0x00,0x03,0x0a,0x04]
+
+v_mfma_f32_16x16x8bf16 a[0:3], v0, v1, a[2:5] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_16x16x8bf16 a[0:3], v0, v1, a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xed,0xd3,0x00,0x03,0x0a,0xe4]
+
+v_mfma_f32_16x16x8bf16 a[0:3], v0, a1, a[2:5]
+// GFX90A: v_mfma_f32_16x16x8bf16 a[0:3], v0, a1, a[2:5] ; encoding: [0x00,0x80,0xed,0xd3,0x00,0x03,0x0a,0x14]
+
+v_mfma_f32_16x16x8bf16 a[0:3], v0, a1, a[2:5] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_16x16x8bf16 a[0:3], v0, a1, a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xed,0xd3,0x00,0x03,0x0a,0xf4]
+
+v_mfma_f32_16x16x8bf16 a[0:3], a0, v1, a[2:5]
+// GFX90A: v_mfma_f32_16x16x8bf16 a[0:3], a0, v1, a[2:5] ; encoding: [0x00,0x80,0xed,0xd3,0x00,0x03,0x0a,0x0c]
+
+v_mfma_f32_16x16x8bf16 a[0:3], a0, v1, a[2:5] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_16x16x8bf16 a[0:3], a0, v1, a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xed,0xd3,0x00,0x03,0x0a,0xec]
+
+v_mfma_f32_16x16x8bf16 a[0:3], a0, a1, a[2:5]
+// GFX90A: v_mfma_f32_16x16x8bf16 a[0:3], a0, a1, a[2:5] ; encoding: [0x00,0x80,0xed,0xd3,0x00,0x03,0x0a,0x1c]
+
+v_mfma_f32_16x16x8bf16 a[0:3], a0, a1, a[2:5] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_16x16x8bf16 a[0:3], a0, a1, a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xed,0xd3,0x00,0x03,0x0a,0xfc]
+
+v_mfma_f32_16x16x8bf16 v[0:3], v0, v1, v[2:5]
+// GFX90A: v_mfma_f32_16x16x8bf16 v[0:3], v0, v1, v[2:5] ; encoding: [0x00,0x00,0xed,0xd3,0x00,0x03,0x0a,0x04]
+
+v_mfma_f32_16x16x8bf16 v[0:3], v0, v1, v[2:5] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_16x16x8bf16 v[0:3], v0, v1, v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xed,0xd3,0x00,0x03,0x0a,0xe4]
+
+v_mfma_f32_16x16x8bf16 v[0:3], v0, a1, v[2:5]
+// GFX90A: v_mfma_f32_16x16x8bf16 v[0:3], v0, a1, v[2:5] ; encoding: [0x00,0x00,0xed,0xd3,0x00,0x03,0x0a,0x14]
+
+v_mfma_f32_16x16x8bf16 v[0:3], v0, a1, v[2:5] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_16x16x8bf16 v[0:3], v0, a1, v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xed,0xd3,0x00,0x03,0x0a,0xf4]
+
+v_mfma_f32_16x16x8bf16 v[0:3], a0, v1, v[2:5]
+// GFX90A: v_mfma_f32_16x16x8bf16 v[0:3], a0, v1, v[2:5] ; encoding: [0x00,0x00,0xed,0xd3,0x00,0x03,0x0a,0x0c]
+
+v_mfma_f32_16x16x8bf16 v[0:3], a0, v1, v[2:5] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_16x16x8bf16 v[0:3], a0, v1, v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xed,0xd3,0x00,0x03,0x0a,0xec]
+
+v_mfma_f32_16x16x8bf16 v[0:3], a0, a1, v[2:5]
+// GFX90A: v_mfma_f32_16x16x8bf16 v[0:3], a0, a1, v[2:5] ; encoding: [0x00,0x00,0xed,0xd3,0x00,0x03,0x0a,0x1c]
+
+v_mfma_f32_16x16x8bf16 v[0:3], a0, a1, v[2:5] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_16x16x8bf16 v[0:3], a0, a1, v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xed,0xd3,0x00,0x03,0x0a,0xfc]
+
+v_mfma_f32_16x16x8bf16 a[0:3], v0, v1, -2.0
+// GFX90A: v_mfma_f32_16x16x8bf16 a[0:3], v0, v1, -2.0 ; encoding: [0x00,0x80,0xed,0xd3,0x00,0x03,0xd6,0x03]
+
+v_mfma_f32_16x16x8bf16 v[0:3], v0, v1, -2.0
+// GFX90A: v_mfma_f32_16x16x8bf16 v[0:3], v0, v1, -2.0 ; encoding: [0x00,0x00,0xed,0xd3,0x00,0x03,0xd6,0x03]
+
+v_mfma_f32_16x16x8bf16 a[0:3], v0, v1, -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_16x16x8bf16 a[0:3], v0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xed,0xd3,0x00,0x03,0xd6,0xe3]
+
+v_mfma_f32_16x16x8bf16 v[0:3], v0, v1, -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_16x16x8bf16 v[0:3], v0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xed,0xd3,0x00,0x03,0xd6,0xe3]
+
+v_mfma_f32_16x16x8bf16 a[0:3], v0, a1, -2.0
+// GFX90A: v_mfma_f32_16x16x8bf16 a[0:3], v0, a1, -2.0 ; encoding: [0x00,0x80,0xed,0xd3,0x00,0x03,0xd6,0x13]
+
+v_mfma_f32_16x16x8bf16 v[0:3], v0, a1, -2.0
+// GFX90A: v_mfma_f32_16x16x8bf16 v[0:3], v0, a1, -2.0 ; encoding: [0x00,0x00,0xed,0xd3,0x00,0x03,0xd6,0x13]
+
+v_mfma_f32_16x16x8bf16 a[0:3], v0, a1, -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_16x16x8bf16 a[0:3], v0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xed,0xd3,0x00,0x03,0xd6,0xf3]
+
+v_mfma_f32_16x16x8bf16 v[0:3], v0, a1, -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_16x16x8bf16 v[0:3], v0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xed,0xd3,0x00,0x03,0xd6,0xf3]
+
+v_mfma_f32_16x16x8bf16 a[0:3], a0, v1, -2.0
+// GFX90A: v_mfma_f32_16x16x8bf16 a[0:3], a0, v1, -2.0 ; encoding: [0x00,0x80,0xed,0xd3,0x00,0x03,0xd6,0x0b]
+
+v_mfma_f32_16x16x8bf16 v[0:3], a0, v1, -2.0
+// GFX90A: v_mfma_f32_16x16x8bf16 v[0:3], a0, v1, -2.0 ; encoding: [0x00,0x00,0xed,0xd3,0x00,0x03,0xd6,0x0b]
+
+v_mfma_f32_16x16x8bf16 a[0:3], a0, v1, -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_16x16x8bf16 a[0:3], a0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xed,0xd3,0x00,0x03,0xd6,0xeb]
+
+v_mfma_f32_16x16x8bf16 v[0:3], a0, v1, -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_16x16x8bf16 v[0:3], a0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xed,0xd3,0x00,0x03,0xd6,0xeb]
+
+v_mfma_f32_16x16x8bf16 a[0:3], a0, a1, -2.0
+// GFX90A: v_mfma_f32_16x16x8bf16 a[0:3], a0, a1, -2.0 ; encoding: [0x00,0x80,0xed,0xd3,0x00,0x03,0xd6,0x1b]
+
+v_mfma_f32_16x16x8bf16 v[0:3], a0, a1, -2.0
+// GFX90A: v_mfma_f32_16x16x8bf16 v[0:3], a0, a1, -2.0 ; encoding: [0x00,0x00,0xed,0xd3,0x00,0x03,0xd6,0x1b]
+
+v_mfma_f32_16x16x8bf16 a[0:3], a0, a1, -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_16x16x8bf16 a[0:3], a0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xed,0xd3,0x00,0x03,0xd6,0xfb]
+
+v_mfma_f32_16x16x8bf16 v[0:3], a0, a1, -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_16x16x8bf16 v[0:3], a0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xed,0xd3,0x00,0x03,0xd6,0xfb]
+
+v_mfma_f32_32x32x4bf16_1k a[0:31], v[0:1], v[2:3], a[2:33]
+// GFX90A: v_mfma_f32_32x32x4bf16_1k a[0:31], v[0:1], v[2:3], a[2:33] ; encoding: [0x00,0x80,0xe3,0xd3,0x00,0x05,0x0a,0x04]
+
+v_mfma_f32_32x32x4bf16_1k a[0:31], v[0:1], v[2:3], a[2:33] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_32x32x4bf16_1k a[0:31], v[0:1], v[2:3], a[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe3,0xd3,0x00,0x05,0x0a,0xe4]
+
+v_mfma_f32_32x32x4bf16_1k a[0:31], v[0:1], a[2:3], a[2:33]
+// GFX90A: v_mfma_f32_32x32x4bf16_1k a[0:31], v[0:1], a[2:3], a[2:33] ; encoding: [0x00,0x80,0xe3,0xd3,0x00,0x05,0x0a,0x14]
+
+v_mfma_f32_32x32x4bf16_1k a[0:31], v[0:1], a[2:3], a[2:33] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_32x32x4bf16_1k a[0:31], v[0:1], a[2:3], a[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe3,0xd3,0x00,0x05,0x0a,0xf4]
+
+v_mfma_f32_32x32x4bf16_1k a[0:31], a[0:1], v[2:3], a[2:33]
+// GFX90A: v_mfma_f32_32x32x4bf16_1k a[0:31], a[0:1], v[2:3], a[2:33] ; encoding: [0x00,0x80,0xe3,0xd3,0x00,0x05,0x0a,0x0c]
+
+v_mfma_f32_32x32x4bf16_1k a[0:31], a[0:1], v[2:3], a[2:33] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_32x32x4bf16_1k a[0:31], a[0:1], v[2:3], a[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe3,0xd3,0x00,0x05,0x0a,0xec]
+
+v_mfma_f32_32x32x4bf16_1k a[0:31], a[0:1], a[2:3], a[2:33]
+// GFX90A: v_mfma_f32_32x32x4bf16_1k a[0:31], a[0:1], a[2:3], a[2:33] ; encoding: [0x00,0x80,0xe3,0xd3,0x00,0x05,0x0a,0x1c]
+
+v_mfma_f32_32x32x4bf16_1k a[0:31], a[0:1], a[2:3], a[2:33] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_32x32x4bf16_1k a[0:31], a[0:1], a[2:3], a[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe3,0xd3,0x00,0x05,0x0a,0xfc]
+
+v_mfma_f32_32x32x4bf16_1k v[0:31], v[0:1], v[2:3], v[2:33]
+// GFX90A: v_mfma_f32_32x32x4bf16_1k v[0:31], v[0:1], v[2:3], v[2:33] ; encoding: [0x00,0x00,0xe3,0xd3,0x00,0x05,0x0a,0x04]
+
+v_mfma_f32_32x32x4bf16_1k v[0:31], v[0:1], v[2:3], v[2:33] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_32x32x4bf16_1k v[0:31], v[0:1], v[2:3], v[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe3,0xd3,0x00,0x05,0x0a,0xe4]
+
+v_mfma_f32_32x32x4bf16_1k v[0:31], v[0:1], a[2:3], v[2:33]
+// GFX90A: v_mfma_f32_32x32x4bf16_1k v[0:31], v[0:1], a[2:3], v[2:33] ; encoding: [0x00,0x00,0xe3,0xd3,0x00,0x05,0x0a,0x14]
+
+v_mfma_f32_32x32x4bf16_1k v[0:31], v[0:1], a[2:3], v[2:33] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_32x32x4bf16_1k v[0:31], v[0:1], a[2:3], v[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe3,0xd3,0x00,0x05,0x0a,0xf4]
+
+v_mfma_f32_32x32x4bf16_1k v[0:31], a[0:1], v[2:3], v[2:33]
+// GFX90A: v_mfma_f32_32x32x4bf16_1k v[0:31], a[0:1], v[2:3], v[2:33] ; encoding: [0x00,0x00,0xe3,0xd3,0x00,0x05,0x0a,0x0c]
+
+v_mfma_f32_32x32x4bf16_1k v[0:31], a[0:1], v[2:3], v[2:33] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_32x32x4bf16_1k v[0:31], a[0:1], v[2:3], v[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe3,0xd3,0x00,0x05,0x0a,0xec]
+
+v_mfma_f32_32x32x4bf16_1k v[0:31], a[0:1], a[2:3], v[2:33]
+// GFX90A: v_mfma_f32_32x32x4bf16_1k v[0:31], a[0:1], a[2:3], v[2:33] ; encoding: [0x00,0x00,0xe3,0xd3,0x00,0x05,0x0a,0x1c]
+
+v_mfma_f32_32x32x4bf16_1k v[0:31], a[0:1], a[2:3], v[2:33] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_32x32x4bf16_1k v[0:31], a[0:1], a[2:3], v[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe3,0xd3,0x00,0x05,0x0a,0xfc]
+
+v_mfma_f32_32x32x4bf16_1k a[0:31], v[0:1], v[2:3], -2.0
+// GFX90A: v_mfma_f32_32x32x4bf16_1k a[0:31], v[0:1], v[2:3], -2.0 ; encoding: [0x00,0x80,0xe3,0xd3,0x00,0x05,0xd6,0x03]
+
+v_mfma_f32_32x32x4bf16_1k v[0:31], v[0:1], v[2:3], -2.0
+// GFX90A: v_mfma_f32_32x32x4bf16_1k v[0:31], v[0:1], v[2:3], -2.0 ; encoding: [0x00,0x00,0xe3,0xd3,0x00,0x05,0xd6,0x03]
+
+v_mfma_f32_32x32x4bf16_1k a[0:31], v[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_32x32x4bf16_1k a[0:31], v[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe3,0xd3,0x00,0x05,0xd6,0xe3]
+
+v_mfma_f32_32x32x4bf16_1k v[0:31], v[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_32x32x4bf16_1k v[0:31], v[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe3,0xd3,0x00,0x05,0xd6,0xe3]
+
+v_mfma_f32_32x32x4bf16_1k a[0:31], v[0:1], a[2:3], -2.0
+// GFX90A: v_mfma_f32_32x32x4bf16_1k a[0:31], v[0:1], a[2:3], -2.0 ; encoding: [0x00,0x80,0xe3,0xd3,0x00,0x05,0xd6,0x13]
+
+v_mfma_f32_32x32x4bf16_1k v[0:31], v[0:1], a[2:3], -2.0
+// GFX90A: v_mfma_f32_32x32x4bf16_1k v[0:31], v[0:1], a[2:3], -2.0 ; encoding: [0x00,0x00,0xe3,0xd3,0x00,0x05,0xd6,0x13]
+
+v_mfma_f32_32x32x4bf16_1k a[0:31], v[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_32x32x4bf16_1k a[0:31], v[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe3,0xd3,0x00,0x05,0xd6,0xf3]
+
+v_mfma_f32_32x32x4bf16_1k v[0:31], v[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_32x32x4bf16_1k v[0:31], v[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe3,0xd3,0x00,0x05,0xd6,0xf3]
+
+v_mfma_f32_32x32x4bf16_1k a[0:31], a[0:1], v[2:3], -2.0
+// GFX90A: v_mfma_f32_32x32x4bf16_1k a[0:31], a[0:1], v[2:3], -2.0 ; encoding: [0x00,0x80,0xe3,0xd3,0x00,0x05,0xd6,0x0b]
+
+v_mfma_f32_32x32x4bf16_1k v[0:31], a[0:1], v[2:3], -2.0
+// GFX90A: v_mfma_f32_32x32x4bf16_1k v[0:31], a[0:1], v[2:3], -2.0 ; encoding: [0x00,0x00,0xe3,0xd3,0x00,0x05,0xd6,0x0b]
+
+v_mfma_f32_32x32x4bf16_1k a[0:31], a[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_32x32x4bf16_1k a[0:31], a[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe3,0xd3,0x00,0x05,0xd6,0xeb]
+
+v_mfma_f32_32x32x4bf16_1k v[0:31], a[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_32x32x4bf16_1k v[0:31], a[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe3,0xd3,0x00,0x05,0xd6,0xeb]
+
+v_mfma_f32_32x32x4bf16_1k a[0:31], a[0:1], a[2:3], -2.0
+// GFX90A: v_mfma_f32_32x32x4bf16_1k a[0:31], a[0:1], a[2:3], -2.0 ; encoding: [0x00,0x80,0xe3,0xd3,0x00,0x05,0xd6,0x1b]
+
+v_mfma_f32_32x32x4bf16_1k v[0:31], a[0:1], a[2:3], -2.0
+// GFX90A: v_mfma_f32_32x32x4bf16_1k v[0:31], a[0:1], a[2:3], -2.0 ; encoding: [0x00,0x00,0xe3,0xd3,0x00,0x05,0xd6,0x1b]
+
+v_mfma_f32_32x32x4bf16_1k a[0:31], a[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_32x32x4bf16_1k a[0:31], a[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe3,0xd3,0x00,0x05,0xd6,0xfb]
+
+v_mfma_f32_32x32x4bf16_1k v[0:31], a[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_32x32x4bf16_1k v[0:31], a[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe3,0xd3,0x00,0x05,0xd6,0xfb]
+
+v_mfma_f32_16x16x4bf16_1k a[0:15], v[0:1], v[2:3], a[2:17]
+// GFX90A: v_mfma_f32_16x16x4bf16_1k a[0:15], v[0:1], v[2:3], a[2:17] ; encoding: [0x00,0x80,0xe4,0xd3,0x00,0x05,0x0a,0x04]
+
+v_mfma_f32_16x16x4bf16_1k a[0:15], v[0:1], v[2:3], a[2:17] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_16x16x4bf16_1k a[0:15], v[0:1], v[2:3], a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe4,0xd3,0x00,0x05,0x0a,0xe4]
+
+v_mfma_f32_16x16x4bf16_1k a[0:15], v[0:1], a[2:3], a[2:17]
+// GFX90A: v_mfma_f32_16x16x4bf16_1k a[0:15], v[0:1], a[2:3], a[2:17] ; encoding: [0x00,0x80,0xe4,0xd3,0x00,0x05,0x0a,0x14]
+
+v_mfma_f32_16x16x4bf16_1k a[0:15], v[0:1], a[2:3], a[2:17] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_16x16x4bf16_1k a[0:15], v[0:1], a[2:3], a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe4,0xd3,0x00,0x05,0x0a,0xf4]
+
+v_mfma_f32_16x16x4bf16_1k a[0:15], a[0:1], v[2:3], a[2:17]
+// GFX90A: v_mfma_f32_16x16x4bf16_1k a[0:15], a[0:1], v[2:3], a[2:17] ; encoding: [0x00,0x80,0xe4,0xd3,0x00,0x05,0x0a,0x0c]
+
+v_mfma_f32_16x16x4bf16_1k a[0:15], a[0:1], v[2:3], a[2:17] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_16x16x4bf16_1k a[0:15], a[0:1], v[2:3], a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe4,0xd3,0x00,0x05,0x0a,0xec]
+
+v_mfma_f32_16x16x4bf16_1k a[0:15], a[0:1], a[2:3], a[2:17]
+// GFX90A: v_mfma_f32_16x16x4bf16_1k a[0:15], a[0:1], a[2:3], a[2:17] ; encoding: [0x00,0x80,0xe4,0xd3,0x00,0x05,0x0a,0x1c]
+
+v_mfma_f32_16x16x4bf16_1k a[0:15], a[0:1], a[2:3], a[2:17] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_16x16x4bf16_1k a[0:15], a[0:1], a[2:3], a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe4,0xd3,0x00,0x05,0x0a,0xfc]
+
+v_mfma_f32_16x16x4bf16_1k v[0:15], v[0:1], v[2:3], v[2:17]
+// GFX90A: v_mfma_f32_16x16x4bf16_1k v[0:15], v[0:1], v[2:3], v[2:17] ; encoding: [0x00,0x00,0xe4,0xd3,0x00,0x05,0x0a,0x04]
+
+v_mfma_f32_16x16x4bf16_1k v[0:15], v[0:1], v[2:3], v[2:17] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_16x16x4bf16_1k v[0:15], v[0:1], v[2:3], v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe4,0xd3,0x00,0x05,0x0a,0xe4]
+
+v_mfma_f32_16x16x4bf16_1k v[0:15], v[0:1], a[2:3], v[2:17]
+// GFX90A: v_mfma_f32_16x16x4bf16_1k v[0:15], v[0:1], a[2:3], v[2:17] ; encoding: [0x00,0x00,0xe4,0xd3,0x00,0x05,0x0a,0x14]
+
+v_mfma_f32_16x16x4bf16_1k v[0:15], v[0:1], a[2:3], v[2:17] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_16x16x4bf16_1k v[0:15], v[0:1], a[2:3], v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe4,0xd3,0x00,0x05,0x0a,0xf4]
+
+v_mfma_f32_16x16x4bf16_1k v[0:15], a[0:1], v[2:3], v[2:17]
+// GFX90A: v_mfma_f32_16x16x4bf16_1k v[0:15], a[0:1], v[2:3], v[2:17] ; encoding: [0x00,0x00,0xe4,0xd3,0x00,0x05,0x0a,0x0c]
+
+v_mfma_f32_16x16x4bf16_1k v[0:15], a[0:1], v[2:3], v[2:17] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_16x16x4bf16_1k v[0:15], a[0:1], v[2:3], v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe4,0xd3,0x00,0x05,0x0a,0xec]
+
+v_mfma_f32_16x16x4bf16_1k v[0:15], a[0:1], a[2:3], v[2:17]
+// GFX90A: v_mfma_f32_16x16x4bf16_1k v[0:15], a[0:1], a[2:3], v[2:17] ; encoding: [0x00,0x00,0xe4,0xd3,0x00,0x05,0x0a,0x1c]
+
+v_mfma_f32_16x16x4bf16_1k v[0:15], a[0:1], a[2:3], v[2:17] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_16x16x4bf16_1k v[0:15], a[0:1], a[2:3], v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe4,0xd3,0x00,0x05,0x0a,0xfc]
+
+v_mfma_f32_16x16x4bf16_1k a[0:15], v[0:1], v[2:3], -2.0
+// GFX90A: v_mfma_f32_16x16x4bf16_1k a[0:15], v[0:1], v[2:3], -2.0 ; encoding: [0x00,0x80,0xe4,0xd3,0x00,0x05,0xd6,0x03]
+
+v_mfma_f32_16x16x4bf16_1k v[0:15], v[0:1], v[2:3], -2.0
+// GFX90A: v_mfma_f32_16x16x4bf16_1k v[0:15], v[0:1], v[2:3], -2.0 ; encoding: [0x00,0x00,0xe4,0xd3,0x00,0x05,0xd6,0x03]
+
+v_mfma_f32_16x16x4bf16_1k a[0:15], v[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_16x16x4bf16_1k a[0:15], v[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe4,0xd3,0x00,0x05,0xd6,0xe3]
+
+v_mfma_f32_16x16x4bf16_1k v[0:15], v[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_16x16x4bf16_1k v[0:15], v[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe4,0xd3,0x00,0x05,0xd6,0xe3]
+
+v_mfma_f32_16x16x4bf16_1k a[0:15], v[0:1], a[2:3], -2.0
+// GFX90A: v_mfma_f32_16x16x4bf16_1k a[0:15], v[0:1], a[2:3], -2.0 ; encoding: [0x00,0x80,0xe4,0xd3,0x00,0x05,0xd6,0x13]
+
+v_mfma_f32_16x16x4bf16_1k v[0:15], v[0:1], a[2:3], -2.0
+// GFX90A: v_mfma_f32_16x16x4bf16_1k v[0:15], v[0:1], a[2:3], -2.0 ; encoding: [0x00,0x00,0xe4,0xd3,0x00,0x05,0xd6,0x13]
+
+v_mfma_f32_16x16x4bf16_1k a[0:15], v[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_16x16x4bf16_1k a[0:15], v[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe4,0xd3,0x00,0x05,0xd6,0xf3]
+
+v_mfma_f32_16x16x4bf16_1k v[0:15], v[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_16x16x4bf16_1k v[0:15], v[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe4,0xd3,0x00,0x05,0xd6,0xf3]
+
+v_mfma_f32_16x16x4bf16_1k a[0:15], a[0:1], v[2:3], -2.0
+// GFX90A: v_mfma_f32_16x16x4bf16_1k a[0:15], a[0:1], v[2:3], -2.0 ; encoding: [0x00,0x80,0xe4,0xd3,0x00,0x05,0xd6,0x0b]
+
+v_mfma_f32_16x16x4bf16_1k v[0:15], a[0:1], v[2:3], -2.0
+// GFX90A: v_mfma_f32_16x16x4bf16_1k v[0:15], a[0:1], v[2:3], -2.0 ; encoding: [0x00,0x00,0xe4,0xd3,0x00,0x05,0xd6,0x0b]
+
+v_mfma_f32_16x16x4bf16_1k a[0:15], a[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_16x16x4bf16_1k a[0:15], a[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe4,0xd3,0x00,0x05,0xd6,0xeb]
+
+v_mfma_f32_16x16x4bf16_1k v[0:15], a[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_16x16x4bf16_1k v[0:15], a[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe4,0xd3,0x00,0x05,0xd6,0xeb]
+
+v_mfma_f32_16x16x4bf16_1k a[0:15], a[0:1], a[2:3], -2.0
+// GFX90A: v_mfma_f32_16x16x4bf16_1k a[0:15], a[0:1], a[2:3], -2.0 ; encoding: [0x00,0x80,0xe4,0xd3,0x00,0x05,0xd6,0x1b]
+
+v_mfma_f32_16x16x4bf16_1k v[0:15], a[0:1], a[2:3], -2.0
+// GFX90A: v_mfma_f32_16x16x4bf16_1k v[0:15], a[0:1], a[2:3], -2.0 ; encoding: [0x00,0x00,0xe4,0xd3,0x00,0x05,0xd6,0x1b]
+
+v_mfma_f32_16x16x4bf16_1k a[0:15], a[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_16x16x4bf16_1k a[0:15], a[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe4,0xd3,0x00,0x05,0xd6,0xfb]
+
+v_mfma_f32_16x16x4bf16_1k v[0:15], a[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_16x16x4bf16_1k v[0:15], a[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe4,0xd3,0x00,0x05,0xd6,0xfb]
+
+v_mfma_f32_4x4x4bf16_1k a[0:3], v[0:1], v[2:3], a[2:5]
+// GFX90A: v_mfma_f32_4x4x4bf16_1k a[0:3], v[0:1], v[2:3], a[2:5] ; encoding: [0x00,0x80,0xe5,0xd3,0x00,0x05,0x0a,0x04]
+
+v_mfma_f32_4x4x4bf16_1k a[0:3], v[0:1], v[2:3], a[2:5] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_4x4x4bf16_1k a[0:3], v[0:1], v[2:3], a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe5,0xd3,0x00,0x05,0x0a,0xe4]
+
+v_mfma_f32_4x4x4bf16_1k a[0:3], v[0:1], a[2:3], a[2:5]
+// GFX90A: v_mfma_f32_4x4x4bf16_1k a[0:3], v[0:1], a[2:3], a[2:5] ; encoding: [0x00,0x80,0xe5,0xd3,0x00,0x05,0x0a,0x14]
+
+v_mfma_f32_4x4x4bf16_1k a[0:3], v[0:1], a[2:3], a[2:5] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_4x4x4bf16_1k a[0:3], v[0:1], a[2:3], a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe5,0xd3,0x00,0x05,0x0a,0xf4]
+
+v_mfma_f32_4x4x4bf16_1k a[0:3], a[0:1], v[2:3], a[2:5]
+// GFX90A: v_mfma_f32_4x4x4bf16_1k a[0:3], a[0:1], v[2:3], a[2:5] ; encoding: [0x00,0x80,0xe5,0xd3,0x00,0x05,0x0a,0x0c]
+
+v_mfma_f32_4x4x4bf16_1k a[0:3], a[0:1], v[2:3], a[2:5] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_4x4x4bf16_1k a[0:3], a[0:1], v[2:3], a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe5,0xd3,0x00,0x05,0x0a,0xec]
+
+v_mfma_f32_4x4x4bf16_1k a[0:3], a[0:1], a[2:3], a[2:5]
+// GFX90A: v_mfma_f32_4x4x4bf16_1k a[0:3], a[0:1], a[2:3], a[2:5] ; encoding: [0x00,0x80,0xe5,0xd3,0x00,0x05,0x0a,0x1c]
+
+v_mfma_f32_4x4x4bf16_1k a[0:3], a[0:1], a[2:3], a[2:5] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_4x4x4bf16_1k a[0:3], a[0:1], a[2:3], a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe5,0xd3,0x00,0x05,0x0a,0xfc]
+
+v_mfma_f32_4x4x4bf16_1k v[0:3], v[0:1], v[2:3], v[2:5]
+// GFX90A: v_mfma_f32_4x4x4bf16_1k v[0:3], v[0:1], v[2:3], v[2:5] ; encoding: [0x00,0x00,0xe5,0xd3,0x00,0x05,0x0a,0x04]
+
+v_mfma_f32_4x4x4bf16_1k v[0:3], v[0:1], v[2:3], v[2:5] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_4x4x4bf16_1k v[0:3], v[0:1], v[2:3], v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe5,0xd3,0x00,0x05,0x0a,0xe4]
+
+v_mfma_f32_4x4x4bf16_1k v[0:3], v[0:1], a[2:3], v[2:5]
+// GFX90A: v_mfma_f32_4x4x4bf16_1k v[0:3], v[0:1], a[2:3], v[2:5] ; encoding: [0x00,0x00,0xe5,0xd3,0x00,0x05,0x0a,0x14]
+
+v_mfma_f32_4x4x4bf16_1k v[0:3], v[0:1], a[2:3], v[2:5] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_4x4x4bf16_1k v[0:3], v[0:1], a[2:3], v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe5,0xd3,0x00,0x05,0x0a,0xf4]
+
+v_mfma_f32_4x4x4bf16_1k v[0:3], a[0:1], v[2:3], v[2:5]
+// GFX90A: v_mfma_f32_4x4x4bf16_1k v[0:3], a[0:1], v[2:3], v[2:5] ; encoding: [0x00,0x00,0xe5,0xd3,0x00,0x05,0x0a,0x0c]
+
+v_mfma_f32_4x4x4bf16_1k v[0:3], a[0:1], v[2:3], v[2:5] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_4x4x4bf16_1k v[0:3], a[0:1], v[2:3], v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe5,0xd3,0x00,0x05,0x0a,0xec]
+
+v_mfma_f32_4x4x4bf16_1k v[0:3], a[0:1], a[2:3], v[2:5]
+// GFX90A: v_mfma_f32_4x4x4bf16_1k v[0:3], a[0:1], a[2:3], v[2:5] ; encoding: [0x00,0x00,0xe5,0xd3,0x00,0x05,0x0a,0x1c]
+
+v_mfma_f32_4x4x4bf16_1k v[0:3], a[0:1], a[2:3], v[2:5] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_4x4x4bf16_1k v[0:3], a[0:1], a[2:3], v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe5,0xd3,0x00,0x05,0x0a,0xfc]
+
+v_mfma_f32_4x4x4bf16_1k a[0:3], v[0:1], v[2:3], -2.0
+// GFX90A: v_mfma_f32_4x4x4bf16_1k a[0:3], v[0:1], v[2:3], -2.0 ; encoding: [0x00,0x80,0xe5,0xd3,0x00,0x05,0xd6,0x03]
+
+v_mfma_f32_4x4x4bf16_1k v[0:3], v[0:1], v[2:3], -2.0
+// GFX90A: v_mfma_f32_4x4x4bf16_1k v[0:3], v[0:1], v[2:3], -2.0 ; encoding: [0x00,0x00,0xe5,0xd3,0x00,0x05,0xd6,0x03]
+
+v_mfma_f32_4x4x4bf16_1k a[0:3], v[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_4x4x4bf16_1k a[0:3], v[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe5,0xd3,0x00,0x05,0xd6,0xe3]
+
+v_mfma_f32_4x4x4bf16_1k v[0:3], v[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_4x4x4bf16_1k v[0:3], v[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe5,0xd3,0x00,0x05,0xd6,0xe3]
+
+v_mfma_f32_4x4x4bf16_1k a[0:3], v[0:1], a[2:3], -2.0
+// GFX90A: v_mfma_f32_4x4x4bf16_1k a[0:3], v[0:1], a[2:3], -2.0 ; encoding: [0x00,0x80,0xe5,0xd3,0x00,0x05,0xd6,0x13]
+
+v_mfma_f32_4x4x4bf16_1k v[0:3], v[0:1], a[2:3], -2.0
+// GFX90A: v_mfma_f32_4x4x4bf16_1k v[0:3], v[0:1], a[2:3], -2.0 ; encoding: [0x00,0x00,0xe5,0xd3,0x00,0x05,0xd6,0x13]
+
+v_mfma_f32_4x4x4bf16_1k a[0:3], v[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_4x4x4bf16_1k a[0:3], v[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe5,0xd3,0x00,0x05,0xd6,0xf3]
+
+v_mfma_f32_4x4x4bf16_1k v[0:3], v[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_4x4x4bf16_1k v[0:3], v[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe5,0xd3,0x00,0x05,0xd6,0xf3]
+
+v_mfma_f32_4x4x4bf16_1k a[0:3], a[0:1], v[2:3], -2.0
+// GFX90A: v_mfma_f32_4x4x4bf16_1k a[0:3], a[0:1], v[2:3], -2.0 ; encoding: [0x00,0x80,0xe5,0xd3,0x00,0x05,0xd6,0x0b]
+
+v_mfma_f32_4x4x4bf16_1k v[0:3], a[0:1], v[2:3], -2.0
+// GFX90A: v_mfma_f32_4x4x4bf16_1k v[0:3], a[0:1], v[2:3], -2.0 ; encoding: [0x00,0x00,0xe5,0xd3,0x00,0x05,0xd6,0x0b]
+
+v_mfma_f32_4x4x4bf16_1k a[0:3], a[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_4x4x4bf16_1k a[0:3], a[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe5,0xd3,0x00,0x05,0xd6,0xeb]
+
+v_mfma_f32_4x4x4bf16_1k v[0:3], a[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_4x4x4bf16_1k v[0:3], a[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe5,0xd3,0x00,0x05,0xd6,0xeb]
+
+v_mfma_f32_4x4x4bf16_1k a[0:3], a[0:1], a[2:3], -2.0
+// GFX90A: v_mfma_f32_4x4x4bf16_1k a[0:3], a[0:1], a[2:3], -2.0 ; encoding: [0x00,0x80,0xe5,0xd3,0x00,0x05,0xd6,0x1b]
+
+v_mfma_f32_4x4x4bf16_1k v[0:3], a[0:1], a[2:3], -2.0
+// GFX90A: v_mfma_f32_4x4x4bf16_1k v[0:3], a[0:1], a[2:3], -2.0 ; encoding: [0x00,0x00,0xe5,0xd3,0x00,0x05,0xd6,0x1b]
+
+v_mfma_f32_4x4x4bf16_1k a[0:3], a[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_4x4x4bf16_1k a[0:3], a[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe5,0xd3,0x00,0x05,0xd6,0xfb]
+
+v_mfma_f32_4x4x4bf16_1k v[0:3], a[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_4x4x4bf16_1k v[0:3], a[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe5,0xd3,0x00,0x05,0xd6,0xfb]
+
+v_mfma_f32_32x32x8bf16_1k a[0:15], v[0:1], v[2:3], a[2:17]
+// GFX90A: v_mfma_f32_32x32x8bf16_1k a[0:15], v[0:1], v[2:3], a[2:17] ; encoding: [0x00,0x80,0xe6,0xd3,0x00,0x05,0x0a,0x04]
+
+v_mfma_f32_32x32x8bf16_1k a[0:15], v[0:1], v[2:3], a[2:17] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_32x32x8bf16_1k a[0:15], v[0:1], v[2:3], a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe6,0xd3,0x00,0x05,0x0a,0xe4]
+
+v_mfma_f32_32x32x8bf16_1k a[0:15], v[0:1], a[2:3], a[2:17]
+// GFX90A: v_mfma_f32_32x32x8bf16_1k a[0:15], v[0:1], a[2:3], a[2:17] ; encoding: [0x00,0x80,0xe6,0xd3,0x00,0x05,0x0a,0x14]
+
+v_mfma_f32_32x32x8bf16_1k a[0:15], v[0:1], a[2:3], a[2:17] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_32x32x8bf16_1k a[0:15], v[0:1], a[2:3], a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe6,0xd3,0x00,0x05,0x0a,0xf4]
+
+v_mfma_f32_32x32x8bf16_1k a[0:15], a[0:1], v[2:3], a[2:17]
+// GFX90A: v_mfma_f32_32x32x8bf16_1k a[0:15], a[0:1], v[2:3], a[2:17] ; encoding: [0x00,0x80,0xe6,0xd3,0x00,0x05,0x0a,0x0c]
+
+v_mfma_f32_32x32x8bf16_1k a[0:15], a[0:1], v[2:3], a[2:17] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_32x32x8bf16_1k a[0:15], a[0:1], v[2:3], a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe6,0xd3,0x00,0x05,0x0a,0xec]
+
+v_mfma_f32_32x32x8bf16_1k a[0:15], a[0:1], a[2:3], a[2:17]
+// GFX90A: v_mfma_f32_32x32x8bf16_1k a[0:15], a[0:1], a[2:3], a[2:17] ; encoding: [0x00,0x80,0xe6,0xd3,0x00,0x05,0x0a,0x1c]
+
+v_mfma_f32_32x32x8bf16_1k a[0:15], a[0:1], a[2:3], a[2:17] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_32x32x8bf16_1k a[0:15], a[0:1], a[2:3], a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe6,0xd3,0x00,0x05,0x0a,0xfc]
+
+v_mfma_f32_32x32x8bf16_1k v[0:15], v[0:1], v[2:3], v[2:17]
+// GFX90A: v_mfma_f32_32x32x8bf16_1k v[0:15], v[0:1], v[2:3], v[2:17] ; encoding: [0x00,0x00,0xe6,0xd3,0x00,0x05,0x0a,0x04]
+
+v_mfma_f32_32x32x8bf16_1k v[0:15], v[0:1], v[2:3], v[2:17] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_32x32x8bf16_1k v[0:15], v[0:1], v[2:3], v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe6,0xd3,0x00,0x05,0x0a,0xe4]
+
+v_mfma_f32_32x32x8bf16_1k v[0:15], v[0:1], a[2:3], v[2:17]
+// GFX90A: v_mfma_f32_32x32x8bf16_1k v[0:15], v[0:1], a[2:3], v[2:17] ; encoding: [0x00,0x00,0xe6,0xd3,0x00,0x05,0x0a,0x14]
+
+v_mfma_f32_32x32x8bf16_1k v[0:15], v[0:1], a[2:3], v[2:17] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_32x32x8bf16_1k v[0:15], v[0:1], a[2:3], v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe6,0xd3,0x00,0x05,0x0a,0xf4]
+
+v_mfma_f32_32x32x8bf16_1k v[0:15], a[0:1], v[2:3], v[2:17]
+// GFX90A: v_mfma_f32_32x32x8bf16_1k v[0:15], a[0:1], v[2:3], v[2:17] ; encoding: [0x00,0x00,0xe6,0xd3,0x00,0x05,0x0a,0x0c]
+
+v_mfma_f32_32x32x8bf16_1k v[0:15], a[0:1], v[2:3], v[2:17] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_32x32x8bf16_1k v[0:15], a[0:1], v[2:3], v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe6,0xd3,0x00,0x05,0x0a,0xec]
+
+v_mfma_f32_32x32x8bf16_1k v[0:15], a[0:1], a[2:3], v[2:17]
+// GFX90A: v_mfma_f32_32x32x8bf16_1k v[0:15], a[0:1], a[2:3], v[2:17] ; encoding: [0x00,0x00,0xe6,0xd3,0x00,0x05,0x0a,0x1c]
+
+v_mfma_f32_32x32x8bf16_1k v[0:15], a[0:1], a[2:3], v[2:17] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_32x32x8bf16_1k v[0:15], a[0:1], a[2:3], v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe6,0xd3,0x00,0x05,0x0a,0xfc]
+
+v_mfma_f32_32x32x8bf16_1k a[0:15], v[0:1], v[2:3], -2.0
+// GFX90A: v_mfma_f32_32x32x8bf16_1k a[0:15], v[0:1], v[2:3], -2.0 ; encoding: [0x00,0x80,0xe6,0xd3,0x00,0x05,0xd6,0x03]
+
+v_mfma_f32_32x32x8bf16_1k v[0:15], v[0:1], v[2:3], -2.0
+// GFX90A: v_mfma_f32_32x32x8bf16_1k v[0:15], v[0:1], v[2:3], -2.0 ; encoding: [0x00,0x00,0xe6,0xd3,0x00,0x05,0xd6,0x03]
+
+v_mfma_f32_32x32x8bf16_1k a[0:15], v[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_32x32x8bf16_1k a[0:15], v[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe6,0xd3,0x00,0x05,0xd6,0xe3]
+
+v_mfma_f32_32x32x8bf16_1k v[0:15], v[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_32x32x8bf16_1k v[0:15], v[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe6,0xd3,0x00,0x05,0xd6,0xe3]
+
+v_mfma_f32_32x32x8bf16_1k a[0:15], v[0:1], a[2:3], -2.0
+// GFX90A: v_mfma_f32_32x32x8bf16_1k a[0:15], v[0:1], a[2:3], -2.0 ; encoding: [0x00,0x80,0xe6,0xd3,0x00,0x05,0xd6,0x13]
+
+v_mfma_f32_32x32x8bf16_1k v[0:15], v[0:1], a[2:3], -2.0
+// GFX90A: v_mfma_f32_32x32x8bf16_1k v[0:15], v[0:1], a[2:3], -2.0 ; encoding: [0x00,0x00,0xe6,0xd3,0x00,0x05,0xd6,0x13]
+
+v_mfma_f32_32x32x8bf16_1k a[0:15], v[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_32x32x8bf16_1k a[0:15], v[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe6,0xd3,0x00,0x05,0xd6,0xf3]
+
+v_mfma_f32_32x32x8bf16_1k v[0:15], v[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_32x32x8bf16_1k v[0:15], v[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe6,0xd3,0x00,0x05,0xd6,0xf3]
+
+v_mfma_f32_32x32x8bf16_1k a[0:15], a[0:1], v[2:3], -2.0
+// GFX90A: v_mfma_f32_32x32x8bf16_1k a[0:15], a[0:1], v[2:3], -2.0 ; encoding: [0x00,0x80,0xe6,0xd3,0x00,0x05,0xd6,0x0b]
+
+v_mfma_f32_32x32x8bf16_1k v[0:15], a[0:1], v[2:3], -2.0
+// GFX90A: v_mfma_f32_32x32x8bf16_1k v[0:15], a[0:1], v[2:3], -2.0 ; encoding: [0x00,0x00,0xe6,0xd3,0x00,0x05,0xd6,0x0b]
+
+v_mfma_f32_32x32x8bf16_1k a[0:15], a[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_32x32x8bf16_1k a[0:15], a[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe6,0xd3,0x00,0x05,0xd6,0xeb]
+
+v_mfma_f32_32x32x8bf16_1k v[0:15], a[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_32x32x8bf16_1k v[0:15], a[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe6,0xd3,0x00,0x05,0xd6,0xeb]
+
+v_mfma_f32_32x32x8bf16_1k a[0:15], a[0:1], a[2:3], -2.0
+// GFX90A: v_mfma_f32_32x32x8bf16_1k a[0:15], a[0:1], a[2:3], -2.0 ; encoding: [0x00,0x80,0xe6,0xd3,0x00,0x05,0xd6,0x1b]
+
+v_mfma_f32_32x32x8bf16_1k v[0:15], a[0:1], a[2:3], -2.0
+// GFX90A: v_mfma_f32_32x32x8bf16_1k v[0:15], a[0:1], a[2:3], -2.0 ; encoding: [0x00,0x00,0xe6,0xd3,0x00,0x05,0xd6,0x1b]
+
+v_mfma_f32_32x32x8bf16_1k a[0:15], a[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_32x32x8bf16_1k a[0:15], a[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe6,0xd3,0x00,0x05,0xd6,0xfb]
+
+v_mfma_f32_32x32x8bf16_1k v[0:15], a[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_32x32x8bf16_1k v[0:15], a[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe6,0xd3,0x00,0x05,0xd6,0xfb]
+
+v_mfma_f32_16x16x16bf16_1k a[0:3], v[0:1], v[2:3], a[2:5]
+// GFX90A: v_mfma_f32_16x16x16bf16_1k a[0:3], v[0:1], v[2:3], a[2:5] ; encoding: [0x00,0x80,0xe7,0xd3,0x00,0x05,0x0a,0x04]
+
+v_mfma_f32_16x16x16bf16_1k a[0:3], v[0:1], v[2:3], a[2:5] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_16x16x16bf16_1k a[0:3], v[0:1], v[2:3], a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe7,0xd3,0x00,0x05,0x0a,0xe4]
+
+v_mfma_f32_16x16x16bf16_1k a[0:3], v[0:1], a[2:3], a[2:5]
+// GFX90A: v_mfma_f32_16x16x16bf16_1k a[0:3], v[0:1], a[2:3], a[2:5] ; encoding: [0x00,0x80,0xe7,0xd3,0x00,0x05,0x0a,0x14]
+
+v_mfma_f32_16x16x16bf16_1k a[0:3], v[0:1], a[2:3], a[2:5] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_16x16x16bf16_1k a[0:3], v[0:1], a[2:3], a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe7,0xd3,0x00,0x05,0x0a,0xf4]
+
+v_mfma_f32_16x16x16bf16_1k a[0:3], a[0:1], v[2:3], a[2:5]
+// GFX90A: v_mfma_f32_16x16x16bf16_1k a[0:3], a[0:1], v[2:3], a[2:5] ; encoding: [0x00,0x80,0xe7,0xd3,0x00,0x05,0x0a,0x0c]
+
+v_mfma_f32_16x16x16bf16_1k a[0:3], a[0:1], v[2:3], a[2:5] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_16x16x16bf16_1k a[0:3], a[0:1], v[2:3], a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe7,0xd3,0x00,0x05,0x0a,0xec]
+
+v_mfma_f32_16x16x16bf16_1k a[0:3], a[0:1], a[2:3], a[2:5]
+// GFX90A: v_mfma_f32_16x16x16bf16_1k a[0:3], a[0:1], a[2:3], a[2:5] ; encoding: [0x00,0x80,0xe7,0xd3,0x00,0x05,0x0a,0x1c]
+
+v_mfma_f32_16x16x16bf16_1k a[0:3], a[0:1], a[2:3], a[2:5] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_16x16x16bf16_1k a[0:3], a[0:1], a[2:3], a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe7,0xd3,0x00,0x05,0x0a,0xfc]
+
+v_mfma_f32_16x16x16bf16_1k v[0:3], v[0:1], v[2:3], v[2:5]
+// GFX90A: v_mfma_f32_16x16x16bf16_1k v[0:3], v[0:1], v[2:3], v[2:5] ; encoding: [0x00,0x00,0xe7,0xd3,0x00,0x05,0x0a,0x04]
+
+v_mfma_f32_16x16x16bf16_1k v[0:3], v[0:1], v[2:3], v[2:5] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_16x16x16bf16_1k v[0:3], v[0:1], v[2:3], v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe7,0xd3,0x00,0x05,0x0a,0xe4]
+
+v_mfma_f32_16x16x16bf16_1k v[0:3], v[0:1], a[2:3], v[2:5]
+// GFX90A: v_mfma_f32_16x16x16bf16_1k v[0:3], v[0:1], a[2:3], v[2:5] ; encoding: [0x00,0x00,0xe7,0xd3,0x00,0x05,0x0a,0x14]
+
+v_mfma_f32_16x16x16bf16_1k v[0:3], v[0:1], a[2:3], v[2:5] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_16x16x16bf16_1k v[0:3], v[0:1], a[2:3], v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe7,0xd3,0x00,0x05,0x0a,0xf4]
+
+v_mfma_f32_16x16x16bf16_1k v[0:3], a[0:1], v[2:3], v[2:5]
+// GFX90A: v_mfma_f32_16x16x16bf16_1k v[0:3], a[0:1], v[2:3], v[2:5] ; encoding: [0x00,0x00,0xe7,0xd3,0x00,0x05,0x0a,0x0c]
+
+v_mfma_f32_16x16x16bf16_1k v[0:3], a[0:1], v[2:3], v[2:5] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_16x16x16bf16_1k v[0:3], a[0:1], v[2:3], v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe7,0xd3,0x00,0x05,0x0a,0xec]
+
+v_mfma_f32_16x16x16bf16_1k v[0:3], a[0:1], a[2:3], v[2:5]
+// GFX90A: v_mfma_f32_16x16x16bf16_1k v[0:3], a[0:1], a[2:3], v[2:5] ; encoding: [0x00,0x00,0xe7,0xd3,0x00,0x05,0x0a,0x1c]
+
+v_mfma_f32_16x16x16bf16_1k v[0:3], a[0:1], a[2:3], v[2:5] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_16x16x16bf16_1k v[0:3], a[0:1], a[2:3], v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe7,0xd3,0x00,0x05,0x0a,0xfc]
+
+v_mfma_f32_16x16x16bf16_1k a[0:3], v[0:1], v[2:3], -2.0
+// GFX90A: v_mfma_f32_16x16x16bf16_1k a[0:3], v[0:1], v[2:3], -2.0 ; encoding: [0x00,0x80,0xe7,0xd3,0x00,0x05,0xd6,0x03]
+
+v_mfma_f32_16x16x16bf16_1k v[0:3], v[0:1], v[2:3], -2.0
+// GFX90A: v_mfma_f32_16x16x16bf16_1k v[0:3], v[0:1], v[2:3], -2.0 ; encoding: [0x00,0x00,0xe7,0xd3,0x00,0x05,0xd6,0x03]
+
+v_mfma_f32_16x16x16bf16_1k a[0:3], v[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_16x16x16bf16_1k a[0:3], v[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe7,0xd3,0x00,0x05,0xd6,0xe3]
+
+v_mfma_f32_16x16x16bf16_1k v[0:3], v[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_16x16x16bf16_1k v[0:3], v[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe7,0xd3,0x00,0x05,0xd6,0xe3]
+
+v_mfma_f32_16x16x16bf16_1k a[0:3], v[0:1], a[2:3], -2.0
+// GFX90A: v_mfma_f32_16x16x16bf16_1k a[0:3], v[0:1], a[2:3], -2.0 ; encoding: [0x00,0x80,0xe7,0xd3,0x00,0x05,0xd6,0x13]
+
+v_mfma_f32_16x16x16bf16_1k v[0:3], v[0:1], a[2:3], -2.0
+// GFX90A: v_mfma_f32_16x16x16bf16_1k v[0:3], v[0:1], a[2:3], -2.0 ; encoding: [0x00,0x00,0xe7,0xd3,0x00,0x05,0xd6,0x13]
+
+v_mfma_f32_16x16x16bf16_1k a[0:3], v[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_16x16x16bf16_1k a[0:3], v[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe7,0xd3,0x00,0x05,0xd6,0xf3]
+
+v_mfma_f32_16x16x16bf16_1k v[0:3], v[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_16x16x16bf16_1k v[0:3], v[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe7,0xd3,0x00,0x05,0xd6,0xf3]
+
+v_mfma_f32_16x16x16bf16_1k a[0:3], a[0:1], v[2:3], -2.0
+// GFX90A: v_mfma_f32_16x16x16bf16_1k a[0:3], a[0:1], v[2:3], -2.0 ; encoding: [0x00,0x80,0xe7,0xd3,0x00,0x05,0xd6,0x0b]
+
+v_mfma_f32_16x16x16bf16_1k v[0:3], a[0:1], v[2:3], -2.0
+// GFX90A: v_mfma_f32_16x16x16bf16_1k v[0:3], a[0:1], v[2:3], -2.0 ; encoding: [0x00,0x00,0xe7,0xd3,0x00,0x05,0xd6,0x0b]
+
+v_mfma_f32_16x16x16bf16_1k a[0:3], a[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_16x16x16bf16_1k a[0:3], a[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe7,0xd3,0x00,0x05,0xd6,0xeb]
+
+v_mfma_f32_16x16x16bf16_1k v[0:3], a[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_16x16x16bf16_1k v[0:3], a[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe7,0xd3,0x00,0x05,0xd6,0xeb]
+
+v_mfma_f32_16x16x16bf16_1k a[0:3], a[0:1], a[2:3], -2.0
+// GFX90A: v_mfma_f32_16x16x16bf16_1k a[0:3], a[0:1], a[2:3], -2.0 ; encoding: [0x00,0x80,0xe7,0xd3,0x00,0x05,0xd6,0x1b]
+
+v_mfma_f32_16x16x16bf16_1k v[0:3], a[0:1], a[2:3], -2.0
+// GFX90A: v_mfma_f32_16x16x16bf16_1k v[0:3], a[0:1], a[2:3], -2.0 ; encoding: [0x00,0x00,0xe7,0xd3,0x00,0x05,0xd6,0x1b]
+
+v_mfma_f32_16x16x16bf16_1k a[0:3], a[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_16x16x16bf16_1k a[0:3], a[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe7,0xd3,0x00,0x05,0xd6,0xfb]
+
+v_mfma_f32_16x16x16bf16_1k v[0:3], a[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f32_16x16x16bf16_1k v[0:3], a[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe7,0xd3,0x00,0x05,0xd6,0xfb]
+
+v_mfma_f64_16x16x4f64 v[0:7], v[0:1], v[2:3], v[2:9]
+// GFX90A: v_mfma_f64_16x16x4f64 v[0:7], v[0:1], v[2:3], v[2:9] ; encoding: [0x00,0x00,0xee,0xd3,0x00,0x05,0x0a,0x04]
+
+v_mfma_f64_16x16x4f64 v[0:7], v[0:1], v[2:3], v[2:9] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f64_16x16x4f64 v[0:7], v[0:1], v[2:3], v[2:9] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xee,0xd3,0x00,0x05,0x0a,0xe4]
+
+v_mfma_f64_16x16x4f64 v[0:7], v[0:1], v[2:3], -2.0
+// GFX90A: v_mfma_f64_16x16x4f64 v[0:7], v[0:1], v[2:3], -2.0 ; encoding: [0x00,0x00,0xee,0xd3,0x00,0x05,0xd6,0x03]
+
+v_mfma_f64_16x16x4f64 v[0:7], v[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f64_16x16x4f64 v[0:7], v[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xee,0xd3,0x00,0x05,0xd6,0xe3]
+
+v_mfma_f64_4x4x4f64 v[0:1], v[0:1], v[2:3], v[2:3]
+// GFX90A: v_mfma_f64_4x4x4f64 v[0:1], v[0:1], v[2:3], v[2:3] ; encoding: [0x00,0x00,0xef,0xd3,0x00,0x05,0x0a,0x04]
+
+v_mfma_f64_4x4x4f64 v[0:1], v[0:1], v[2:3], v[2:3] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f64_4x4x4f64 v[0:1], v[0:1], v[2:3], v[2:3] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xef,0xd3,0x00,0x05,0x0a,0xe4]
+
+v_mfma_f64_4x4x4f64 v[0:1], v[0:1], v[2:3], -2.0
+// GFX90A: v_mfma_f64_4x4x4f64 v[0:1], v[0:1], v[2:3], -2.0 ; encoding: [0x00,0x00,0xef,0xd3,0x00,0x05,0xd6,0x03]
+
+v_mfma_f64_4x4x4f64 v[0:1], v[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f64_4x4x4f64 v[0:1], v[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xef,0xd3,0x00,0x05,0xd6,0xe3]
+
+v_mfma_f64_16x16x4f64 v[0:7], a[0:1], v[2:3], v[2:9]
+// GFX90A: v_mfma_f64_16x16x4f64 v[0:7], a[0:1], v[2:3], v[2:9] ; encoding: [0x00,0x00,0xee,0xd3,0x00,0x05,0x0a,0x0c]
+
+v_mfma_f64_16x16x4f64 v[0:7], v[0:1], a[2:3], v[2:9] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f64_16x16x4f64 v[0:7], v[0:1], a[2:3], v[2:9] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xee,0xd3,0x00,0x05,0x0a,0xf4]
+
+v_mfma_f64_16x16x4f64 v[0:7], a[0:1], a[2:3], -2.0
+// GFX90A: v_mfma_f64_16x16x4f64 v[0:7], a[0:1], a[2:3], -2.0 ; encoding: [0x00,0x00,0xee,0xd3,0x00,0x05,0xd6,0x1b]
+
+v_mfma_f64_4x4x4f64 v[0:1], a[0:1], v[2:3], v[2:3]
+// GFX90A: v_mfma_f64_4x4x4f64 v[0:1], a[0:1], v[2:3], v[2:3] ; encoding: [0x00,0x00,0xef,0xd3,0x00,0x05,0x0a,0x0c]
+
+v_mfma_f64_4x4x4f64 v[0:1], v[0:1], a[2:3], v[2:3] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f64_4x4x4f64 v[0:1], v[0:1], a[2:3], v[2:3] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xef,0xd3,0x00,0x05,0x0a,0xf4]
+
+v_mfma_f64_4x4x4f64 v[0:1], a[0:1], a[2:3], -2.0
+// GFX90A: v_mfma_f64_4x4x4f64 v[0:1], a[0:1], a[2:3], -2.0 ; encoding: [0x00,0x00,0xef,0xd3,0x00,0x05,0xd6,0x1b]
+
+v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[2:9]
+// GFX90A: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[2:9] ; encoding: [0x00,0x80,0xee,0xd3,0x00,0x05,0x0a,0x04]
+
+v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[2:9] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[2:9] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xee,0xd3,0x00,0x05,0x0a,0xe4]
+
+v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], -2.0
+// GFX90A: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], -2.0 ; encoding: [0x00,0x80,0xee,0xd3,0x00,0x05,0xd6,0x03]
+
+v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xee,0xd3,0x00,0x05,0xd6,0xe3]
+
+v_mfma_f64_4x4x4f64 a[0:1], v[0:1], v[2:3], a[2:3]
+// GFX90A: v_mfma_f64_4x4x4f64 a[0:1], v[0:1], v[2:3], a[2:3] ; encoding: [0x00,0x80,0xef,0xd3,0x00,0x05,0x0a,0x04]
+
+v_mfma_f64_4x4x4f64 a[0:1], v[0:1], v[2:3], a[2:3] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f64_4x4x4f64 a[0:1], v[0:1], v[2:3], a[2:3] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xef,0xd3,0x00,0x05,0x0a,0xe4]
+
+v_mfma_f64_4x4x4f64 a[0:1], v[0:1], v[2:3], -2.0
+// GFX90A: v_mfma_f64_4x4x4f64 a[0:1], v[0:1], v[2:3], -2.0 ; encoding: [0x00,0x80,0xef,0xd3,0x00,0x05,0xd6,0x03]
+
+v_mfma_f64_4x4x4f64 a[0:1], v[0:1], v[2:3], 0
+// GFX90A: v_mfma_f64_4x4x4f64 a[0:1], v[0:1], v[2:3], 0 ; encoding: [0x00,0x80,0xef,0xd3,0x00,0x05,0x02,0x02]
+
+v_mfma_f64_4x4x4f64 a[0:1], v[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f64_4x4x4f64 a[0:1], v[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xef,0xd3,0x00,0x05,0xd6,0xe3]
+
+v_mfma_f64_16x16x4f64 a[0:7], a[0:1], v[2:3], a[2:9]
+// GFX90A: v_mfma_f64_16x16x4f64 a[0:7], a[0:1], v[2:3], a[2:9] ; encoding: [0x00,0x80,0xee,0xd3,0x00,0x05,0x0a,0x0c]
+
+v_mfma_f64_16x16x4f64 a[0:7], v[0:1], a[2:3], a[2:9] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], a[2:3], a[2:9] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xee,0xd3,0x00,0x05,0x0a,0xf4]
+
+v_mfma_f64_16x16x4f64 a[0:7], a[0:1], a[2:3], -2.0
+// GFX90A: v_mfma_f64_16x16x4f64 a[0:7], a[0:1], a[2:3], -2.0 ; encoding: [0x00,0x80,0xee,0xd3,0x00,0x05,0xd6,0x1b]
+
+v_mfma_f64_4x4x4f64 a[0:1], a[0:1], v[2:3], a[2:3]
+// GFX90A: v_mfma_f64_4x4x4f64 a[0:1], a[0:1], v[2:3], a[2:3] ; encoding: [0x00,0x80,0xef,0xd3,0x00,0x05,0x0a,0x0c]
+
+v_mfma_f64_4x4x4f64 a[0:1], v[0:1], a[2:3], a[2:3] cbsz:3 abid:2 blgp:7
+// GFX90A: v_mfma_f64_4x4x4f64 a[0:1], v[0:1], a[2:3], a[2:3] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xef,0xd3,0x00,0x05,0x0a,0xf4]
+
+v_mfma_f64_4x4x4f64 a[0:1], a[0:1], a[2:3], -2.0
+// GFX90A: v_mfma_f64_4x4x4f64 a[0:1], a[0:1], a[2:3], -2.0 ; encoding: [0x00,0x80,0xef,0xd3,0x00,0x05,0xd6,0x1b]
+
+v_mfma_f64_16x16x4f64 a[0:7], a[0:1], a[2:3], 0
+// GFX90A: v_mfma_f64_16x16x4f64 a[0:7], a[0:1], a[2:3], 0 ; encoding: [0x00,0x80,0xee,0xd3,0x00,0x05,0x02,0x1a]

diff  --git a/llvm/test/MC/AMDGPU/mimg-gfx90a.s b/llvm/test/MC/AMDGPU/mimg-gfx90a.s
new file mode 100644
index 000000000000..92ddb2ed9d2c
--- /dev/null
+++ b/llvm/test/MC/AMDGPU/mimg-gfx90a.s
@@ -0,0 +1,76 @@
+// RUN: llvm-mc -arch=amdgcn -mcpu=gfx90a -show-encoding %s | FileCheck --check-prefix=GFX90A %s
+
+image_load v[4:6], v[238:241], s[28:35] dmask:0x7 unorm
+// GFX90A: image_load v[4:6], v[238:241], s[28:35] dmask:0x7 unorm ; encoding: [0x00,0x17,0x00,0xf0,0xee,0x04,0x07,0x00]
+
+image_load_pck v5, v[0:3], s[8:15] dmask:0x1 glc
+// GFX90A: image_load_pck v5, v[0:3], s[8:15] dmask:0x1 glc ; encoding: [0x00,0x21,0x08,0xf0,0x00,0x05,0x02,0x00]
+
+image_load_pck_sgn v5, v[0:3], s[8:15] dmask:0x1 lwe
+// GFX90A: image_load_pck_sgn v5, v[0:3], s[8:15] dmask:0x1 lwe ; encoding: [0x00,0x01,0x0e,0xf0,0x00,0x05,0x02,0x00]
+
+image_load_mip v5, v[0:3], s[8:15]
+// GFX90A: image_load_mip v5, v[0:3], s[8:15] ; encoding: [0x00,0x00,0x04,0xf0,0x00,0x05,0x02,0x00]
+
+image_load_mip_pck v5, v1, s[8:15] dmask:0x1
+// GFX90A: image_load_mip_pck v5, v1, s[8:15] dmask:0x1 ; encoding: [0x00,0x01,0x10,0xf0,0x01,0x05,0x02,0x00]
+
+image_load_mip_pck_sgn v[4:5], v[0:3], s[8:15] dmask:0x5
+// GFX90A: image_load_mip_pck_sgn v[4:5], v[0:3], s[8:15] dmask:0x5 ; encoding: [0x00,0x05,0x14,0xf0,0x00,0x04,0x02,0x00]
+
+image_store v[192:194], v[238:241], s[28:35] dmask:0x7 unorm
+// GFX90A: image_store v[192:194], v[238:241], s[28:35] dmask:0x7 unorm ; encoding: [0x00,0x17,0x20,0xf0,0xee,0xc0,0x07,0x00]
+
+image_store_pck v1, v[2:5], s[12:19] dmask:0x1 unorm da
+// GFX90A: image_store_pck v1, v[2:5], s[12:19] dmask:0x1 unorm da ; encoding: [0x00,0x51,0x28,0xf0,0x02,0x01,0x03,0x00]
+
+image_store_mip v1, v[2:5], s[12:19]
+// GFX90A: image_store_mip v1, v[2:5], s[12:19] ; encoding: [0x00,0x00,0x24,0xf0,0x02,0x01,0x03,0x00]
+
+image_store_mip_pck v252, v[2:3], s[12:19] dmask:0x1 a16
+// GFX90A: image_store_mip_pck v252, v[2:3], s[12:19] dmask:0x1 a16 ; encoding: [0x00,0x81,0x2c,0xf0,0x02,0xfc,0x03,0x00]
+
+image_atomic_add v4, v192, s[28:35] dmask:0x1 unorm glc
+// GFX90A: image_atomic_add v4, v192, s[28:35] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x48,0xf0,0xc0,0x04,0x07,0x00]
+
+image_atomic_and v4, v192, s[28:35] dmask:0x1 unorm
+// GFX90A: image_atomic_and v4, v192, s[28:35] dmask:0x1 unorm ; encoding: [0x00,0x11,0x60,0xf0,0xc0,0x04,0x07,0x00]
+
+image_atomic_swap v4, v[192:195], s[28:35] dmask:0x1 unorm glc
+// GFX90A: image_atomic_swap v4, v[192:195], s[28:35] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x40,0xf0,0xc0,0x04,0x07,0x00]
+
+image_atomic_cmpswap v[4:5], v[192:195], s[28:35] dmask:0x3 unorm glc
+// GFX90A: image_atomic_cmpswap v[4:5], v[192:195], s[28:35] dmask:0x3 unorm glc ; encoding: [0x00,0x33,0x44,0xf0,0xc0,0x04,0x07,0x00]
+
+image_atomic_or v4, v192, s[28:35] dmask:0x1 unorm
+// GFX90A: image_atomic_or v4, v192, s[28:35] dmask:0x1 unorm ; encoding: [0x00,0x11,0x64,0xf0,0xc0,0x04,0x07,0x00]
+
+image_atomic_xor v4, v192, s[28:35] dmask:0x1 unorm
+// GFX90A: image_atomic_xor v4, v192, s[28:35] dmask:0x1 unorm ; encoding: [0x00,0x11,0x68,0xf0,0xc0,0x04,0x07,0x00]
+
+image_atomic_sub v4, v192, s[28:35] dmask:0x1 unorm
+// GFX90A: image_atomic_sub v4, v192, s[28:35] dmask:0x1 unorm ; encoding: [0x00,0x11,0x4c,0xf0,0xc0,0x04,0x07,0x00]
+
+image_atomic_smin v4, v192, s[28:35] dmask:0x1 unorm
+// GFX90A: image_atomic_smin v4, v192, s[28:35] dmask:0x1 unorm ; encoding: [0x00,0x11,0x50,0xf0,0xc0,0x04,0x07,0x00]
+
+image_atomic_smax v4, v192, s[28:35] dmask:0x1 unorm
+// GFX90A: image_atomic_smax v4, v192, s[28:35] dmask:0x1 unorm ; encoding: [0x00,0x11,0x58,0xf0,0xc0,0x04,0x07,0x00]
+
+image_atomic_umin v4, v192, s[28:35] dmask:0x1 unorm
+// GFX90A: image_atomic_umin v4, v192, s[28:35] dmask:0x1 unorm ; encoding: [0x00,0x11,0x54,0xf0,0xc0,0x04,0x07,0x00]
+
+image_atomic_umax v4, v192, s[28:35] dmask:0x1 unorm
+// GFX90A: image_atomic_umax v4, v192, s[28:35] dmask:0x1 unorm ; encoding: [0x00,0x11,0x5c,0xf0,0xc0,0x04,0x07,0x00]
+
+image_atomic_inc v4, v192, s[28:35] dmask:0x1 unorm
+// GFX90A: image_atomic_inc v4, v192, s[28:35] dmask:0x1 unorm ; encoding: [0x00,0x11,0x6c,0xf0,0xc0,0x04,0x07,0x00]
+
+image_atomic_dec v4, v192, s[28:35] dmask:0x1 unorm
+// GFX90A: image_atomic_dec v4, v192, s[28:35] dmask:0x1 unorm ; encoding: [0x00,0x11,0x70,0xf0,0xc0,0x04,0x07,0x00]
+
+image_get_resinfo v5, v1, s[8:15] dmask:0x1
+// GFX90A: image_get_resinfo v5, v1, s[8:15] dmask:0x1 ; encoding: [0x00,0x01,0x38,0xf0,0x01,0x05,0x02,0x00]
+
+image_sample v5, v[0:3], s[8:15], s[12:15] dmask:0x1
+// GFX90A: image_sample v5, v[0:3], s[8:15], s[12:15] dmask:0x1 ; encoding: [0x00,0x01,0x80,0xf0,0x00,0x05,0x62,0x00]

diff  --git a/llvm/test/MC/AMDGPU/misaligned-vgpr-tuples-err.s b/llvm/test/MC/AMDGPU/misaligned-vgpr-tuples-err.s
new file mode 100644
index 000000000000..0370e30a8c08
--- /dev/null
+++ b/llvm/test/MC/AMDGPU/misaligned-vgpr-tuples-err.s
@@ -0,0 +1,25 @@
+// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx90a %s 2>&1 | FileCheck --check-prefixes=GFX90A --implicit-check-not=error: %s
+
+v_add_f64 v[1:2], v[1:2], v[1:2]
+// GFX90A: error: invalid register class: vgpr tuples must be 64 bit aligned
+
+global_load_dwordx3 v[1:3], v[0:1], off
+// GFX90A: error: invalid register class: vgpr tuples must be 64 bit aligned
+
+global_load_dwordx4 v[1:4], v[0:1], off
+// GFX90A: error: invalid register class: vgpr tuples must be 64 bit aligned
+
+image_load v[1:5], v2, s[0:7] dmask:0xf unorm scc
+// GFX90A: error: invalid register class: vgpr tuples must be 64 bit aligned
+
+v_mfma_f32_32x32x8f16 a[0:15], a[1:2], v[0:1], a[0:15]
+// GFX90A: error: invalid register class: vgpr tuples must be 64 bit aligned
+
+v_mfma_i32_4x4x4i8 a[1:4], a0, v1, 2
+// GFX90A: error: invalid register class: vgpr tuples must be 64 bit aligned
+
+v_mfma_f32_16x16x1f32 a[0:15], a0, v1, a[1:16]
+// GFX90A: error: invalid register class: vgpr tuples must be 64 bit aligned
+
+v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[1:32]
+// GFX90A: error: invalid register class: vgpr tuples must be 64 bit aligned

diff  --git a/llvm/test/MC/AMDGPU/vop_dpp.s b/llvm/test/MC/AMDGPU/vop_dpp.s
index 74a31c6f9719..55bf42790041 100644
--- a/llvm/test/MC/AMDGPU/vop_dpp.s
+++ b/llvm/test/MC/AMDGPU/vop_dpp.s
@@ -116,8 +116,7 @@ v_add_f32 v0, |v0|, -v0 row_shl:1 row_mask:0xa bank_mask:0x1 bound_ctrl:0
 //===----------------------------------------------------------------------===//
 
 // NOSICI: error: not a valid operand.
-// NOGFX9: error: not a valid operand.
-// NOVI: error: not a valid operand.
+// GCN: v_nop row_shl:1 row_mask:0xa bank_mask:0x1 bound_ctrl:0 ; encoding: [0xfa,0x00,0x00,0x7e,0x00,0x01,0x09,0xa1]
 v_nop row_shl:1 row_mask:0xa bank_mask:0x1 bound_ctrl:0
 
 // NOSICI: error: not a valid operand.

diff  --git a/llvm/test/MC/Disassembler/AMDGPU/dpp64.txt b/llvm/test/MC/Disassembler/AMDGPU/dpp64.txt
new file mode 100644
index 000000000000..b11d2b422134
--- /dev/null
+++ b/llvm/test/MC/Disassembler/AMDGPU/dpp64.txt
@@ -0,0 +1,43 @@
+# RUN: llvm-mc -arch=amdgcn -mcpu=gfx90a -disassemble -show-encoding < %s | FileCheck %s -check-prefix=GFX90A
+
+# GFX90A: v_ceil_f64_dpp v[0:1], v[2:3]  row_newbcast:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x30,0x00,0x7e,0x02,0x51,0x01,0xff]
+0xfa,0x30,0x00,0x7e,0x02,0x51,0x01,0xff
+
+# GFX90A: v_fmac_f64_dpp v[0:1], v[2:3], v[4:5]  row_newbcast:2 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x08,0x00,0x08,0x02,0x52,0x01,0xff]
+0xfa,0x08,0x00,0x08,0x02,0x52,0x01,0xff
+
+# GFX90A: v_cvt_f32_f64_dpp v5, v[0:1]  row_newbcast:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x1e,0x0a,0x7e,0x00,0x5f,0x01,0xff]
+0xfa,0x1e,0x0a,0x7e,0x00,0x5f,0x01,0xff
+
+# GFX90A: v_cvt_i32_f64_dpp v5, v[0:1]  row_newbcast:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x06,0x0a,0x7e,0x00,0x51,0x01,0xff]
+0xfa,0x06,0x0a,0x7e,0x00,0x51,0x01,0xff
+
+# GFX90A: v_cvt_u32_f64_dpp v5, v[0:1]  row_newbcast:2 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x2a,0x0a,0x7e,0x00,0x52,0x01,0xff]
+0xfa,0x2a,0x0a,0x7e,0x00,0x52,0x01,0xff
+
+# GFX90A: v_floor_f64_dpp v[4:5], v[0:1]  row_newbcast:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x34,0x08,0x7e,0x00,0x5f,0x01,0xff]
+0xfa,0x34,0x08,0x7e,0x00,0x5f,0x01,0xff
+
+# GFX90A: v_fract_f64_dpp v[4:5], v[0:1]  row_newbcast:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x64,0x08,0x7e,0x00,0x51,0x01,0xff]
+0xfa,0x64,0x08,0x7e,0x00,0x51,0x01,0xff
+
+# GFX90A: v_frexp_exp_i32_f64_dpp v5, v[0:1]  row_newbcast:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x60,0x0a,0x7e,0x00,0x51,0x01,0xff]
+0xfa,0x60,0x0a,0x7e,0x00,0x51,0x01,0xff
+
+# GFX90A: v_frexp_mant_f64_dpp v[4:5], v[0:1]  row_newbcast:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x62,0x08,0x7e,0x00,0x51,0x01,0xff]
+0xfa,0x62,0x08,0x7e,0x00,0x51,0x01,0xff
+
+# GFX90A: v_rcp_f64_dpp v[4:5], v[0:1]  row_newbcast:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x4a,0x08,0x7e,0x00,0x51,0x01,0xff]
+0xfa,0x4a,0x08,0x7e,0x00,0x51,0x01,0xff
+
+# GFX90A: v_rndne_f64_dpp v[4:5], v[0:1]  row_newbcast:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x32,0x08,0x7e,0x00,0x51,0x01,0xff]
+0xfa,0x32,0x08,0x7e,0x00,0x51,0x01,0xff
+
+# GFX90A: v_rsq_f64_dpp v[4:5], v[0:1]  row_newbcast:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x4c,0x08,0x7e,0x00,0x51,0x01,0xff]
+0xfa,0x4c,0x08,0x7e,0x00,0x51,0x01,0xff
+
+# GFX90A: v_sqrt_f64_dpp v[4:5], v[0:1]  row_newbcast:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x50,0x08,0x7e,0x00,0x51,0x01,0xff]
+0xfa,0x50,0x08,0x7e,0x00,0x51,0x01,0xff
+
+# GFX90A: v_trunc_f64_dpp v[4:5], v[0:1]  row_newbcast:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x2e,0x08,0x7e,0x00,0x51,0x01,0xff]
+0xfa,0x2e,0x08,0x7e,0x00,0x51,0x01,0xff

diff  --git a/llvm/test/MC/Disassembler/AMDGPU/gfx90a_dasm_features.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx90a_dasm_features.txt
new file mode 100644
index 000000000000..80922b576b56
--- /dev/null
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx90a_dasm_features.txt
@@ -0,0 +1,795 @@
+# RUN: llvm-mc -arch=amdgcn -mcpu=gfx908 -disassemble -show-encoding %s | FileCheck --check-prefix=GFX908 %s
+# RUN: llvm-mc -arch=amdgcn -mcpu=gfx90a -disassemble -show-encoding %s | FileCheck --check-prefix=GFX90A %s
+
+# GFX90A: v_pk_fma_f32 v[8:9], v[0:1], s[0:1], v[4:5] ; encoding: [0x08,0x40,0xb0,0xd3,0x00,0x01,0x10,0x1c]
+0x08,0x40,0xb0,0xd3,0x00,0x01,0x10,0x1c
+
+# GFX90A: v_pk_fma_f32 v[8:9], v[0:1], s[0:1], v[4:5] ; encoding: [0x08,0x40,0xb0,0xd3,0x00,0x01,0x10,0x1c]
+0x08,0x40,0xb0,0xd3,0x00,0x01,0x10,0x1c
+
+# GFX90A: v_pk_fma_f32 v[8:9], v[0:1], s[0:1], v[4:5] ; encoding: [0x08,0x40,0xb0,0xd3,0x00,0x01,0x10,0x1c]
+0x08,0x40,0xb0,0xd3,0x00,0x01,0x10,0x1c
+
+# GFX90A: v_pk_fma_f32 v[8:9], v[0:1], s[0:1], v[4:5] ; encoding: [0x08,0x40,0xb0,0xd3,0x00,0x01,0x10,0x1c]
+0x08,0x40,0xb0,0xd3,0x00,0x01,0x10,0x1c
+
+# GFX90A: v_pk_fma_f32 v[8:9], v[0:1], s[0:1], v[4:5] op_sel_hi:[0,0,0] ; encoding: [0x08,0x00,0xb0,0xd3,0x00,0x01,0x10,0x04]
+0x08,0x00,0xb0,0xd3,0x00,0x01,0x10,0x04
+
+# GFX90A: v_pk_fma_f32 v[8:9], v[0:1], s[0:1], v[4:5] op_sel:[0,0,1] op_sel_hi:[0,0,1] ; encoding: [0x08,0x60,0xb0,0xd3,0x00,0x01,0x10,0x04]
+0x08,0x60,0xb0,0xd3,0x00,0x01,0x10,0x04
+
+# GFX90A: v_pk_fma_f32 v[8:9], v[0:1], s[0:1], v[4:5] neg_lo:[1,1,1] ; encoding: [0x08,0x40,0xb0,0xd3,0x00,0x01,0x10,0xfc]
+0x08,0x40,0xb0,0xd3,0x00,0x01,0x10,0xfc
+
+# GFX90A: v_pk_fma_f32 v[8:9], v[0:1], s[0:1], v[4:5] neg_hi:[1,1,1] ; encoding: [0x08,0x47,0xb0,0xd3,0x00,0x01,0x10,0x1c]
+0x08,0x47,0xb0,0xd3,0x00,0x01,0x10,0x1c
+
+# GFX90A: v_pk_fma_f32 v[8:9], v[0:1], s[0:1], v[4:5] neg_lo:[1,1,1] neg_hi:[1,1,1] ; encoding: [0x08,0x47,0xb0,0xd3,0x00,0x01,0x10,0xfc]
+0x08,0x47,0xb0,0xd3,0x00,0x01,0x10,0xfc
+
+# GFX90A: v_pk_fma_f32 v[8:9], v[0:1], s[0:1], v[4:5] neg_lo:[1,0,0] ; encoding: [0x08,0x40,0xb0,0xd3,0x00,0x01,0x10,0x3c]
+0x08,0x40,0xb0,0xd3,0x00,0x01,0x10,0x3c
+
+# GFX90A: v_pk_fma_f32 v[8:9], v[0:1], s[0:1], v[4:5] neg_lo:[0,1,0] ; encoding: [0x08,0x40,0xb0,0xd3,0x00,0x01,0x10,0x5c]
+0x08,0x40,0xb0,0xd3,0x00,0x01,0x10,0x5c
+
+# GFX90A: v_pk_fma_f32 v[8:9], v[0:1], s[0:1], v[4:5] neg_lo:[0,0,1] ; encoding: [0x08,0x40,0xb0,0xd3,0x00,0x01,0x10,0x9c]
+0x08,0x40,0xb0,0xd3,0x00,0x01,0x10,0x9c
+
+# GFX90A: v_pk_fma_f32 v[8:9], v[0:1], s[0:1], v[4:5] neg_hi:[1,0,0] ; encoding: [0x08,0x41,0xb0,0xd3,0x00,0x01,0x10,0x1c]
+0x08,0x41,0xb0,0xd3,0x00,0x01,0x10,0x1c
+
+# GFX90A: v_pk_fma_f32 v[8:9], v[0:1], s[0:1], v[4:5] neg_hi:[0,1,0] ; encoding: [0x08,0x42,0xb0,0xd3,0x00,0x01,0x10,0x1c]
+0x08,0x42,0xb0,0xd3,0x00,0x01,0x10,0x1c
+
+# GFX90A: v_pk_fma_f32 v[8:9], v[0:1], s[0:1], v[4:5] neg_hi:[0,0,1] ; encoding: [0x08,0x44,0xb0,0xd3,0x00,0x01,0x10,0x1c]
+0x08,0x44,0xb0,0xd3,0x00,0x01,0x10,0x1c
+
+# GFX90A: v_pk_fma_f32 v[8:9], v[0:1], s[0:1], v[4:5] clamp ; encoding: [0x08,0xc0,0xb0,0xd3,0x00,0x01,0x10,0x1c]
+0x08,0xc0,0xb0,0xd3,0x00,0x01,0x10,0x1c
+
+# GFX90A: v_pk_fma_f32 v[0:1], v[4:5], v[8:9], v[16:17] ; encoding: [0x00,0x40,0xb0,0xd3,0x04,0x11,0x42,0x1c]
+0x00,0x40,0xb0,0xd3,0x04,0x11,0x42,0x1c
+
+# GFX90A: v_pk_mul_f32 v[254:255], v[8:9], v[16:17] ; encoding: [0xfe,0x00,0xb1,0xd3,0x08,0x21,0x02,0x18]
+0xfe,0x00,0xb1,0xd3,0x08,0x21,0x02,0x18
+
+# GFX90A: v_pk_mul_f32 v[4:5], v[254:255], v[16:17] ; encoding: [0x04,0x00,0xb1,0xd3,0xfe,0x21,0x02,0x18]
+0x04,0x00,0xb1,0xd3,0xfe,0x21,0x02,0x18
+
+# GFX90A: v_pk_mul_f32 v[4:5], s[2:3], v[16:17] ; encoding: [0x04,0x00,0xb1,0xd3,0x02,0x20,0x02,0x18]
+0x04,0x00,0xb1,0xd3,0x02,0x20,0x02,0x18
+
+# GFX90A: v_pk_mul_f32 v[4:5], s[100:101], v[16:17] ; encoding: [0x04,0x00,0xb1,0xd3,0x64,0x20,0x02,0x18]
+0x04,0x00,0xb1,0xd3,0x64,0x20,0x02,0x18
+
+# GFX90A: v_pk_mul_f32 v[4:5], flat_scratch, v[16:17] ; encoding: [0x04,0x00,0xb1,0xd3,0x66,0x20,0x02,0x18]
+0x04,0x00,0xb1,0xd3,0x66,0x20,0x02,0x18
+
+# GFX90A: v_pk_mul_f32 v[4:5], vcc, v[16:17] ; encoding: [0x04,0x00,0xb1,0xd3,0x6a,0x20,0x02,0x18]
+0x04,0x00,0xb1,0xd3,0x6a,0x20,0x02,0x18
+
+# GFX90A: v_pk_mul_f32 v[4:5], exec, v[16:17] ; encoding: [0x04,0x00,0xb1,0xd3,0x7e,0x20,0x02,0x18]
+0x04,0x00,0xb1,0xd3,0x7e,0x20,0x02,0x18
+
+# GFX90A: v_pk_mul_f32 v[4:5], v[8:9], v[254:255] ; encoding: [0x04,0x00,0xb1,0xd3,0x08,0xfd,0x03,0x18]
+0x04,0x00,0xb1,0xd3,0x08,0xfd,0x03,0x18
+
+# GFX90A: v_pk_mul_f32 v[4:5], v[8:9], s[2:3] ; encoding: [0x04,0x00,0xb1,0xd3,0x08,0x05,0x00,0x18]
+0x04,0x00,0xb1,0xd3,0x08,0x05,0x00,0x18
+
+# GFX90A: v_pk_mul_f32 v[4:5], v[8:9], s[100:101] ; encoding: [0x04,0x00,0xb1,0xd3,0x08,0xc9,0x00,0x18]
+0x04,0x00,0xb1,0xd3,0x08,0xc9,0x00,0x18
+
+# GFX90A: v_pk_mul_f32 v[4:5], v[8:9], flat_scratch ; encoding: [0x04,0x00,0xb1,0xd3,0x08,0xcd,0x00,0x18]
+0x04,0x00,0xb1,0xd3,0x08,0xcd,0x00,0x18
+
+# GFX90A: v_pk_mul_f32 v[4:5], v[8:9], vcc ; encoding: [0x04,0x00,0xb1,0xd3,0x08,0xd5,0x00,0x18]
+0x04,0x00,0xb1,0xd3,0x08,0xd5,0x00,0x18
+
+# GFX90A: v_pk_mul_f32 v[4:5], v[8:9], exec ; encoding: [0x04,0x00,0xb1,0xd3,0x08,0xfd,0x00,0x18]
+0x04,0x00,0xb1,0xd3,0x08,0xfd,0x00,0x18
+
+# GFX90A: v_pk_mul_f32 v[4:5], v[8:9], v[16:17] ; encoding: [0x04,0x00,0xb1,0xd3,0x08,0x21,0x02,0x18]
+0x04,0x00,0xb1,0xd3,0x08,0x21,0x02,0x18
+
+# GFX90A: v_pk_mul_f32 v[4:5], v[8:9], v[16:17] op_sel:[1,0] ; encoding: [0x04,0x08,0xb1,0xd3,0x08,0x21,0x02,0x18]
+0x04,0x08,0xb1,0xd3,0x08,0x21,0x02,0x18
+
+# GFX90A: v_pk_mul_f32 v[4:5], v[8:9], v[16:17] op_sel:[0,1] ; encoding: [0x04,0x10,0xb1,0xd3,0x08,0x21,0x02,0x18]
+0x04,0x10,0xb1,0xd3,0x08,0x21,0x02,0x18
+
+# GFX90A: v_pk_mul_f32 v[4:5], v[8:9], v[16:17] op_sel:[1,1] ; encoding: [0x04,0x18,0xb1,0xd3,0x08,0x21,0x02,0x18]
+0x04,0x18,0xb1,0xd3,0x08,0x21,0x02,0x18
+
+# GFX90A: v_pk_mul_f32 v[4:5], v[8:9], v[16:17] ; encoding: [0x04,0x00,0xb1,0xd3,0x08,0x21,0x02,0x18]
+0x04,0x00,0xb1,0xd3,0x08,0x21,0x02,0x18
+
+# GFX90A: v_pk_mul_f32 v[4:5], v[8:9], v[16:17] op_sel_hi:[0,0] ; encoding: [0x04,0x00,0xb1,0xd3,0x08,0x21,0x02,0x00]
+0x04,0x00,0xb1,0xd3,0x08,0x21,0x02,0x00
+
+# GFX90A: v_pk_mul_f32 v[4:5], v[8:9], v[16:17] op_sel_hi:[1,0] ; encoding: [0x04,0x00,0xb1,0xd3,0x08,0x21,0x02,0x08]
+0x04,0x00,0xb1,0xd3,0x08,0x21,0x02,0x08
+
+# GFX90A: v_pk_mul_f32 v[4:5], v[8:9], v[16:17] op_sel_hi:[0,1] ; encoding: [0x04,0x00,0xb1,0xd3,0x08,0x21,0x02,0x10]
+0x04,0x00,0xb1,0xd3,0x08,0x21,0x02,0x10
+
+# GFX90A: v_pk_mul_f32 v[4:5], v[8:9], v[16:17] neg_lo:[1,0] ; encoding: [0x04,0x00,0xb1,0xd3,0x08,0x21,0x02,0x38]
+0x04,0x00,0xb1,0xd3,0x08,0x21,0x02,0x38
+
+# GFX90A: v_pk_mul_f32 v[4:5], v[8:9], v[16:17] neg_lo:[0,1] ; encoding: [0x04,0x00,0xb1,0xd3,0x08,0x21,0x02,0x58]
+0x04,0x00,0xb1,0xd3,0x08,0x21,0x02,0x58
+
+# GFX90A: v_pk_mul_f32 v[4:5], v[8:9], v[16:17] neg_lo:[1,1] ; encoding: [0x04,0x00,0xb1,0xd3,0x08,0x21,0x02,0x78]
+0x04,0x00,0xb1,0xd3,0x08,0x21,0x02,0x78
+
+# GFX90A: v_pk_mul_f32 v[4:5], v[8:9], v[16:17] neg_hi:[1,0] ; encoding: [0x04,0x01,0xb1,0xd3,0x08,0x21,0x02,0x18]
+0x04,0x01,0xb1,0xd3,0x08,0x21,0x02,0x18
+
+# GFX90A: v_pk_mul_f32 v[4:5], v[8:9], v[16:17] neg_hi:[0,1] ; encoding: [0x04,0x02,0xb1,0xd3,0x08,0x21,0x02,0x18]
+0x04,0x02,0xb1,0xd3,0x08,0x21,0x02,0x18
+
+# GFX90A: v_pk_mul_f32 v[4:5], v[8:9], v[16:17] neg_hi:[1,1] ; encoding: [0x04,0x03,0xb1,0xd3,0x08,0x21,0x02,0x18]
+0x04,0x03,0xb1,0xd3,0x08,0x21,0x02,0x18
+
+# GFX90A: v_pk_mul_f32 v[4:5], v[8:9], v[16:17] clamp ; encoding: [0x04,0x80,0xb1,0xd3,0x08,0x21,0x02,0x18]
+0x04,0x80,0xb1,0xd3,0x08,0x21,0x02,0x18
+
+# GFX90A: v_pk_add_f32 v[254:255], v[8:9], v[16:17] ; encoding: [0xfe,0x00,0xb2,0xd3,0x08,0x21,0x02,0x18]
+0xfe,0x00,0xb2,0xd3,0x08,0x21,0x02,0x18
+
+# GFX90A: v_pk_add_f32 v[4:5], v[254:255], v[16:17] ; encoding: [0x04,0x00,0xb2,0xd3,0xfe,0x21,0x02,0x18]
+0x04,0x00,0xb2,0xd3,0xfe,0x21,0x02,0x18
+
+# GFX90A: v_pk_add_f32 v[4:5], s[2:3], v[16:17] ; encoding: [0x04,0x00,0xb2,0xd3,0x02,0x20,0x02,0x18]
+0x04,0x00,0xb2,0xd3,0x02,0x20,0x02,0x18
+
+# GFX90A: v_pk_add_f32 v[4:5], s[100:101], v[16:17] ; encoding: [0x04,0x00,0xb2,0xd3,0x64,0x20,0x02,0x18]
+0x04,0x00,0xb2,0xd3,0x64,0x20,0x02,0x18
+
+# GFX90A: v_pk_add_f32 v[4:5], flat_scratch, v[16:17] ; encoding: [0x04,0x00,0xb2,0xd3,0x66,0x20,0x02,0x18]
+0x04,0x00,0xb2,0xd3,0x66,0x20,0x02,0x18
+
+# GFX90A: v_pk_add_f32 v[4:5], vcc, v[16:17] ; encoding: [0x04,0x00,0xb2,0xd3,0x6a,0x20,0x02,0x18]
+0x04,0x00,0xb2,0xd3,0x6a,0x20,0x02,0x18
+
+# GFX90A: v_pk_add_f32 v[4:5], exec, v[16:17] ; encoding: [0x04,0x00,0xb2,0xd3,0x7e,0x20,0x02,0x18]
+0x04,0x00,0xb2,0xd3,0x7e,0x20,0x02,0x18
+
+# GFX90A: v_pk_add_f32 v[4:5], v[8:9], v[254:255] ; encoding: [0x04,0x00,0xb2,0xd3,0x08,0xfd,0x03,0x18]
+0x04,0x00,0xb2,0xd3,0x08,0xfd,0x03,0x18
+
+# GFX90A: v_pk_add_f32 v[4:5], v[8:9], s[2:3] ; encoding: [0x04,0x00,0xb2,0xd3,0x08,0x05,0x00,0x18]
+0x04,0x00,0xb2,0xd3,0x08,0x05,0x00,0x18
+
+# GFX90A: v_pk_add_f32 v[4:5], v[8:9], s[100:101] ; encoding: [0x04,0x00,0xb2,0xd3,0x08,0xc9,0x00,0x18]
+0x04,0x00,0xb2,0xd3,0x08,0xc9,0x00,0x18
+
+# GFX90A: v_pk_add_f32 v[4:5], v[8:9], flat_scratch ; encoding: [0x04,0x00,0xb2,0xd3,0x08,0xcd,0x00,0x18]
+0x04,0x00,0xb2,0xd3,0x08,0xcd,0x00,0x18
+
+# GFX90A: v_pk_add_f32 v[4:5], v[8:9], vcc ; encoding: [0x04,0x00,0xb2,0xd3,0x08,0xd5,0x00,0x18]
+0x04,0x00,0xb2,0xd3,0x08,0xd5,0x00,0x18
+
+# GFX90A: v_pk_add_f32 v[4:5], v[8:9], exec ; encoding: [0x04,0x00,0xb2,0xd3,0x08,0xfd,0x00,0x18]
+0x04,0x00,0xb2,0xd3,0x08,0xfd,0x00,0x18
+
+# GFX90A: v_pk_add_f32 v[4:5], v[8:9], v[16:17] ; encoding: [0x04,0x00,0xb2,0xd3,0x08,0x21,0x02,0x18]
+0x04,0x00,0xb2,0xd3,0x08,0x21,0x02,0x18
+
+# GFX90A: v_pk_add_f32 v[4:5], v[8:9], v[16:17] op_sel:[1,0] ; encoding: [0x04,0x08,0xb2,0xd3,0x08,0x21,0x02,0x18]
+0x04,0x08,0xb2,0xd3,0x08,0x21,0x02,0x18
+
+# GFX90A: v_pk_add_f32 v[4:5], v[8:9], v[16:17] op_sel:[0,1] ; encoding: [0x04,0x10,0xb2,0xd3,0x08,0x21,0x02,0x18]
+0x04,0x10,0xb2,0xd3,0x08,0x21,0x02,0x18
+
+# GFX90A: v_pk_add_f32 v[4:5], v[8:9], v[16:17] op_sel:[1,1] ; encoding: [0x04,0x18,0xb2,0xd3,0x08,0x21,0x02,0x18]
+0x04,0x18,0xb2,0xd3,0x08,0x21,0x02,0x18
+
+# GFX90A: v_pk_add_f32 v[4:5], v[8:9], v[16:17] ; encoding: [0x04,0x00,0xb2,0xd3,0x08,0x21,0x02,0x18]
+0x04,0x00,0xb2,0xd3,0x08,0x21,0x02,0x18
+
+# GFX90A: v_pk_add_f32 v[4:5], v[8:9], v[16:17] op_sel_hi:[0,0] ; encoding: [0x04,0x00,0xb2,0xd3,0x08,0x21,0x02,0x00]
+0x04,0x00,0xb2,0xd3,0x08,0x21,0x02,0x00
+
+# GFX90A: v_pk_add_f32 v[4:5], v[8:9], v[16:17] op_sel_hi:[1,0] ; encoding: [0x04,0x00,0xb2,0xd3,0x08,0x21,0x02,0x08]
+0x04,0x00,0xb2,0xd3,0x08,0x21,0x02,0x08
+
+# GFX90A: v_pk_add_f32 v[4:5], v[8:9], v[16:17] op_sel_hi:[0,1] ; encoding: [0x04,0x00,0xb2,0xd3,0x08,0x21,0x02,0x10]
+0x04,0x00,0xb2,0xd3,0x08,0x21,0x02,0x10
+
+# GFX90A: v_pk_add_f32 v[4:5], v[8:9], v[16:17] neg_lo:[1,0] ; encoding: [0x04,0x00,0xb2,0xd3,0x08,0x21,0x02,0x38]
+0x04,0x00,0xb2,0xd3,0x08,0x21,0x02,0x38
+
+# GFX90A: v_pk_add_f32 v[4:5], v[8:9], v[16:17] neg_lo:[0,1] ; encoding: [0x04,0x00,0xb2,0xd3,0x08,0x21,0x02,0x58]
+0x04,0x00,0xb2,0xd3,0x08,0x21,0x02,0x58
+
+# GFX90A: v_pk_add_f32 v[4:5], v[8:9], v[16:17] neg_lo:[1,1] ; encoding: [0x04,0x00,0xb2,0xd3,0x08,0x21,0x02,0x78]
+0x04,0x00,0xb2,0xd3,0x08,0x21,0x02,0x78
+
+# GFX90A: v_pk_add_f32 v[4:5], v[8:9], v[16:17] neg_hi:[1,0] ; encoding: [0x04,0x01,0xb2,0xd3,0x08,0x21,0x02,0x18]
+0x04,0x01,0xb2,0xd3,0x08,0x21,0x02,0x18
+
+# GFX90A: v_pk_add_f32 v[4:5], v[8:9], v[16:17] neg_hi:[0,1] ; encoding: [0x04,0x02,0xb2,0xd3,0x08,0x21,0x02,0x18]
+0x04,0x02,0xb2,0xd3,0x08,0x21,0x02,0x18
+
+# GFX90A: v_pk_add_f32 v[4:5], v[8:9], v[16:17] neg_hi:[1,1] ; encoding: [0x04,0x03,0xb2,0xd3,0x08,0x21,0x02,0x18]
+0x04,0x03,0xb2,0xd3,0x08,0x21,0x02,0x18
+
+# GFX90A: v_pk_add_f32 v[4:5], v[8:9], v[16:17] clamp ; encoding: [0x04,0x80,0xb2,0xd3,0x08,0x21,0x02,0x18]
+0x04,0x80,0xb2,0xd3,0x08,0x21,0x02,0x18
+
+# GFX90A: v_pk_mov_b32 v[0:1], v[2:3], v[4:5] ; encoding: [0x00,0x00,0xb3,0xd3,0x02,0x09,0x02,0x18]
+0x00,0x00,0xb3,0xd3,0x02,0x09,0x02,0x18
+
+# GFX90A: v_pk_mov_b32 v[0:1], flat_scratch, v[4:5] ; encoding: [0x00,0x00,0xb3,0xd3,0x66,0x08,0x02,0x18]
+0x00,0x00,0xb3,0xd3,0x66,0x08,0x02,0x18
+
+# GFX90A: v_pk_mov_b32 v[0:1], v[2:3], vcc ; encoding: [0x00,0x00,0xb3,0xd3,0x02,0xd5,0x00,0x18]
+0x00,0x00,0xb3,0xd3,0x02,0xd5,0x00,0x18
+
+# GFX90A: v_pk_mov_b32 v[0:1], v[2:3], s[0:1] ; encoding: [0x00,0x00,0xb3,0xd3,0x02,0x01,0x00,0x18]
+0x00,0x00,0xb3,0xd3,0x02,0x01,0x00,0x18
+
+# GFX90A: v_pk_mov_b32 v[0:1], v[2:3], v[4:5] op_sel_hi:[0,1] ; encoding: [0x00,0x00,0xb3,0xd3,0x02,0x09,0x02,0x10]
+0x00,0x00,0xb3,0xd3,0x02,0x09,0x02,0x10
+
+# GFX90A: v_pk_mov_b32 v[0:1], v[2:3], v[4:5] op_sel:[1,0] ; encoding: [0x00,0x08,0xb3,0xd3,0x02,0x09,0x02,0x18]
+0x00,0x08,0xb3,0xd3,0x02,0x09,0x02,0x18
+
+# GFX90A: v_pk_mov_b32 v[0:1], v[2:3], v[4:5] op_sel:[1,1] ; encoding: [0x00,0x18,0xb3,0xd3,0x02,0x09,0x02,0x18]
+0x00,0x18,0xb3,0xd3,0x02,0x09,0x02,0x18
+
+# GFX908: tbuffer_load_format_xyzw v[4:7], off, s[0:3], 0 ; encoding: [0x00,0x80,0x09,0xe8,0x00,0x04,0x20,0x80]
+# GFX90A: tbuffer_load_format_xyzw v[4:7], off, s[0:3], 0 scc ; encoding: [0x00,0x80,0x09,0xe8,0x00,0x04,0x20,0x80]
+0x00,0x80,0x09,0xe8,0x00,0x04,0x20,0x80
+
+# GFX908: tbuffer_load_format_xyzw v[4:7], off, s[0:3], 0 glc ; encoding: [0x00,0xc0,0x09,0xe8,0x00,0x04,0x20,0x80]
+# GFX90A: tbuffer_load_format_xyzw v[4:7], off, s[0:3], 0 glc scc ; encoding: [0x00,0xc0,0x09,0xe8,0x00,0x04,0x20,0x80]
+0x00,0xc0,0x09,0xe8,0x00,0x04,0x20,0x80
+
+# GFX908: buffer_load_dword v5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x8f,0x50,0xe0,0x00,0x05,0x02,0x03]
+# GFX90A: buffer_load_dword v5, off, s[8:11], s3 offset:4095 scc ; encoding: [0xff,0x8f,0x50,0xe0,0x00,0x05,0x02,0x03]
+0xff,0x8f,0x50,0xe0,0x00,0x05,0x02,0x03
+
+# GFX908: buffer_load_dword v5, off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0xcf,0x50,0xe0,0x00,0x05,0x02,0x03]
+# GFX90A: buffer_load_dword v5, off, s[8:11], s3 offset:4095 glc scc ; encoding: [0xff,0xcf,0x50,0xe0,0x00,0x05,0x02,0x03]
+0xff,0xcf,0x50,0xe0,0x00,0x05,0x02,0x03
+
+# GFX90A: buffer_wbl2 ; encoding: [0x00,0x00,0xa0,0xe0,0x00,0x00,0x00,0x00]
+0x00,0x00,0xa0,0xe0,0x00,0x00,0x00,0x00
+
+# GFX90A: buffer_invl2 ; encoding: [0x00,0x00,0xa4,0xe0,0x00,0x00,0x00,0x00]
+0x00,0x00,0xa4,0xe0,0x00,0x00,0x00,0x00
+
+# GFX90A: buffer_atomic_add_f64 v[4:5], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x3c,0xe1,0x00,0x04,0x02,0x03]
+0xff,0x0f,0x3c,0xe1,0x00,0x04,0x02,0x03
+
+# GFX90A: buffer_atomic_add_f64 v[4:5], off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x3c,0xe1,0x00,0x04,0x03,0x03]
+0xff,0x0f,0x3c,0xe1,0x00,0x04,0x03,0x03
+
+# GFX90A: buffer_atomic_add_f64 v[4:5], off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x3c,0xe1,0x00,0x04,0x18,0x03]
+0xff,0x0f,0x3c,0xe1,0x00,0x04,0x18,0x03
+
+# GFX90A: buffer_atomic_add_f64 v[4:5], off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x3c,0xe1,0x00,0x04,0x02,0x65]
+0xff,0x0f,0x3c,0xe1,0x00,0x04,0x02,0x65
+
+# GFX90A: buffer_atomic_add_f64 v[4:5], off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x3c,0xe1,0x00,0x04,0x02,0x7c]
+0xff,0x0f,0x3c,0xe1,0x00,0x04,0x02,0x7c
+
+# GFX90A: buffer_atomic_add_f64 v[4:5], off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x3c,0xe1,0x00,0x04,0x02,0x80]
+0xff,0x0f,0x3c,0xe1,0x00,0x04,0x02,0x80
+
+# GFX90A: buffer_atomic_add_f64 v[4:5], off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x3c,0xe1,0x00,0x04,0x02,0xc1]
+0xff,0x0f,0x3c,0xe1,0x00,0x04,0x02,0xc1
+
+# GFX90A: buffer_atomic_add_f64 v[4:5], v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x3c,0xe1,0x00,0x04,0x02,0x03]
+0xff,0x2f,0x3c,0xe1,0x00,0x04,0x02,0x03
+
+# GFX90A: buffer_atomic_add_f64 v[4:5], v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x3c,0xe1,0x00,0x04,0x02,0x03]
+0xff,0x1f,0x3c,0xe1,0x00,0x04,0x02,0x03
+
+# GFX90A: buffer_atomic_add_f64 v[4:5], off, s[8:11], s3 ; encoding: [0x00,0x00,0x3c,0xe1,0x00,0x04,0x02,0x03]
+0x00,0x00,0x3c,0xe1,0x00,0x04,0x02,0x03
+
+# GFX90A: buffer_atomic_add_f64 v[4:5], off, s[8:11], s3 ; encoding: [0x00,0x00,0x3c,0xe1,0x00,0x04,0x02,0x03]
+0x00,0x00,0x3c,0xe1,0x00,0x04,0x02,0x03
+
+# GFX90A: buffer_atomic_add_f64 v[4:5], off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x3c,0xe1,0x00,0x04,0x02,0x03]
+0x07,0x00,0x3c,0xe1,0x00,0x04,0x02,0x03
+
+# GFX90A: buffer_atomic_add_f64 v[4:5], off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x3e,0xe1,0x00,0x04,0x02,0x03]
+0xff,0x0f,0x3e,0xe1,0x00,0x04,0x02,0x03
+
+# GFX90A: buffer_atomic_min_f64 v[4:5], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x40,0xe1,0x00,0x04,0x02,0x03]
+0xff,0x0f,0x40,0xe1,0x00,0x04,0x02,0x03
+
+# GFX90A: buffer_atomic_min_f64 v[4:5], off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x40,0xe1,0x00,0x04,0x03,0x03]
+0xff,0x0f,0x40,0xe1,0x00,0x04,0x03,0x03
+
+# GFX90A: buffer_atomic_min_f64 v[4:5], off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x40,0xe1,0x00,0x04,0x18,0x03]
+0xff,0x0f,0x40,0xe1,0x00,0x04,0x18,0x03
+
+# GFX90A: buffer_atomic_min_f64 v[4:5], off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x40,0xe1,0x00,0x04,0x02,0x65]
+0xff,0x0f,0x40,0xe1,0x00,0x04,0x02,0x65
+
+# GFX90A: buffer_atomic_min_f64 v[4:5], off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x40,0xe1,0x00,0x04,0x02,0x7c]
+0xff,0x0f,0x40,0xe1,0x00,0x04,0x02,0x7c
+
+# GFX90A: buffer_atomic_min_f64 v[4:5], off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x40,0xe1,0x00,0x04,0x02,0x80]
+0xff,0x0f,0x40,0xe1,0x00,0x04,0x02,0x80
+
+# GFX90A: buffer_atomic_min_f64 v[4:5], off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x40,0xe1,0x00,0x04,0x02,0xc1]
+0xff,0x0f,0x40,0xe1,0x00,0x04,0x02,0xc1
+
+# GFX90A: buffer_atomic_min_f64 v[4:5], v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x40,0xe1,0x00,0x04,0x02,0x03]
+0xff,0x2f,0x40,0xe1,0x00,0x04,0x02,0x03
+
+# GFX90A: buffer_atomic_min_f64 v[4:5], v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x40,0xe1,0x00,0x04,0x02,0x03]
+0xff,0x1f,0x40,0xe1,0x00,0x04,0x02,0x03
+
+# GFX90A: buffer_atomic_min_f64 v[4:5], off, s[8:11], s3 ; encoding: [0x00,0x00,0x40,0xe1,0x00,0x04,0x02,0x03]
+0x00,0x00,0x40,0xe1,0x00,0x04,0x02,0x03
+
+# GFX90A: buffer_atomic_min_f64 v[4:5], off, s[8:11], s3 ; encoding: [0x00,0x00,0x40,0xe1,0x00,0x04,0x02,0x03]
+0x00,0x00,0x40,0xe1,0x00,0x04,0x02,0x03
+
+# GFX90A: buffer_atomic_min_f64 v[4:5], off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x40,0xe1,0x00,0x04,0x02,0x03]
+0x07,0x00,0x40,0xe1,0x00,0x04,0x02,0x03
+
+# GFX90A: buffer_atomic_min_f64 v[4:5], off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x42,0xe1,0x00,0x04,0x02,0x03]
+0xff,0x0f,0x42,0xe1,0x00,0x04,0x02,0x03
+
+# GFX90A: buffer_atomic_max_f64 v[4:5], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x44,0xe1,0x00,0x04,0x02,0x03]
+0xff,0x0f,0x44,0xe1,0x00,0x04,0x02,0x03
+
+# GFX90A: buffer_atomic_max_f64 v[4:5], off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x44,0xe1,0x00,0x04,0x03,0x03]
+0xff,0x0f,0x44,0xe1,0x00,0x04,0x03,0x03
+
+# GFX90A: buffer_atomic_max_f64 v[4:5], off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x44,0xe1,0x00,0x04,0x18,0x03]
+0xff,0x0f,0x44,0xe1,0x00,0x04,0x18,0x03
+
+# GFX90A: buffer_atomic_max_f64 v[4:5], off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x44,0xe1,0x00,0x04,0x02,0x65]
+0xff,0x0f,0x44,0xe1,0x00,0x04,0x02,0x65
+
+# GFX90A: buffer_atomic_max_f64 v[4:5], off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x44,0xe1,0x00,0x04,0x02,0x7c]
+0xff,0x0f,0x44,0xe1,0x00,0x04,0x02,0x7c
+
+# GFX90A: buffer_atomic_max_f64 v[4:5], off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x44,0xe1,0x00,0x04,0x02,0x80]
+0xff,0x0f,0x44,0xe1,0x00,0x04,0x02,0x80
+
+# GFX90A: buffer_atomic_max_f64 v[4:5], off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x44,0xe1,0x00,0x04,0x02,0xc1]
+0xff,0x0f,0x44,0xe1,0x00,0x04,0x02,0xc1
+
+# GFX90A: buffer_atomic_max_f64 v[4:5], v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x44,0xe1,0x00,0x04,0x02,0x03]
+0xff,0x2f,0x44,0xe1,0x00,0x04,0x02,0x03
+
+# GFX90A: buffer_atomic_max_f64 v[4:5], v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x44,0xe1,0x00,0x04,0x02,0x03]
+0xff,0x1f,0x44,0xe1,0x00,0x04,0x02,0x03
+
+# GFX90A: buffer_atomic_max_f64 v[4:5], off, s[8:11], s3 ; encoding: [0x00,0x00,0x44,0xe1,0x00,0x04,0x02,0x03]
+0x00,0x00,0x44,0xe1,0x00,0x04,0x02,0x03
+
+# GFX90A: buffer_atomic_max_f64 v[4:5], off, s[8:11], s3 ; encoding: [0x00,0x00,0x44,0xe1,0x00,0x04,0x02,0x03]
+0x00,0x00,0x44,0xe1,0x00,0x04,0x02,0x03
+
+# GFX90A: buffer_atomic_max_f64 v[4:5], off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x44,0xe1,0x00,0x04,0x02,0x03]
+0x07,0x00,0x44,0xe1,0x00,0x04,0x02,0x03
+
+# GFX90A: buffer_atomic_max_f64 v[4:5], off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x46,0xe1,0x00,0x04,0x02,0x03]
+0xff,0x0f,0x46,0xe1,0x00,0x04,0x02,0x03
+
+# GFX90A: ds_add_f64 v1, v[2:3] offset:65535 ; encoding: [0xff,0xff,0xb8,0xd8,0x01,0x02,0x00,0x00]
+0xff,0xff,0xb8,0xd8,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_add_f64 v255, v[2:3] offset:65535 ; encoding: [0xff,0xff,0xb8,0xd8,0xff,0x02,0x00,0x00]
+0xff,0xff,0xb8,0xd8,0xff,0x02,0x00,0x00
+
+# GFX90A: ds_add_f64 v1, v[254:255] offset:65535 ; encoding: [0xff,0xff,0xb8,0xd8,0x01,0xfe,0x00,0x00]
+0xff,0xff,0xb8,0xd8,0x01,0xfe,0x00,0x00
+
+# GFX90A: ds_add_f64 v1, v[2:3] ; encoding: [0x00,0x00,0xb8,0xd8,0x01,0x02,0x00,0x00]
+0x00,0x00,0xb8,0xd8,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_add_f64 v1, v[2:3] ; encoding: [0x00,0x00,0xb8,0xd8,0x01,0x02,0x00,0x00]
+0x00,0x00,0xb8,0xd8,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_add_f64 v1, v[2:3] offset:4 ; encoding: [0x04,0x00,0xb8,0xd8,0x01,0x02,0x00,0x00]
+0x04,0x00,0xb8,0xd8,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_add_f64 v1, v[2:3] offset:65535 gds ; encoding: [0xff,0xff,0xb9,0xd8,0x01,0x02,0x00,0x00]
+0xff,0xff,0xb9,0xd8,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_add_rtn_f64 v[4:5], v1, v[2:3] offset:65535 ; encoding: [0xff,0xff,0xf8,0xd8,0x01,0x02,0x00,0x04]
+0xff,0xff,0xf8,0xd8,0x01,0x02,0x00,0x04
+
+# GFX90A: ds_add_rtn_f64 v[254:255], v1, v[2:3] offset:65535 ; encoding: [0xff,0xff,0xf8,0xd8,0x01,0x02,0x00,0xfe]
+0xff,0xff,0xf8,0xd8,0x01,0x02,0x00,0xfe
+
+# GFX90A: ds_add_rtn_f64 v[4:5], v255, v[2:3] offset:65535 ; encoding: [0xff,0xff,0xf8,0xd8,0xff,0x02,0x00,0x04]
+0xff,0xff,0xf8,0xd8,0xff,0x02,0x00,0x04
+
+# GFX90A: ds_add_rtn_f64 v[4:5], v1, v[254:255] offset:65535 ; encoding: [0xff,0xff,0xf8,0xd8,0x01,0xfe,0x00,0x04]
+0xff,0xff,0xf8,0xd8,0x01,0xfe,0x00,0x04
+
+# GFX90A: ds_add_rtn_f64 v[4:5], v1, v[2:3] ; encoding: [0x00,0x00,0xf8,0xd8,0x01,0x02,0x00,0x04]
+0x00,0x00,0xf8,0xd8,0x01,0x02,0x00,0x04
+
+# GFX90A: ds_add_rtn_f64 v[4:5], v1, v[2:3] ; encoding: [0x00,0x00,0xf8,0xd8,0x01,0x02,0x00,0x04]
+0x00,0x00,0xf8,0xd8,0x01,0x02,0x00,0x04
+
+# GFX90A: ds_add_rtn_f64 v[4:5], v1, v[2:3] offset:4 ; encoding: [0x04,0x00,0xf8,0xd8,0x01,0x02,0x00,0x04]
+0x04,0x00,0xf8,0xd8,0x01,0x02,0x00,0x04
+
+# GFX90A: ds_add_rtn_f64 v[4:5], v1, v[2:3] offset:65535 gds ; encoding: [0xff,0xff,0xf9,0xd8,0x01,0x02,0x00,0x04]
+0xff,0xff,0xf9,0xd8,0x01,0x02,0x00,0x04
+
+# GFX908: flat_load_dword v0, v[0:1] ; encoding: [0x00,0x00,0x50,0xde,0x00,0x00,0x00,0x00]
+# GFX90A: flat_load_dword v0, v[0:1] scc ; encoding: [0x00,0x00,0x50,0xde,0x00,0x00,0x00,0x00]
+0x00,0x00,0x50,0xde,0x00,0x00,0x00,0x00
+
+# GFX908: flat_load_dword v0, v[0:1] glc ; encoding: [0x00,0x00,0x51,0xde,0x00,0x00,0x00,0x00]
+# GFX90A: flat_load_dword v0, v[0:1] glc scc ; encoding: [0x00,0x00,0x51,0xde,0x00,0x00,0x00,0x00]
+0x00,0x00,0x51,0xde,0x00,0x00,0x00,0x00
+
+# GFX90A: flat_atomic_add_f64 v[0:1], v[2:3] offset:4095 ; encoding: [0xff,0x0f,0x3c,0xdd,0x00,0x02,0x00,0x00]
+0xff,0x0f,0x3c,0xdd,0x00,0x02,0x00,0x00
+
+# GFX90A: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] offset:4095 glc scc ; encoding: [0xff,0x0f,0x3d,0xdf,0x00,0x02,0x00,0x00]
+0xff,0x0f,0x3d,0xdf,0x00,0x02,0x00,0x00
+
+# GFX90A: flat_atomic_add_f64 v[254:255], v[2:3] offset:4095 ; encoding: [0xff,0x0f,0x3c,0xdd,0xfe,0x02,0x00,0x00]
+0xff,0x0f,0x3c,0xdd,0xfe,0x02,0x00,0x00
+
+# GFX90A: flat_atomic_add_f64 v[0:1], v[254:255] offset:4095 ; encoding: [0xff,0x0f,0x3c,0xdd,0x00,0xfe,0x00,0x00]
+0xff,0x0f,0x3c,0xdd,0x00,0xfe,0x00,0x00
+
+# GFX90A: flat_atomic_add_f64 v[0:1], v[2:3] ; encoding: [0x00,0x00,0x3c,0xdd,0x00,0x02,0x00,0x00]
+0x00,0x00,0x3c,0xdd,0x00,0x02,0x00,0x00
+
+# GFX90A: flat_atomic_add_f64 v[0:1], v[2:3] ; encoding: [0x00,0x00,0x3c,0xdd,0x00,0x02,0x00,0x00]
+0x00,0x00,0x3c,0xdd,0x00,0x02,0x00,0x00
+
+# GFX90A: flat_atomic_add_f64 v[0:1], v[2:3] offset:7 ; encoding: [0x07,0x00,0x3c,0xdd,0x00,0x02,0x00,0x00]
+0x07,0x00,0x3c,0xdd,0x00,0x02,0x00,0x00
+
+# GFX90A: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0x3d,0xdd,0x00,0x02,0x00,0x00]
+0xff,0x0f,0x3d,0xdd,0x00,0x02,0x00,0x00
+
+# GFX90A: flat_atomic_add_f64 v[0:1], v[2:3] offset:4095 slc ; encoding: [0xff,0x0f,0x3e,0xdd,0x00,0x02,0x00,0x00]
+0xff,0x0f,0x3e,0xdd,0x00,0x02,0x00,0x00
+
+# GFX90A: flat_atomic_min_f64 v[0:1], v[2:3] offset:4095 ; encoding: [0xff,0x0f,0x40,0xdd,0x00,0x02,0x00,0x00]
+0xff,0x0f,0x40,0xdd,0x00,0x02,0x00,0x00
+
+# GFX90A: flat_atomic_min_f64 v[254:255], v[2:3] offset:4095 ; encoding: [0xff,0x0f,0x40,0xdd,0xfe,0x02,0x00,0x00]
+0xff,0x0f,0x40,0xdd,0xfe,0x02,0x00,0x00
+
+# GFX90A: flat_atomic_min_f64 v[0:1], v[254:255] offset:4095 ; encoding: [0xff,0x0f,0x40,0xdd,0x00,0xfe,0x00,0x00]
+0xff,0x0f,0x40,0xdd,0x00,0xfe,0x00,0x00
+
+# GFX90A: flat_atomic_min_f64 v[0:1], v[2:3] ; encoding: [0x00,0x00,0x40,0xdd,0x00,0x02,0x00,0x00]
+0x00,0x00,0x40,0xdd,0x00,0x02,0x00,0x00
+
+# GFX90A: flat_atomic_min_f64 v[0:1], v[2:3] ; encoding: [0x00,0x00,0x40,0xdd,0x00,0x02,0x00,0x00]
+0x00,0x00,0x40,0xdd,0x00,0x02,0x00,0x00
+
+# GFX90A: flat_atomic_min_f64 v[0:1], v[2:3] offset:7 ; encoding: [0x07,0x00,0x40,0xdd,0x00,0x02,0x00,0x00]
+0x07,0x00,0x40,0xdd,0x00,0x02,0x00,0x00
+
+# GFX90A: flat_atomic_min_f64 v[0:1], v[0:1], v[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0x41,0xdd,0x00,0x02,0x00,0x00]
+0xff,0x0f,0x41,0xdd,0x00,0x02,0x00,0x00
+
+# GFX90A: flat_atomic_min_f64 v[0:1], v[2:3] offset:4095 slc ; encoding: [0xff,0x0f,0x42,0xdd,0x00,0x02,0x00,0x00]
+0xff,0x0f,0x42,0xdd,0x00,0x02,0x00,0x00
+
+# GFX90A: flat_atomic_max_f64 v[0:1], v[2:3] offset:4095 ; encoding: [0xff,0x0f,0x44,0xdd,0x00,0x02,0x00,0x00]
+0xff,0x0f,0x44,0xdd,0x00,0x02,0x00,0x00
+
+# GFX90A: flat_atomic_max_f64 v[254:255], v[2:3] offset:4095 ; encoding: [0xff,0x0f,0x44,0xdd,0xfe,0x02,0x00,0x00]
+0xff,0x0f,0x44,0xdd,0xfe,0x02,0x00,0x00
+
+# GFX90A: flat_atomic_max_f64 v[0:1], v[254:255] offset:4095 ; encoding: [0xff,0x0f,0x44,0xdd,0x00,0xfe,0x00,0x00]
+0xff,0x0f,0x44,0xdd,0x00,0xfe,0x00,0x00
+
+# GFX90A: flat_atomic_max_f64 v[0:1], v[2:3] ; encoding: [0x00,0x00,0x44,0xdd,0x00,0x02,0x00,0x00]
+0x00,0x00,0x44,0xdd,0x00,0x02,0x00,0x00
+
+# GFX90A: flat_atomic_max_f64 v[0:1], v[2:3] ; encoding: [0x00,0x00,0x44,0xdd,0x00,0x02,0x00,0x00]
+0x00,0x00,0x44,0xdd,0x00,0x02,0x00,0x00
+
+# GFX90A: flat_atomic_max_f64 v[0:1], v[2:3] offset:7 ; encoding: [0x07,0x00,0x44,0xdd,0x00,0x02,0x00,0x00]
+0x07,0x00,0x44,0xdd,0x00,0x02,0x00,0x00
+
+# GFX90A: flat_atomic_max_f64 v[0:1], v[0:1], v[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0x45,0xdd,0x00,0x02,0x00,0x00]
+0xff,0x0f,0x45,0xdd,0x00,0x02,0x00,0x00
+
+# GFX90A: flat_atomic_max_f64 v[0:1], v[2:3] offset:4095 slc ; encoding: [0xff,0x0f,0x46,0xdd,0x00,0x02,0x00,0x00]
+0xff,0x0f,0x46,0xdd,0x00,0x02,0x00,0x00
+
+# GFX90A: global_atomic_add_f64 v[0:1], v[2:3], off ; encoding: [0x00,0x80,0x3c,0xdd,0x00,0x02,0x7f,0x00]
+0x00,0x80,0x3c,0xdd,0x00,0x02,0x7f,0x00
+
+# GFX90A: global_atomic_min_f64 v[0:1], v[2:3], off ; encoding: [0x00,0x80,0x40,0xdd,0x00,0x02,0x7f,0x00]
+0x00,0x80,0x40,0xdd,0x00,0x02,0x7f,0x00
+
+# GFX90A: global_atomic_max_f64 v[0:1], v[2:3], off ; encoding: [0x00,0x80,0x44,0xdd,0x00,0x02,0x7f,0x00]
+0x00,0x80,0x44,0xdd,0x00,0x02,0x7f,0x00
+
+# GFX908: image_load v[0:4], v2, s[0:7] dmask:0xf unorm tfe ; encoding: [0x80,0x1f,0x01,0xf0,0x02,0x00,0x00,0x00]
+# GFX90A: image_load a0, v2, s[0:7] dmask:0xf unorm scc ; encoding: [0x80,0x1f,0x01,0xf0,0x02,0x00,0x00,0x00]
+0x80,0x1f,0x01,0xf0,0x02,0x00,0x00,0x00
+
+# GFX90A: v_fmac_f64_e32 v[4:5], v[2:3], v[4:5] ; encoding: [0x02,0x09,0x08,0x08]
+0x02,0x09,0x08,0x08
+
+# GFX90A: v_fmac_f64_e32 v[254:255], v[2:3], v[4:5] ; encoding: [0x02,0x09,0xfc,0x09]
+0x02,0x09,0xfc,0x09
+
+# GFX90A: v_fmac_f64_e32 v[4:5], v[254:255], v[4:5] ; encoding: [0xfe,0x09,0x08,0x08]
+0xfe,0x09,0x08,0x08
+
+# GFX90A: v_fmac_f64_e32 v[4:5], flat_scratch, v[4:5] ; encoding: [0x66,0x08,0x08,0x08]
+0x66,0x08,0x08,0x08
+
+# GFX90A: v_fmac_f64_e32 v[4:5], vcc, v[4:5] ; encoding: [0x6a,0x08,0x08,0x08]
+0x6a,0x08,0x08,0x08
+
+# GFX90A: v_fmac_f64_e32 v[4:5], exec, v[4:5] ; encoding: [0x7e,0x08,0x08,0x08]
+0x7e,0x08,0x08,0x08
+
+# GFX90A: v_fmac_f64_e32 v[4:5], 0, v[4:5] ; encoding: [0x80,0x08,0x08,0x08]
+0x80,0x08,0x08,0x08
+
+# GFX90A: v_fmac_f64_e32 v[4:5], -1, v[4:5] ; encoding: [0xc1,0x08,0x08,0x08]
+0xc1,0x08,0x08,0x08
+
+# GFX90A: v_fmac_f64_e32 v[4:5], 0.5, v[4:5] ; encoding: [0xf0,0x08,0x08,0x08]
+0xf0,0x08,0x08,0x08
+
+# GFX90A: v_fmac_f64_e32 v[4:5], -4.0, v[4:5] ; encoding: [0xf7,0x08,0x08,0x08]
+0xf7,0x08,0x08,0x08
+
+# GFX90A: v_fmac_f64_e32 v[4:5], 0xaf123456, v[4:5] ; encoding: [0xff,0x08,0x08,0x08,0x56,0x34,0x12,0xaf]
+0xff,0x08,0x08,0x08,0x56,0x34,0x12,0xaf
+
+# GFX90A: v_fmac_f64_e32 v[4:5], 0x3f717273, v[4:5] ; encoding: [0xff,0x08,0x08,0x08,0x73,0x72,0x71,0x3f]
+0xff,0x08,0x08,0x08,0x73,0x72,0x71,0x3f
+
+# GFX90A: v_fmac_f64_e32 v[4:5], v[2:3], v[254:255] ; encoding: [0x02,0xfd,0x09,0x08]
+0x02,0xfd,0x09,0x08
+
+# GFX90A: v_fmac_f64_e64 v[4:5], v[2:3], v[8:9] ; encoding: [0x04,0x00,0x04,0xd1,0x02,0x11,0x02,0x00]
+0x04,0x00,0x04,0xd1,0x02,0x11,0x02,0x00
+
+# GFX90A: v_fmac_f64_e64 v[254:255], v[2:3], v[8:9] ; encoding: [0xfe,0x00,0x04,0xd1,0x02,0x11,0x02,0x00]
+0xfe,0x00,0x04,0xd1,0x02,0x11,0x02,0x00
+
+# GFX90A: v_fmac_f64_e64 v[4:5], v[254:255], v[8:9] ; encoding: [0x04,0x00,0x04,0xd1,0xfe,0x11,0x02,0x00]
+0x04,0x00,0x04,0xd1,0xfe,0x11,0x02,0x00
+
+# GFX90A: v_fmac_f64_e64 v[4:5], flat_scratch, v[8:9] ; encoding: [0x04,0x00,0x04,0xd1,0x66,0x10,0x02,0x00]
+0x04,0x00,0x04,0xd1,0x66,0x10,0x02,0x00
+
+# GFX90A: v_fmac_f64_e64 v[4:5], vcc, v[8:9] ; encoding: [0x04,0x00,0x04,0xd1,0x6a,0x10,0x02,0x00]
+0x04,0x00,0x04,0xd1,0x6a,0x10,0x02,0x00
+
+# GFX90A: v_fmac_f64_e64 v[4:5], exec, v[8:9] ; encoding: [0x04,0x00,0x04,0xd1,0x7e,0x10,0x02,0x00]
+0x04,0x00,0x04,0xd1,0x7e,0x10,0x02,0x00
+
+# GFX90A: v_fmac_f64_e64 v[4:5], 0, v[8:9] ; encoding: [0x04,0x00,0x04,0xd1,0x80,0x10,0x02,0x00]
+0x04,0x00,0x04,0xd1,0x80,0x10,0x02,0x00
+
+# GFX90A: v_fmac_f64_e64 v[4:5], -1, v[8:9] ; encoding: [0x04,0x00,0x04,0xd1,0xc1,0x10,0x02,0x00]
+0x04,0x00,0x04,0xd1,0xc1,0x10,0x02,0x00
+
+# GFX90A: v_fmac_f64_e64 v[4:5], 0.5, v[8:9] ; encoding: [0x04,0x00,0x04,0xd1,0xf0,0x10,0x02,0x00]
+0x04,0x00,0x04,0xd1,0xf0,0x10,0x02,0x00
+
+# GFX90A: v_fmac_f64_e64 v[4:5], -4.0, v[8:9] ; encoding: [0x04,0x00,0x04,0xd1,0xf7,0x10,0x02,0x00]
+0x04,0x00,0x04,0xd1,0xf7,0x10,0x02,0x00
+
+# GFX90A: v_fmac_f64_e64 v[4:5], v[2:3], v[254:255] ; encoding: [0x04,0x00,0x04,0xd1,0x02,0xfd,0x03,0x00]
+0x04,0x00,0x04,0xd1,0x02,0xfd,0x03,0x00
+
+# GFX90A: v_fmac_f64_e64 v[4:5], v[2:3], flat_scratch ; encoding: [0x04,0x00,0x04,0xd1,0x02,0xcd,0x00,0x00]
+0x04,0x00,0x04,0xd1,0x02,0xcd,0x00,0x00
+
+# GFX90A: v_fmac_f64_e64 v[4:5], v[2:3], vcc ; encoding: [0x04,0x00,0x04,0xd1,0x02,0xd5,0x00,0x00]
+0x04,0x00,0x04,0xd1,0x02,0xd5,0x00,0x00
+
+# GFX90A: v_fmac_f64_e64 v[4:5], v[2:3], exec ; encoding: [0x04,0x00,0x04,0xd1,0x02,0xfd,0x00,0x00]
+0x04,0x00,0x04,0xd1,0x02,0xfd,0x00,0x00
+
+# GFX90A: v_fmac_f64_e64 v[4:5], v[2:3], 0 ; encoding: [0x04,0x00,0x04,0xd1,0x02,0x01,0x01,0x00]
+0x04,0x00,0x04,0xd1,0x02,0x01,0x01,0x00
+
+# GFX90A: v_fmac_f64_e64 v[4:5], v[2:3], -1 ; encoding: [0x04,0x00,0x04,0xd1,0x02,0x83,0x01,0x00]
+0x04,0x00,0x04,0xd1,0x02,0x83,0x01,0x00
+
+# GFX90A: v_fmac_f64_e64 v[4:5], v[2:3], 0.5 ; encoding: [0x04,0x00,0x04,0xd1,0x02,0xe1,0x01,0x00]
+0x04,0x00,0x04,0xd1,0x02,0xe1,0x01,0x00
+
+# GFX90A: v_fmac_f64_e64 v[4:5], v[2:3], -4.0 ; encoding: [0x04,0x00,0x04,0xd1,0x02,0xef,0x01,0x00]
+0x04,0x00,0x04,0xd1,0x02,0xef,0x01,0x00
+
+# GFX90A: v_fmac_f64_e64 v[4:5], -v[2:3], v[8:9] ; encoding: [0x04,0x00,0x04,0xd1,0x02,0x11,0x02,0x20]
+0x04,0x00,0x04,0xd1,0x02,0x11,0x02,0x20
+
+# GFX90A: v_fmac_f64_e64 v[4:5], v[2:3], -v[8:9] ; encoding: [0x04,0x00,0x04,0xd1,0x02,0x11,0x02,0x40]
+0x04,0x00,0x04,0xd1,0x02,0x11,0x02,0x40
+
+# GFX90A: v_fmac_f64_e64 v[4:5], -v[2:3], -v[8:9] ; encoding: [0x04,0x00,0x04,0xd1,0x02,0x11,0x02,0x60]
+0x04,0x00,0x04,0xd1,0x02,0x11,0x02,0x60
+
+# GFX90A: v_fmac_f64_e64 v[4:5], |v[2:3]|, v[8:9] ; encoding: [0x04,0x01,0x04,0xd1,0x02,0x11,0x02,0x00]
+0x04,0x01,0x04,0xd1,0x02,0x11,0x02,0x00
+
+# GFX90A: v_fmac_f64_e64 v[4:5], v[2:3], |v[8:9]| ; encoding: [0x04,0x02,0x04,0xd1,0x02,0x11,0x02,0x00]
+0x04,0x02,0x04,0xd1,0x02,0x11,0x02,0x00
+
+# GFX90A: v_fmac_f64_e64 v[4:5], |v[2:3]|, |v[8:9]| ; encoding: [0x04,0x03,0x04,0xd1,0x02,0x11,0x02,0x00]
+0x04,0x03,0x04,0xd1,0x02,0x11,0x02,0x00
+
+# GFX90A: v_fmac_f64_e64 v[4:5], v[2:3], v[8:9] clamp ; encoding: [0x04,0x80,0x04,0xd1,0x02,0x11,0x02,0x00]
+0x04,0x80,0x04,0xd1,0x02,0x11,0x02,0x00
+
+# GFX90A: v_fmac_f64_e64 v[4:5], v[2:3], v[8:9] mul:2 ; encoding: [0x04,0x00,0x04,0xd1,0x02,0x11,0x02,0x08]
+0x04,0x00,0x04,0xd1,0x02,0x11,0x02,0x08
+
+# GFX90A: v_fmac_f64_e64 v[4:5], v[2:3], v[8:9] mul:4 ; encoding: [0x04,0x00,0x04,0xd1,0x02,0x11,0x02,0x10]
+0x04,0x00,0x04,0xd1,0x02,0x11,0x02,0x10
+
+# GFX90A: v_fmac_f64_e64 v[4:5], v[2:3], v[8:9] div:2 ; encoding: [0x04,0x00,0x04,0xd1,0x02,0x11,0x02,0x18]
+0x04,0x00,0x04,0xd1,0x02,0x11,0x02,0x18
+
+# GFX90A: v_mul_legacy_f32_e64 v5, v1, v2 ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0x05,0x02,0x00]
+0x05,0x00,0xa1,0xd2,0x01,0x05,0x02,0x00
+
+# GFX90A: v_mul_legacy_f32_e64 v255, v1, v2 ; encoding: [0xff,0x00,0xa1,0xd2,0x01,0x05,0x02,0x00]
+0xff,0x00,0xa1,0xd2,0x01,0x05,0x02,0x00
+
+# GFX90A: v_mul_legacy_f32_e64 v5, v255, v2 ; encoding: [0x05,0x00,0xa1,0xd2,0xff,0x05,0x02,0x00]
+0x05,0x00,0xa1,0xd2,0xff,0x05,0x02,0x00
+
+# GFX90A: v_mul_legacy_f32_e64 v5, s1, v2 ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0x04,0x02,0x00]
+0x05,0x00,0xa1,0xd2,0x01,0x04,0x02,0x00
+
+# GFX90A: v_mul_legacy_f32_e64 v5, s101, v2 ; encoding: [0x05,0x00,0xa1,0xd2,0x65,0x04,0x02,0x00]
+0x05,0x00,0xa1,0xd2,0x65,0x04,0x02,0x00
+
+# GFX90A: v_mul_legacy_f32_e64 v5, vcc_lo, v2 ; encoding: [0x05,0x00,0xa1,0xd2,0x6a,0x04,0x02,0x00]
+0x05,0x00,0xa1,0xd2,0x6a,0x04,0x02,0x00
+
+# GFX90A: v_mul_legacy_f32_e64 v5, vcc_hi, v2 ; encoding: [0x05,0x00,0xa1,0xd2,0x6b,0x04,0x02,0x00]
+0x05,0x00,0xa1,0xd2,0x6b,0x04,0x02,0x00
+
+# GFX90A: v_mul_legacy_f32_e64 v5, m0, v2 ; encoding: [0x05,0x00,0xa1,0xd2,0x7c,0x04,0x02,0x00]
+0x05,0x00,0xa1,0xd2,0x7c,0x04,0x02,0x00
+
+# GFX90A: v_mul_legacy_f32_e64 v5, exec_lo, v2 ; encoding: [0x05,0x00,0xa1,0xd2,0x7e,0x04,0x02,0x00]
+0x05,0x00,0xa1,0xd2,0x7e,0x04,0x02,0x00
+
+# GFX90A: v_mul_legacy_f32_e64 v5, exec_hi, v2 ; encoding: [0x05,0x00,0xa1,0xd2,0x7f,0x04,0x02,0x00]
+0x05,0x00,0xa1,0xd2,0x7f,0x04,0x02,0x00
+
+# GFX90A: v_mul_legacy_f32_e64 v5, 0, v2  ; encoding: [0x05,0x00,0xa1,0xd2,0x80,0x04,0x02,0x00]
+0x05,0x00,0xa1,0xd2,0x80,0x04,0x02,0x00
+
+# GFX90A: v_mul_legacy_f32_e64 v5, -1, v2 ; encoding: [0x05,0x00,0xa1,0xd2,0xc1,0x04,0x02,0x00]
+0x05,0x00,0xa1,0xd2,0xc1,0x04,0x02,0x00
+
+# GFX90A: v_mul_legacy_f32_e64 v5, 0.5, v2 ; encoding: [0x05,0x00,0xa1,0xd2,0xf0,0x04,0x02,0x00]
+0x05,0x00,0xa1,0xd2,0xf0,0x04,0x02,0x00
+
+# GFX90A: v_mul_legacy_f32_e64 v5, -4.0, v2 ; encoding: [0x05,0x00,0xa1,0xd2,0xf7,0x04,0x02,0x00]
+0x05,0x00,0xa1,0xd2,0xf7,0x04,0x02,0x00
+
+# GFX90A: v_mul_legacy_f32_e64 v5, v1, v255 ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0xff,0x03,0x00]
+0x05,0x00,0xa1,0xd2,0x01,0xff,0x03,0x00
+
+# GFX90A: v_mul_legacy_f32_e64 v5, v1, s2 ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0x05,0x00,0x00]
+0x05,0x00,0xa1,0xd2,0x01,0x05,0x00,0x00
+
+# GFX90A: v_mul_legacy_f32_e64 v5, v1, s101 ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0xcb,0x00,0x00]
+0x05,0x00,0xa1,0xd2,0x01,0xcb,0x00,0x00
+
+# GFX90A: v_mul_legacy_f32_e64 v5, v1, vcc_lo ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0xd5,0x00,0x00]
+0x05,0x00,0xa1,0xd2,0x01,0xd5,0x00,0x00
+
+# GFX90A: v_mul_legacy_f32_e64 v5, v1, vcc_hi ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0xd7,0x00,0x00]
+0x05,0x00,0xa1,0xd2,0x01,0xd7,0x00,0x00
+
+# GFX90A: v_mul_legacy_f32_e64 v5, v1, m0 ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0xf9,0x00,0x00]
+0x05,0x00,0xa1,0xd2,0x01,0xf9,0x00,0x00
+
+# GFX90A: v_mul_legacy_f32_e64 v5, v1, exec_lo ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0xfd,0x00,0x00]
+0x05,0x00,0xa1,0xd2,0x01,0xfd,0x00,0x00
+
+# GFX90A: v_mul_legacy_f32_e64 v5, v1, exec_hi ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0xff,0x00,0x00]
+0x05,0x00,0xa1,0xd2,0x01,0xff,0x00,0x00
+
+# GFX90A: v_mul_legacy_f32_e64 v5, v1, 0  ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0x01,0x01,0x00]
+0x05,0x00,0xa1,0xd2,0x01,0x01,0x01,0x00
+
+# GFX90A: v_mul_legacy_f32_e64 v5, v1, -1 ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0x83,0x01,0x00]
+0x05,0x00,0xa1,0xd2,0x01,0x83,0x01,0x00
+
+# GFX90A: v_mul_legacy_f32_e64 v5, v1, 0.5 ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0xe1,0x01,0x00]
+0x05,0x00,0xa1,0xd2,0x01,0xe1,0x01,0x00
+
+# GFX90A: v_mul_legacy_f32_e64 v5, v1, -4.0 ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0xef,0x01,0x00]
+0x05,0x00,0xa1,0xd2,0x01,0xef,0x01,0x00
+
+# GFX90A: v_mul_legacy_f32_e64 v5, -v1, v2 ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0x05,0x02,0x20]
+0x05,0x00,0xa1,0xd2,0x01,0x05,0x02,0x20
+
+# GFX90A: v_mul_legacy_f32_e64 v5, v1, -v2 ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0x05,0x02,0x40]
+0x05,0x00,0xa1,0xd2,0x01,0x05,0x02,0x40
+
+# GFX90A: v_mul_legacy_f32_e64 v5, -v1, -v2 ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0x05,0x02,0x60]
+0x05,0x00,0xa1,0xd2,0x01,0x05,0x02,0x60
+
+# GFX90A: v_mul_legacy_f32_e64 v5, |v1|, v2 ; encoding: [0x05,0x01,0xa1,0xd2,0x01,0x05,0x02,0x00]
+0x05,0x01,0xa1,0xd2,0x01,0x05,0x02,0x00
+
+# GFX90A: v_mul_legacy_f32_e64 v5, v1, |v2| ; encoding: [0x05,0x02,0xa1,0xd2,0x01,0x05,0x02,0x00]
+0x05,0x02,0xa1,0xd2,0x01,0x05,0x02,0x00
+
+# GFX90A: v_mul_legacy_f32_e64 v5, |v1|, |v2| ; encoding: [0x05,0x03,0xa1,0xd2,0x01,0x05,0x02,0x00]
+0x05,0x03,0xa1,0xd2,0x01,0x05,0x02,0x00
+
+# GFX90A: v_mul_legacy_f32_e64 v5, v1, v2 clamp ; encoding: [0x05,0x80,0xa1,0xd2,0x01,0x05,0x02,0x00]
+0x05,0x80,0xa1,0xd2,0x01,0x05,0x02,0x00
+
+# GFX90A: v_mul_legacy_f32_e64 v5, v1, v2 mul:2 ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0x05,0x02,0x08]
+0x05,0x00,0xa1,0xd2,0x01,0x05,0x02,0x08
+
+# GFX90A: v_mul_legacy_f32_e64 v5, v1, v2 mul:4 ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0x05,0x02,0x10]
+0x05,0x00,0xa1,0xd2,0x01,0x05,0x02,0x10
+
+# GFX90A: v_mul_legacy_f32_e64 v5, v1, v2 div:2 ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0x05,0x02,0x18]
+0x05,0x00,0xa1,0xd2,0x01,0x05,0x02,0x18
+
+# GFX90A: v_xor_b32_dpp v6, v29, v27  row_newbcast:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x36,0x0c,0x2a,0x1d,0x50,0x01,0xff]
+0xfa,0x36,0x0c,0x2a,0x1d,0x50,0x01,0xff
+
+# GFX90A: v_xor_b32_dpp v6, v29, v27  row_newbcast:7 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x36,0x0c,0x2a,0x1d,0x57,0x01,0xff]
+0xfa,0x36,0x0c,0x2a,0x1d,0x57,0x01,0xff
+
+# GFX90A: v_xor_b32_dpp v6, v29, v27  row_newbcast:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x36,0x0c,0x2a,0x1d,0x5f,0x01,0xff]
+0xfa,0x36,0x0c,0x2a,0x1d,0x5f,0x01,0xff
+
+# GFX90A: buffer_atomic_add_f32 v0, v2, s[4:7], 0 idxen glc ; encoding: [0x00,0x60,0x34,0xe1,0x02,0x00,0x01,0x80]
+0x00,0x60,0x34,0xe1,0x02,0x00,0x01,0x80
+
+# GFX90A: buffer_atomic_add_f32 v0, v2, s[4:7], 0 idxen glc ; encoding: [0x00,0x60,0x34,0xe1,0x02,0x00,0x01,0x80]
+0x00,0x60,0x34,0xe1,0x02,0x00,0x01,0x80
+
+# GFX90A: buffer_atomic_pk_add_f16 v0, v2, s[4:7], 0 idxen glc ; encoding: [0x00,0x60,0x38,0xe1,0x02,0x00,0x01,0x80]
+0x00,0x60,0x38,0xe1,0x02,0x00,0x01,0x80
+
+# GFX90A: global_atomic_add_f32 v0, v[0:1], v2, off glc ; encoding: [0x00,0x80,0x35,0xdd,0x00,0x02,0x7f,0x00]
+0x00,0x80,0x35,0xdd,0x00,0x02,0x7f,0x00
+
+# GFX90A: global_atomic_pk_add_f16 v0, v[0:1], v2, off glc ; encoding: [0x00,0x80,0x39,0xdd,0x00,0x02,0x7f,0x00]
+0x00,0x80,0x39,0xdd,0x00,0x02,0x7f,0x00
+
+# GFX90A: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off glc ; encoding: [0x00,0x80,0x3d,0xdd,0x00,0x02,0x7f,0x00]
+0x00,0x80,0x3d,0xdd,0x00,0x02,0x7f,0x00
+
+# GFX90A: global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off glc ; encoding: [0x00,0x80,0x45,0xdd,0x00,0x02,0x7f,0x00]
+0x00,0x80,0x45,0xdd,0x00,0x02,0x7f,0x00
+
+# GFX90A: global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off glc ; encoding: [0x00,0x80,0x41,0xdd,0x00,0x02,0x7f,0x00]
+0x00,0x80,0x41,0xdd,0x00,0x02,0x7f,0x00
+
+# GFX90A: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] glc ; encoding: [0x00,0x00,0x3d,0xdd,0x00,0x02,0x00,0x00]
+0x00,0x00,0x3d,0xdd,0x00,0x02,0x00,0x00
+
+# GFX90A: flat_atomic_max_f64 v[0:1], v[0:1], v[2:3] glc ; encoding: [0x00,0x00,0x45,0xdd,0x00,0x02,0x00,0x00]
+0x00,0x00,0x45,0xdd,0x00,0x02,0x00,0x00
+
+# GFX90A: flat_atomic_min_f64 v[0:1], v[0:1], v[2:3] glc ; encoding: [0x00,0x00,0x41,0xdd,0x00,0x02,0x00,0x00]
+0x00,0x00,0x41,0xdd,0x00,0x02,0x00,0x00

diff  --git a/llvm/test/MC/Disassembler/AMDGPU/gfx90a_ldst_acc.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx90a_ldst_acc.txt
new file mode 100644
index 000000000000..69f0b3930cd0
--- /dev/null
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx90a_ldst_acc.txt
@@ -0,0 +1,8395 @@
+# RUN: llvm-mc -arch=amdgcn -mcpu=gfx90a -disassemble -show-encoding %s | FileCheck --check-prefix=GFX90A %s
+
+# GFX90A: flat_load_ubyte a5, v[2:3] offset:4095 ; encoding: [0xff,0x0f,0x40,0xdc,0x02,0x00,0x80,0x05]
+0xff,0x0f,0x40,0xdc,0x02,0x00,0x80,0x05
+
+# GFX90A: flat_load_ubyte a255, v[2:3] offset:4095 ; encoding: [0xff,0x0f,0x40,0xdc,0x02,0x00,0x80,0xff]
+0xff,0x0f,0x40,0xdc,0x02,0x00,0x80,0xff
+
+# GFX90A: flat_load_ubyte a5, v[254:255] offset:4095 ; encoding: [0xff,0x0f,0x40,0xdc,0xfe,0x00,0x80,0x05]
+0xff,0x0f,0x40,0xdc,0xfe,0x00,0x80,0x05
+
+# GFX90A: flat_load_ubyte a5, v[2:3]      ; encoding: [0x00,0x00,0x40,0xdc,0x02,0x00,0x80,0x05]
+0x00,0x00,0x40,0xdc,0x02,0x00,0x80,0x05
+
+# GFX90A: flat_load_ubyte a5, v[2:3]      ; encoding: [0x00,0x00,0x40,0xdc,0x02,0x00,0x80,0x05]
+0x00,0x00,0x40,0xdc,0x02,0x00,0x80,0x05
+
+# GFX90A: flat_load_ubyte a5, v[2:3] offset:7 ; encoding: [0x07,0x00,0x40,0xdc,0x02,0x00,0x80,0x05]
+0x07,0x00,0x40,0xdc,0x02,0x00,0x80,0x05
+
+# GFX90A: flat_load_ubyte a5, v[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0x41,0xdc,0x02,0x00,0x80,0x05]
+0xff,0x0f,0x41,0xdc,0x02,0x00,0x80,0x05
+
+# GFX90A: flat_load_ubyte a5, v[2:3] offset:4095 slc ; encoding: [0xff,0x0f,0x42,0xdc,0x02,0x00,0x80,0x05]
+0xff,0x0f,0x42,0xdc,0x02,0x00,0x80,0x05
+
+# GFX90A: flat_load_sbyte a5, v[2:3] offset:4095 ; encoding: [0xff,0x0f,0x44,0xdc,0x02,0x00,0x80,0x05]
+0xff,0x0f,0x44,0xdc,0x02,0x00,0x80,0x05
+
+# GFX90A: flat_load_sbyte a255, v[2:3] offset:4095 ; encoding: [0xff,0x0f,0x44,0xdc,0x02,0x00,0x80,0xff]
+0xff,0x0f,0x44,0xdc,0x02,0x00,0x80,0xff
+
+# GFX90A: flat_load_sbyte a5, v[254:255] offset:4095 ; encoding: [0xff,0x0f,0x44,0xdc,0xfe,0x00,0x80,0x05]
+0xff,0x0f,0x44,0xdc,0xfe,0x00,0x80,0x05
+
+# GFX90A: flat_load_sbyte a5, v[2:3]      ; encoding: [0x00,0x00,0x44,0xdc,0x02,0x00,0x80,0x05]
+0x00,0x00,0x44,0xdc,0x02,0x00,0x80,0x05
+
+# GFX90A: flat_load_sbyte a5, v[2:3]      ; encoding: [0x00,0x00,0x44,0xdc,0x02,0x00,0x80,0x05]
+0x00,0x00,0x44,0xdc,0x02,0x00,0x80,0x05
+
+# GFX90A: flat_load_sbyte a5, v[2:3] offset:7 ; encoding: [0x07,0x00,0x44,0xdc,0x02,0x00,0x80,0x05]
+0x07,0x00,0x44,0xdc,0x02,0x00,0x80,0x05
+
+# GFX90A: flat_load_sbyte a5, v[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0x45,0xdc,0x02,0x00,0x80,0x05]
+0xff,0x0f,0x45,0xdc,0x02,0x00,0x80,0x05
+
+# GFX90A: flat_load_sbyte a5, v[2:3] offset:4095 slc ; encoding: [0xff,0x0f,0x46,0xdc,0x02,0x00,0x80,0x05]
+0xff,0x0f,0x46,0xdc,0x02,0x00,0x80,0x05
+
+# GFX90A: flat_load_ushort a5, v[2:3] offset:4095 ; encoding: [0xff,0x0f,0x48,0xdc,0x02,0x00,0x80,0x05]
+0xff,0x0f,0x48,0xdc,0x02,0x00,0x80,0x05
+
+# GFX90A: flat_load_ushort a255, v[2:3] offset:4095 ; encoding: [0xff,0x0f,0x48,0xdc,0x02,0x00,0x80,0xff]
+0xff,0x0f,0x48,0xdc,0x02,0x00,0x80,0xff
+
+# GFX90A: flat_load_ushort a5, v[254:255] offset:4095 ; encoding: [0xff,0x0f,0x48,0xdc,0xfe,0x00,0x80,0x05]
+0xff,0x0f,0x48,0xdc,0xfe,0x00,0x80,0x05
+
+# GFX90A: flat_load_ushort a5, v[2:3]     ; encoding: [0x00,0x00,0x48,0xdc,0x02,0x00,0x80,0x05]
+0x00,0x00,0x48,0xdc,0x02,0x00,0x80,0x05
+
+# GFX90A: flat_load_ushort a5, v[2:3]     ; encoding: [0x00,0x00,0x48,0xdc,0x02,0x00,0x80,0x05]
+0x00,0x00,0x48,0xdc,0x02,0x00,0x80,0x05
+
+# GFX90A: flat_load_ushort a5, v[2:3] offset:7 ; encoding: [0x07,0x00,0x48,0xdc,0x02,0x00,0x80,0x05]
+0x07,0x00,0x48,0xdc,0x02,0x00,0x80,0x05
+
+# GFX90A: flat_load_ushort a5, v[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0x49,0xdc,0x02,0x00,0x80,0x05]
+0xff,0x0f,0x49,0xdc,0x02,0x00,0x80,0x05
+
+# GFX90A: flat_load_ushort a5, v[2:3] offset:4095 slc ; encoding: [0xff,0x0f,0x4a,0xdc,0x02,0x00,0x80,0x05]
+0xff,0x0f,0x4a,0xdc,0x02,0x00,0x80,0x05
+
+# GFX90A: flat_load_sshort a5, v[2:3] offset:4095 ; encoding: [0xff,0x0f,0x4c,0xdc,0x02,0x00,0x80,0x05]
+0xff,0x0f,0x4c,0xdc,0x02,0x00,0x80,0x05
+
+# GFX90A: flat_load_sshort a255, v[2:3] offset:4095 ; encoding: [0xff,0x0f,0x4c,0xdc,0x02,0x00,0x80,0xff]
+0xff,0x0f,0x4c,0xdc,0x02,0x00,0x80,0xff
+
+# GFX90A: flat_load_sshort a5, v[254:255] offset:4095 ; encoding: [0xff,0x0f,0x4c,0xdc,0xfe,0x00,0x80,0x05]
+0xff,0x0f,0x4c,0xdc,0xfe,0x00,0x80,0x05
+
+# GFX90A: flat_load_sshort a5, v[2:3]     ; encoding: [0x00,0x00,0x4c,0xdc,0x02,0x00,0x80,0x05]
+0x00,0x00,0x4c,0xdc,0x02,0x00,0x80,0x05
+
+# GFX90A: flat_load_sshort a5, v[2:3]     ; encoding: [0x00,0x00,0x4c,0xdc,0x02,0x00,0x80,0x05]
+0x00,0x00,0x4c,0xdc,0x02,0x00,0x80,0x05
+
+# GFX90A: flat_load_sshort a5, v[2:3] offset:7 ; encoding: [0x07,0x00,0x4c,0xdc,0x02,0x00,0x80,0x05]
+0x07,0x00,0x4c,0xdc,0x02,0x00,0x80,0x05
+
+# GFX90A: flat_load_sshort a5, v[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0x4d,0xdc,0x02,0x00,0x80,0x05]
+0xff,0x0f,0x4d,0xdc,0x02,0x00,0x80,0x05
+
+# GFX90A: flat_load_sshort a5, v[2:3] offset:4095 slc ; encoding: [0xff,0x0f,0x4e,0xdc,0x02,0x00,0x80,0x05]
+0xff,0x0f,0x4e,0xdc,0x02,0x00,0x80,0x05
+
+# GFX90A: flat_load_dword a5, v[2:3] offset:4095 ; encoding: [0xff,0x0f,0x50,0xdc,0x02,0x00,0x80,0x05]
+0xff,0x0f,0x50,0xdc,0x02,0x00,0x80,0x05
+
+# GFX90A: flat_load_dword a255, v[2:3] offset:4095 ; encoding: [0xff,0x0f,0x50,0xdc,0x02,0x00,0x80,0xff]
+0xff,0x0f,0x50,0xdc,0x02,0x00,0x80,0xff
+
+# GFX90A: flat_load_dword a5, v[254:255] offset:4095 ; encoding: [0xff,0x0f,0x50,0xdc,0xfe,0x00,0x80,0x05]
+0xff,0x0f,0x50,0xdc,0xfe,0x00,0x80,0x05
+
+# GFX90A: flat_load_dword a5, v[2:3]      ; encoding: [0x00,0x00,0x50,0xdc,0x02,0x00,0x80,0x05]
+0x00,0x00,0x50,0xdc,0x02,0x00,0x80,0x05
+
+# GFX90A: flat_load_dword a5, v[2:3]      ; encoding: [0x00,0x00,0x50,0xdc,0x02,0x00,0x80,0x05]
+0x00,0x00,0x50,0xdc,0x02,0x00,0x80,0x05
+
+# GFX90A: flat_load_dword a5, v[2:3] offset:7 ; encoding: [0x07,0x00,0x50,0xdc,0x02,0x00,0x80,0x05]
+0x07,0x00,0x50,0xdc,0x02,0x00,0x80,0x05
+
+# GFX90A: flat_load_dword a5, v[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0x51,0xdc,0x02,0x00,0x80,0x05]
+0xff,0x0f,0x51,0xdc,0x02,0x00,0x80,0x05
+
+# GFX90A: flat_load_dword a5, v[2:3] offset:4095 slc ; encoding: [0xff,0x0f,0x52,0xdc,0x02,0x00,0x80,0x05]
+0xff,0x0f,0x52,0xdc,0x02,0x00,0x80,0x05
+
+# GFX90A: flat_load_dwordx2 a[6:7], v[2:3] offset:4095 ; encoding: [0xff,0x0f,0x54,0xdc,0x02,0x00,0x80,0x06]
+0xff,0x0f,0x54,0xdc,0x02,0x00,0x80,0x06
+
+# GFX90A: flat_load_dwordx2 a[254:255], v[2:3] offset:4095 ; encoding: [0xff,0x0f,0x54,0xdc,0x02,0x00,0x80,0xfe]
+0xff,0x0f,0x54,0xdc,0x02,0x00,0x80,0xfe
+
+# GFX90A: flat_load_dwordx2 a[6:7], v[254:255] offset:4095 ; encoding: [0xff,0x0f,0x54,0xdc,0xfe,0x00,0x80,0x06]
+0xff,0x0f,0x54,0xdc,0xfe,0x00,0x80,0x06
+
+# GFX90A: flat_load_dwordx2 a[6:7], v[2:3] ; encoding: [0x00,0x00,0x54,0xdc,0x02,0x00,0x80,0x06]
+0x00,0x00,0x54,0xdc,0x02,0x00,0x80,0x06
+
+# GFX90A: flat_load_dwordx2 a[6:7], v[2:3] ; encoding: [0x00,0x00,0x54,0xdc,0x02,0x00,0x80,0x06]
+0x00,0x00,0x54,0xdc,0x02,0x00,0x80,0x06
+
+# GFX90A: flat_load_dwordx2 a[6:7], v[2:3] offset:7 ; encoding: [0x07,0x00,0x54,0xdc,0x02,0x00,0x80,0x06]
+0x07,0x00,0x54,0xdc,0x02,0x00,0x80,0x06
+
+# GFX90A: flat_load_dwordx2 a[6:7], v[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0x55,0xdc,0x02,0x00,0x80,0x06]
+0xff,0x0f,0x55,0xdc,0x02,0x00,0x80,0x06
+
+# GFX90A: flat_load_dwordx2 a[6:7], v[2:3] offset:4095 slc ; encoding: [0xff,0x0f,0x56,0xdc,0x02,0x00,0x80,0x06]
+0xff,0x0f,0x56,0xdc,0x02,0x00,0x80,0x06
+
+# GFX90A: flat_load_dwordx3 a[6:8], v[2:3] offset:4095 ; encoding: [0xff,0x0f,0x58,0xdc,0x02,0x00,0x80,0x06]
+0xff,0x0f,0x58,0xdc,0x02,0x00,0x80,0x06
+
+# GFX90A: flat_load_dwordx3 a[252:254], v[2:3] offset:4095 ; encoding: [0xff,0x0f,0x58,0xdc,0x02,0x00,0x80,0xfc]
+0xff,0x0f,0x58,0xdc,0x02,0x00,0x80,0xfc
+
+# GFX90A: flat_load_dwordx3 a[6:8], v[254:255] offset:4095 ; encoding: [0xff,0x0f,0x58,0xdc,0xfe,0x00,0x80,0x06]
+0xff,0x0f,0x58,0xdc,0xfe,0x00,0x80,0x06
+
+# GFX90A: flat_load_dwordx3 a[6:8], v[2:3] ; encoding: [0x00,0x00,0x58,0xdc,0x02,0x00,0x80,0x06]
+0x00,0x00,0x58,0xdc,0x02,0x00,0x80,0x06
+
+# GFX90A: flat_load_dwordx3 a[6:8], v[2:3] ; encoding: [0x00,0x00,0x58,0xdc,0x02,0x00,0x80,0x06]
+0x00,0x00,0x58,0xdc,0x02,0x00,0x80,0x06
+
+# GFX90A: flat_load_dwordx3 a[6:8], v[2:3] offset:7 ; encoding: [0x07,0x00,0x58,0xdc,0x02,0x00,0x80,0x06]
+0x07,0x00,0x58,0xdc,0x02,0x00,0x80,0x06
+
+# GFX90A: flat_load_dwordx3 a[6:8], v[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0x59,0xdc,0x02,0x00,0x80,0x06]
+0xff,0x0f,0x59,0xdc,0x02,0x00,0x80,0x06
+
+# GFX90A: flat_load_dwordx3 a[6:8], v[2:3] offset:4095 slc ; encoding: [0xff,0x0f,0x5a,0xdc,0x02,0x00,0x80,0x06]
+0xff,0x0f,0x5a,0xdc,0x02,0x00,0x80,0x06
+
+# GFX90A: flat_load_dwordx4 a[6:9], v[2:3] offset:4095 ; encoding: [0xff,0x0f,0x5c,0xdc,0x02,0x00,0x80,0x06]
+0xff,0x0f,0x5c,0xdc,0x02,0x00,0x80,0x06
+
+# GFX90A: flat_load_dwordx4 a[252:255], v[2:3] offset:4095 ; encoding: [0xff,0x0f,0x5c,0xdc,0x02,0x00,0x80,0xfc]
+0xff,0x0f,0x5c,0xdc,0x02,0x00,0x80,0xfc
+
+# GFX90A: flat_load_dwordx4 a[6:9], v[254:255] offset:4095 ; encoding: [0xff,0x0f,0x5c,0xdc,0xfe,0x00,0x80,0x06]
+0xff,0x0f,0x5c,0xdc,0xfe,0x00,0x80,0x06
+
+# GFX90A: flat_load_dwordx4 a[6:9], v[2:3] ; encoding: [0x00,0x00,0x5c,0xdc,0x02,0x00,0x80,0x06]
+0x00,0x00,0x5c,0xdc,0x02,0x00,0x80,0x06
+
+# GFX90A: flat_load_dwordx4 a[6:9], v[2:3] ; encoding: [0x00,0x00,0x5c,0xdc,0x02,0x00,0x80,0x06]
+0x00,0x00,0x5c,0xdc,0x02,0x00,0x80,0x06
+
+# GFX90A: flat_load_dwordx4 a[6:9], v[2:3] offset:7 ; encoding: [0x07,0x00,0x5c,0xdc,0x02,0x00,0x80,0x06]
+0x07,0x00,0x5c,0xdc,0x02,0x00,0x80,0x06
+
+# GFX90A: flat_load_dwordx4 a[6:9], v[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0x5d,0xdc,0x02,0x00,0x80,0x06]
+0xff,0x0f,0x5d,0xdc,0x02,0x00,0x80,0x06
+
+# GFX90A: flat_load_dwordx4 a[6:9], v[2:3] offset:4095 slc ; encoding: [0xff,0x0f,0x5e,0xdc,0x02,0x00,0x80,0x06]
+0xff,0x0f,0x5e,0xdc,0x02,0x00,0x80,0x06
+
+# GFX90A: flat_store_byte v[2:3], a2 offset:4095 ; encoding: [0xff,0x0f,0x60,0xdc,0x02,0x02,0x80,0x00]
+0xff,0x0f,0x60,0xdc,0x02,0x02,0x80,0x00
+
+# GFX90A: flat_store_byte v[254:255], a2 offset:4095 ; encoding: [0xff,0x0f,0x60,0xdc,0xfe,0x02,0x80,0x00]
+0xff,0x0f,0x60,0xdc,0xfe,0x02,0x80,0x00
+
+# GFX90A: flat_store_byte v[2:3], a255 offset:4095 ; encoding: [0xff,0x0f,0x60,0xdc,0x02,0xff,0x80,0x00]
+0xff,0x0f,0x60,0xdc,0x02,0xff,0x80,0x00
+
+# GFX90A: flat_store_byte v[2:3], a2      ; encoding: [0x00,0x00,0x60,0xdc,0x02,0x02,0x80,0x00]
+0x00,0x00,0x60,0xdc,0x02,0x02,0x80,0x00
+
+# GFX90A: flat_store_byte v[2:3], a2      ; encoding: [0x00,0x00,0x60,0xdc,0x02,0x02,0x80,0x00]
+0x00,0x00,0x60,0xdc,0x02,0x02,0x80,0x00
+
+# GFX90A: flat_store_byte v[2:3], a2 offset:7 ; encoding: [0x07,0x00,0x60,0xdc,0x02,0x02,0x80,0x00]
+0x07,0x00,0x60,0xdc,0x02,0x02,0x80,0x00
+
+# GFX90A: flat_store_byte v[2:3], a2 offset:4095 glc ; encoding: [0xff,0x0f,0x61,0xdc,0x02,0x02,0x80,0x00]
+0xff,0x0f,0x61,0xdc,0x02,0x02,0x80,0x00
+
+# GFX90A: flat_store_byte v[2:3], a2 offset:4095 slc ; encoding: [0xff,0x0f,0x62,0xdc,0x02,0x02,0x80,0x00]
+0xff,0x0f,0x62,0xdc,0x02,0x02,0x80,0x00
+
+# GFX90A: flat_store_byte_d16_hi v[2:3], a2 offset:4095 ; encoding: [0xff,0x0f,0x64,0xdc,0x02,0x02,0x80,0x00]
+0xff,0x0f,0x64,0xdc,0x02,0x02,0x80,0x00
+
+# GFX90A: flat_store_byte_d16_hi v[254:255], a2 offset:4095 ; encoding: [0xff,0x0f,0x64,0xdc,0xfe,0x02,0x80,0x00]
+0xff,0x0f,0x64,0xdc,0xfe,0x02,0x80,0x00
+
+# GFX90A: flat_store_byte_d16_hi v[2:3], a255 offset:4095 ; encoding: [0xff,0x0f,0x64,0xdc,0x02,0xff,0x80,0x00]
+0xff,0x0f,0x64,0xdc,0x02,0xff,0x80,0x00
+
+# GFX90A: flat_store_byte_d16_hi v[2:3], a2 ; encoding: [0x00,0x00,0x64,0xdc,0x02,0x02,0x80,0x00]
+0x00,0x00,0x64,0xdc,0x02,0x02,0x80,0x00
+
+# GFX90A: flat_store_byte_d16_hi v[2:3], a2 ; encoding: [0x00,0x00,0x64,0xdc,0x02,0x02,0x80,0x00]
+0x00,0x00,0x64,0xdc,0x02,0x02,0x80,0x00
+
+# GFX90A: flat_store_byte_d16_hi v[2:3], a2 offset:7 ; encoding: [0x07,0x00,0x64,0xdc,0x02,0x02,0x80,0x00]
+0x07,0x00,0x64,0xdc,0x02,0x02,0x80,0x00
+
+# GFX90A: flat_store_byte_d16_hi v[2:3], a2 offset:4095 glc ; encoding: [0xff,0x0f,0x65,0xdc,0x02,0x02,0x80,0x00]
+0xff,0x0f,0x65,0xdc,0x02,0x02,0x80,0x00
+
+# GFX90A: flat_store_byte_d16_hi v[2:3], a2 offset:4095 slc ; encoding: [0xff,0x0f,0x66,0xdc,0x02,0x02,0x80,0x00]
+0xff,0x0f,0x66,0xdc,0x02,0x02,0x80,0x00
+
+# GFX90A: flat_store_short v[2:3], a2 offset:4095 ; encoding: [0xff,0x0f,0x68,0xdc,0x02,0x02,0x80,0x00]
+0xff,0x0f,0x68,0xdc,0x02,0x02,0x80,0x00
+
+# GFX90A: flat_store_short v[254:255], a2 offset:4095 ; encoding: [0xff,0x0f,0x68,0xdc,0xfe,0x02,0x80,0x00]
+0xff,0x0f,0x68,0xdc,0xfe,0x02,0x80,0x00
+
+# GFX90A: flat_store_short v[2:3], a255 offset:4095 ; encoding: [0xff,0x0f,0x68,0xdc,0x02,0xff,0x80,0x00]
+0xff,0x0f,0x68,0xdc,0x02,0xff,0x80,0x00
+
+# GFX90A: flat_store_short v[2:3], a2     ; encoding: [0x00,0x00,0x68,0xdc,0x02,0x02,0x80,0x00]
+0x00,0x00,0x68,0xdc,0x02,0x02,0x80,0x00
+
+# GFX90A: flat_store_short v[2:3], a2     ; encoding: [0x00,0x00,0x68,0xdc,0x02,0x02,0x80,0x00]
+0x00,0x00,0x68,0xdc,0x02,0x02,0x80,0x00
+
+# GFX90A: flat_store_short v[2:3], a2 offset:7 ; encoding: [0x07,0x00,0x68,0xdc,0x02,0x02,0x80,0x00]
+0x07,0x00,0x68,0xdc,0x02,0x02,0x80,0x00
+
+# GFX90A: flat_store_short v[2:3], a2 offset:4095 glc ; encoding: [0xff,0x0f,0x69,0xdc,0x02,0x02,0x80,0x00]
+0xff,0x0f,0x69,0xdc,0x02,0x02,0x80,0x00
+
+# GFX90A: flat_store_short v[2:3], a2 offset:4095 slc ; encoding: [0xff,0x0f,0x6a,0xdc,0x02,0x02,0x80,0x00]
+0xff,0x0f,0x6a,0xdc,0x02,0x02,0x80,0x00
+
+# GFX90A: flat_store_short_d16_hi v[2:3], a2 offset:4095 ; encoding: [0xff,0x0f,0x6c,0xdc,0x02,0x02,0x80,0x00]
+0xff,0x0f,0x6c,0xdc,0x02,0x02,0x80,0x00
+
+# GFX90A: flat_store_short_d16_hi v[254:255], a2 offset:4095 ; encoding: [0xff,0x0f,0x6c,0xdc,0xfe,0x02,0x80,0x00]
+0xff,0x0f,0x6c,0xdc,0xfe,0x02,0x80,0x00
+
+# GFX90A: flat_store_short_d16_hi v[2:3], a255 offset:4095 ; encoding: [0xff,0x0f,0x6c,0xdc,0x02,0xff,0x80,0x00]
+0xff,0x0f,0x6c,0xdc,0x02,0xff,0x80,0x00
+
+# GFX90A: flat_store_short_d16_hi v[2:3], a2 ; encoding: [0x00,0x00,0x6c,0xdc,0x02,0x02,0x80,0x00]
+0x00,0x00,0x6c,0xdc,0x02,0x02,0x80,0x00
+
+# GFX90A: flat_store_short_d16_hi v[2:3], a2 ; encoding: [0x00,0x00,0x6c,0xdc,0x02,0x02,0x80,0x00]
+0x00,0x00,0x6c,0xdc,0x02,0x02,0x80,0x00
+
+# GFX90A: flat_store_short_d16_hi v[2:3], a2 offset:7 ; encoding: [0x07,0x00,0x6c,0xdc,0x02,0x02,0x80,0x00]
+0x07,0x00,0x6c,0xdc,0x02,0x02,0x80,0x00
+
+# GFX90A: flat_store_short_d16_hi v[2:3], a2 offset:4095 glc ; encoding: [0xff,0x0f,0x6d,0xdc,0x02,0x02,0x80,0x00]
+0xff,0x0f,0x6d,0xdc,0x02,0x02,0x80,0x00
+
+# GFX90A: flat_store_short_d16_hi v[2:3], a2 offset:4095 slc ; encoding: [0xff,0x0f,0x6e,0xdc,0x02,0x02,0x80,0x00]
+0xff,0x0f,0x6e,0xdc,0x02,0x02,0x80,0x00
+
+# GFX90A: flat_store_dword v[2:3], a2 offset:4095 ; encoding: [0xff,0x0f,0x70,0xdc,0x02,0x02,0x80,0x00]
+0xff,0x0f,0x70,0xdc,0x02,0x02,0x80,0x00
+
+# GFX90A: flat_store_dword v[254:255], a2 offset:4095 ; encoding: [0xff,0x0f,0x70,0xdc,0xfe,0x02,0x80,0x00]
+0xff,0x0f,0x70,0xdc,0xfe,0x02,0x80,0x00
+
+# GFX90A: flat_store_dword v[2:3], a255 offset:4095 ; encoding: [0xff,0x0f,0x70,0xdc,0x02,0xff,0x80,0x00]
+0xff,0x0f,0x70,0xdc,0x02,0xff,0x80,0x00
+
+# GFX90A: flat_store_dword v[2:3], a2     ; encoding: [0x00,0x00,0x70,0xdc,0x02,0x02,0x80,0x00]
+0x00,0x00,0x70,0xdc,0x02,0x02,0x80,0x00
+
+# GFX90A: flat_store_dword v[2:3], a2     ; encoding: [0x00,0x00,0x70,0xdc,0x02,0x02,0x80,0x00]
+0x00,0x00,0x70,0xdc,0x02,0x02,0x80,0x00
+
+# GFX90A: flat_store_dword v[2:3], a2 offset:7 ; encoding: [0x07,0x00,0x70,0xdc,0x02,0x02,0x80,0x00]
+0x07,0x00,0x70,0xdc,0x02,0x02,0x80,0x00
+
+# GFX90A: flat_store_dword v[2:3], a2 offset:4095 glc ; encoding: [0xff,0x0f,0x71,0xdc,0x02,0x02,0x80,0x00]
+0xff,0x0f,0x71,0xdc,0x02,0x02,0x80,0x00
+
+# GFX90A: flat_store_dword v[2:3], a2 offset:4095 slc ; encoding: [0xff,0x0f,0x72,0xdc,0x02,0x02,0x80,0x00]
+0xff,0x0f,0x72,0xdc,0x02,0x02,0x80,0x00
+
+# GFX90A: flat_store_dwordx2 v[2:3], a[2:3] offset:4095 ; encoding: [0xff,0x0f,0x74,0xdc,0x02,0x02,0x80,0x00]
+0xff,0x0f,0x74,0xdc,0x02,0x02,0x80,0x00
+
+# GFX90A: flat_store_dwordx2 v[254:255], a[2:3] offset:4095 ; encoding: [0xff,0x0f,0x74,0xdc,0xfe,0x02,0x80,0x00]
+0xff,0x0f,0x74,0xdc,0xfe,0x02,0x80,0x00
+
+# GFX90A: flat_store_dwordx2 v[2:3], a[254:255] offset:4095 ; encoding: [0xff,0x0f,0x74,0xdc,0x02,0xfe,0x80,0x00]
+0xff,0x0f,0x74,0xdc,0x02,0xfe,0x80,0x00
+
+# GFX90A: flat_store_dwordx2 v[2:3], a[2:3] ; encoding: [0x00,0x00,0x74,0xdc,0x02,0x02,0x80,0x00]
+0x00,0x00,0x74,0xdc,0x02,0x02,0x80,0x00
+
+# GFX90A: flat_store_dwordx2 v[2:3], a[2:3] ; encoding: [0x00,0x00,0x74,0xdc,0x02,0x02,0x80,0x00]
+0x00,0x00,0x74,0xdc,0x02,0x02,0x80,0x00
+
+# GFX90A: flat_store_dwordx2 v[2:3], a[2:3] offset:7 ; encoding: [0x07,0x00,0x74,0xdc,0x02,0x02,0x80,0x00]
+0x07,0x00,0x74,0xdc,0x02,0x02,0x80,0x00
+
+# GFX90A: flat_store_dwordx2 v[2:3], a[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0x75,0xdc,0x02,0x02,0x80,0x00]
+0xff,0x0f,0x75,0xdc,0x02,0x02,0x80,0x00
+
+# GFX90A: flat_store_dwordx2 v[2:3], a[2:3] offset:4095 slc ; encoding: [0xff,0x0f,0x76,0xdc,0x02,0x02,0x80,0x00]
+0xff,0x0f,0x76,0xdc,0x02,0x02,0x80,0x00
+
+# GFX90A: flat_store_dwordx3 v[2:3], a[2:4] offset:4095 ; encoding: [0xff,0x0f,0x78,0xdc,0x02,0x02,0x80,0x00]
+0xff,0x0f,0x78,0xdc,0x02,0x02,0x80,0x00
+
+# GFX90A: flat_store_dwordx3 v[254:255], a[2:4] offset:4095 ; encoding: [0xff,0x0f,0x78,0xdc,0xfe,0x02,0x80,0x00]
+0xff,0x0f,0x78,0xdc,0xfe,0x02,0x80,0x00
+
+# GFX90A: flat_store_dwordx3 v[2:3], a[252:254] offset:4095 ; encoding: [0xff,0x0f,0x78,0xdc,0x02,0xfc,0x80,0x00]
+0xff,0x0f,0x78,0xdc,0x02,0xfc,0x80,0x00
+
+# GFX90A: flat_store_dwordx3 v[2:3], a[2:4] ; encoding: [0x00,0x00,0x78,0xdc,0x02,0x02,0x80,0x00]
+0x00,0x00,0x78,0xdc,0x02,0x02,0x80,0x00
+
+# GFX90A: flat_store_dwordx3 v[2:3], a[2:4] ; encoding: [0x00,0x00,0x78,0xdc,0x02,0x02,0x80,0x00]
+0x00,0x00,0x78,0xdc,0x02,0x02,0x80,0x00
+
+# GFX90A: flat_store_dwordx3 v[2:3], a[2:4] offset:7 ; encoding: [0x07,0x00,0x78,0xdc,0x02,0x02,0x80,0x00]
+0x07,0x00,0x78,0xdc,0x02,0x02,0x80,0x00
+
+# GFX90A: flat_store_dwordx3 v[2:3], a[2:4] offset:4095 glc ; encoding: [0xff,0x0f,0x79,0xdc,0x02,0x02,0x80,0x00]
+0xff,0x0f,0x79,0xdc,0x02,0x02,0x80,0x00
+
+# GFX90A: flat_store_dwordx3 v[2:3], a[2:4] offset:4095 slc ; encoding: [0xff,0x0f,0x7a,0xdc,0x02,0x02,0x80,0x00]
+0xff,0x0f,0x7a,0xdc,0x02,0x02,0x80,0x00
+
+# GFX90A: flat_store_dwordx4 v[2:3], a[2:5] offset:4095 ; encoding: [0xff,0x0f,0x7c,0xdc,0x02,0x02,0x80,0x00]
+0xff,0x0f,0x7c,0xdc,0x02,0x02,0x80,0x00
+
+# GFX90A: flat_store_dwordx4 v[254:255], a[2:5] offset:4095 ; encoding: [0xff,0x0f,0x7c,0xdc,0xfe,0x02,0x80,0x00]
+0xff,0x0f,0x7c,0xdc,0xfe,0x02,0x80,0x00
+
+# GFX90A: flat_store_dwordx4 v[2:3], a[252:255] offset:4095 ; encoding: [0xff,0x0f,0x7c,0xdc,0x02,0xfc,0x80,0x00]
+0xff,0x0f,0x7c,0xdc,0x02,0xfc,0x80,0x00
+
+# GFX90A: flat_store_dwordx4 v[2:3], a[2:5] ; encoding: [0x00,0x00,0x7c,0xdc,0x02,0x02,0x80,0x00]
+0x00,0x00,0x7c,0xdc,0x02,0x02,0x80,0x00
+
+# GFX90A: flat_store_dwordx4 v[2:3], a[2:5] ; encoding: [0x00,0x00,0x7c,0xdc,0x02,0x02,0x80,0x00]
+0x00,0x00,0x7c,0xdc,0x02,0x02,0x80,0x00
+
+# GFX90A: flat_store_dwordx4 v[2:3], a[2:5] offset:7 ; encoding: [0x07,0x00,0x7c,0xdc,0x02,0x02,0x80,0x00]
+0x07,0x00,0x7c,0xdc,0x02,0x02,0x80,0x00
+
+# GFX90A: flat_store_dwordx4 v[2:3], a[2:5] offset:4095 glc ; encoding: [0xff,0x0f,0x7d,0xdc,0x02,0x02,0x80,0x00]
+0xff,0x0f,0x7d,0xdc,0x02,0x02,0x80,0x00
+
+# GFX90A: flat_store_dwordx4 v[2:3], a[2:5] offset:4095 slc ; encoding: [0xff,0x0f,0x7e,0xdc,0x02,0x02,0x80,0x00]
+0xff,0x0f,0x7e,0xdc,0x02,0x02,0x80,0x00
+
+# GFX90A: flat_load_ubyte_d16 a5, v[2:3] offset:4095 ; encoding: [0xff,0x0f,0x80,0xdc,0x02,0x00,0x80,0x05]
+0xff,0x0f,0x80,0xdc,0x02,0x00,0x80,0x05
+
+# GFX90A: flat_load_ubyte_d16 a255, v[2:3] offset:4095 ; encoding: [0xff,0x0f,0x80,0xdc,0x02,0x00,0x80,0xff]
+0xff,0x0f,0x80,0xdc,0x02,0x00,0x80,0xff
+
+# GFX90A: flat_load_ubyte_d16 a5, v[254:255] offset:4095 ; encoding: [0xff,0x0f,0x80,0xdc,0xfe,0x00,0x80,0x05]
+0xff,0x0f,0x80,0xdc,0xfe,0x00,0x80,0x05
+
+# GFX90A: flat_load_ubyte_d16 a5, v[2:3]  ; encoding: [0x00,0x00,0x80,0xdc,0x02,0x00,0x80,0x05]
+0x00,0x00,0x80,0xdc,0x02,0x00,0x80,0x05
+
+# GFX90A: flat_load_ubyte_d16 a5, v[2:3]  ; encoding: [0x00,0x00,0x80,0xdc,0x02,0x00,0x80,0x05]
+0x00,0x00,0x80,0xdc,0x02,0x00,0x80,0x05
+
+# GFX90A: flat_load_ubyte_d16 a5, v[2:3] offset:7 ; encoding: [0x07,0x00,0x80,0xdc,0x02,0x00,0x80,0x05]
+0x07,0x00,0x80,0xdc,0x02,0x00,0x80,0x05
+
+# GFX90A: flat_load_ubyte_d16 a5, v[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0x81,0xdc,0x02,0x00,0x80,0x05]
+0xff,0x0f,0x81,0xdc,0x02,0x00,0x80,0x05
+
+# GFX90A: flat_load_ubyte_d16 a5, v[2:3] offset:4095 slc ; encoding: [0xff,0x0f,0x82,0xdc,0x02,0x00,0x80,0x05]
+0xff,0x0f,0x82,0xdc,0x02,0x00,0x80,0x05
+
+# GFX90A: flat_load_ubyte_d16_hi a5, v[2:3] offset:4095 ; encoding: [0xff,0x0f,0x84,0xdc,0x02,0x00,0x80,0x05]
+0xff,0x0f,0x84,0xdc,0x02,0x00,0x80,0x05
+
+# GFX90A: flat_load_ubyte_d16_hi a255, v[2:3] offset:4095 ; encoding: [0xff,0x0f,0x84,0xdc,0x02,0x00,0x80,0xff]
+0xff,0x0f,0x84,0xdc,0x02,0x00,0x80,0xff
+
+# GFX90A: flat_load_ubyte_d16_hi a5, v[254:255] offset:4095 ; encoding: [0xff,0x0f,0x84,0xdc,0xfe,0x00,0x80,0x05]
+0xff,0x0f,0x84,0xdc,0xfe,0x00,0x80,0x05
+
+# GFX90A: flat_load_ubyte_d16_hi a5, v[2:3] ; encoding: [0x00,0x00,0x84,0xdc,0x02,0x00,0x80,0x05]
+0x00,0x00,0x84,0xdc,0x02,0x00,0x80,0x05
+
+# GFX90A: flat_load_ubyte_d16_hi a5, v[2:3] ; encoding: [0x00,0x00,0x84,0xdc,0x02,0x00,0x80,0x05]
+0x00,0x00,0x84,0xdc,0x02,0x00,0x80,0x05
+
+# GFX90A: flat_load_ubyte_d16_hi a5, v[2:3] offset:7 ; encoding: [0x07,0x00,0x84,0xdc,0x02,0x00,0x80,0x05]
+0x07,0x00,0x84,0xdc,0x02,0x00,0x80,0x05
+
+# GFX90A: flat_load_ubyte_d16_hi a5, v[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0x85,0xdc,0x02,0x00,0x80,0x05]
+0xff,0x0f,0x85,0xdc,0x02,0x00,0x80,0x05
+
+# GFX90A: flat_load_ubyte_d16_hi a5, v[2:3] offset:4095 slc ; encoding: [0xff,0x0f,0x86,0xdc,0x02,0x00,0x80,0x05]
+0xff,0x0f,0x86,0xdc,0x02,0x00,0x80,0x05
+
+# GFX90A: flat_load_sbyte_d16 a5, v[2:3] offset:4095 ; encoding: [0xff,0x0f,0x88,0xdc,0x02,0x00,0x80,0x05]
+0xff,0x0f,0x88,0xdc,0x02,0x00,0x80,0x05
+
+# GFX90A: flat_load_sbyte_d16 a255, v[2:3] offset:4095 ; encoding: [0xff,0x0f,0x88,0xdc,0x02,0x00,0x80,0xff]
+0xff,0x0f,0x88,0xdc,0x02,0x00,0x80,0xff
+
+# GFX90A: flat_load_sbyte_d16 a5, v[254:255] offset:4095 ; encoding: [0xff,0x0f,0x88,0xdc,0xfe,0x00,0x80,0x05]
+0xff,0x0f,0x88,0xdc,0xfe,0x00,0x80,0x05
+
+# GFX90A: flat_load_sbyte_d16 a5, v[2:3]  ; encoding: [0x00,0x00,0x88,0xdc,0x02,0x00,0x80,0x05]
+0x00,0x00,0x88,0xdc,0x02,0x00,0x80,0x05
+
+# GFX90A: flat_load_sbyte_d16 a5, v[2:3]  ; encoding: [0x00,0x00,0x88,0xdc,0x02,0x00,0x80,0x05]
+0x00,0x00,0x88,0xdc,0x02,0x00,0x80,0x05
+
+# GFX90A: flat_load_sbyte_d16 a5, v[2:3] offset:7 ; encoding: [0x07,0x00,0x88,0xdc,0x02,0x00,0x80,0x05]
+0x07,0x00,0x88,0xdc,0x02,0x00,0x80,0x05
+
+# GFX90A: flat_load_sbyte_d16 a5, v[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0x89,0xdc,0x02,0x00,0x80,0x05]
+0xff,0x0f,0x89,0xdc,0x02,0x00,0x80,0x05
+
+# GFX90A: flat_load_sbyte_d16 a5, v[2:3] offset:4095 slc ; encoding: [0xff,0x0f,0x8a,0xdc,0x02,0x00,0x80,0x05]
+0xff,0x0f,0x8a,0xdc,0x02,0x00,0x80,0x05
+
+# GFX90A: flat_load_sbyte_d16_hi a5, v[2:3] offset:4095 ; encoding: [0xff,0x0f,0x8c,0xdc,0x02,0x00,0x80,0x05]
+0xff,0x0f,0x8c,0xdc,0x02,0x00,0x80,0x05
+
+# GFX90A: flat_load_sbyte_d16_hi a255, v[2:3] offset:4095 ; encoding: [0xff,0x0f,0x8c,0xdc,0x02,0x00,0x80,0xff]
+0xff,0x0f,0x8c,0xdc,0x02,0x00,0x80,0xff
+
+# GFX90A: flat_load_sbyte_d16_hi a5, v[254:255] offset:4095 ; encoding: [0xff,0x0f,0x8c,0xdc,0xfe,0x00,0x80,0x05]
+0xff,0x0f,0x8c,0xdc,0xfe,0x00,0x80,0x05
+
+# GFX90A: flat_load_sbyte_d16_hi a5, v[2:3] ; encoding: [0x00,0x00,0x8c,0xdc,0x02,0x00,0x80,0x05]
+0x00,0x00,0x8c,0xdc,0x02,0x00,0x80,0x05
+
+# GFX90A: flat_load_sbyte_d16_hi a5, v[2:3] ; encoding: [0x00,0x00,0x8c,0xdc,0x02,0x00,0x80,0x05]
+0x00,0x00,0x8c,0xdc,0x02,0x00,0x80,0x05
+
+# GFX90A: flat_load_sbyte_d16_hi a5, v[2:3] offset:7 ; encoding: [0x07,0x00,0x8c,0xdc,0x02,0x00,0x80,0x05]
+0x07,0x00,0x8c,0xdc,0x02,0x00,0x80,0x05
+
+# GFX90A: flat_load_sbyte_d16_hi a5, v[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0x8d,0xdc,0x02,0x00,0x80,0x05]
+0xff,0x0f,0x8d,0xdc,0x02,0x00,0x80,0x05
+
+# GFX90A: flat_load_sbyte_d16_hi a5, v[2:3] offset:4095 slc ; encoding: [0xff,0x0f,0x8e,0xdc,0x02,0x00,0x80,0x05]
+0xff,0x0f,0x8e,0xdc,0x02,0x00,0x80,0x05
+
+# GFX90A: flat_load_short_d16 a5, v[2:3] offset:4095 ; encoding: [0xff,0x0f,0x90,0xdc,0x02,0x00,0x80,0x05]
+0xff,0x0f,0x90,0xdc,0x02,0x00,0x80,0x05
+
+# GFX90A: flat_load_short_d16 a255, v[2:3] offset:4095 ; encoding: [0xff,0x0f,0x90,0xdc,0x02,0x00,0x80,0xff]
+0xff,0x0f,0x90,0xdc,0x02,0x00,0x80,0xff
+
+# GFX90A: flat_load_short_d16 a5, v[254:255] offset:4095 ; encoding: [0xff,0x0f,0x90,0xdc,0xfe,0x00,0x80,0x05]
+0xff,0x0f,0x90,0xdc,0xfe,0x00,0x80,0x05
+
+# GFX90A: flat_load_short_d16 a5, v[2:3]  ; encoding: [0x00,0x00,0x90,0xdc,0x02,0x00,0x80,0x05]
+0x00,0x00,0x90,0xdc,0x02,0x00,0x80,0x05
+
+# GFX90A: flat_load_short_d16 a5, v[2:3]  ; encoding: [0x00,0x00,0x90,0xdc,0x02,0x00,0x80,0x05]
+0x00,0x00,0x90,0xdc,0x02,0x00,0x80,0x05
+
+# GFX90A: flat_load_short_d16 a5, v[2:3] offset:7 ; encoding: [0x07,0x00,0x90,0xdc,0x02,0x00,0x80,0x05]
+0x07,0x00,0x90,0xdc,0x02,0x00,0x80,0x05
+
+# GFX90A: flat_load_short_d16 a5, v[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0x91,0xdc,0x02,0x00,0x80,0x05]
+0xff,0x0f,0x91,0xdc,0x02,0x00,0x80,0x05
+
+# GFX90A: flat_load_short_d16 a5, v[2:3] offset:4095 slc ; encoding: [0xff,0x0f,0x92,0xdc,0x02,0x00,0x80,0x05]
+0xff,0x0f,0x92,0xdc,0x02,0x00,0x80,0x05
+
+# GFX90A: flat_load_short_d16_hi a5, v[2:3] offset:4095 ; encoding: [0xff,0x0f,0x94,0xdc,0x02,0x00,0x80,0x05]
+0xff,0x0f,0x94,0xdc,0x02,0x00,0x80,0x05
+
+# GFX90A: flat_load_short_d16_hi a255, v[2:3] offset:4095 ; encoding: [0xff,0x0f,0x94,0xdc,0x02,0x00,0x80,0xff]
+0xff,0x0f,0x94,0xdc,0x02,0x00,0x80,0xff
+
+# GFX90A: flat_load_short_d16_hi a5, v[254:255] offset:4095 ; encoding: [0xff,0x0f,0x94,0xdc,0xfe,0x00,0x80,0x05]
+0xff,0x0f,0x94,0xdc,0xfe,0x00,0x80,0x05
+
+# GFX90A: flat_load_short_d16_hi a5, v[2:3] ; encoding: [0x00,0x00,0x94,0xdc,0x02,0x00,0x80,0x05]
+0x00,0x00,0x94,0xdc,0x02,0x00,0x80,0x05
+
+# GFX90A: flat_load_short_d16_hi a5, v[2:3] ; encoding: [0x00,0x00,0x94,0xdc,0x02,0x00,0x80,0x05]
+0x00,0x00,0x94,0xdc,0x02,0x00,0x80,0x05
+
+# GFX90A: flat_load_short_d16_hi a5, v[2:3] offset:7 ; encoding: [0x07,0x00,0x94,0xdc,0x02,0x00,0x80,0x05]
+0x07,0x00,0x94,0xdc,0x02,0x00,0x80,0x05
+
+# GFX90A: flat_load_short_d16_hi a5, v[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0x95,0xdc,0x02,0x00,0x80,0x05]
+0xff,0x0f,0x95,0xdc,0x02,0x00,0x80,0x05
+
+# GFX90A: flat_load_short_d16_hi a5, v[2:3] offset:4095 slc ; encoding: [0xff,0x0f,0x96,0xdc,0x02,0x00,0x80,0x05]
+0xff,0x0f,0x96,0xdc,0x02,0x00,0x80,0x05
+
+# GFX90A: flat_atomic_swap a0, v[2:3], a2 offset:4095 glc ; encoding: [0xff,0x0f,0x01,0xdd,0x02,0x02,0x80,0x00]
+0xff,0x0f,0x01,0xdd,0x02,0x02,0x80,0x00
+
+# GFX90A: flat_atomic_cmpswap a0, v[2:3], a[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0x05,0xdd,0x02,0x02,0x80,0x00]
+0xff,0x0f,0x05,0xdd,0x02,0x02,0x80,0x00
+
+# GFX90A: flat_atomic_add a0, v[2:3], a2 offset:4095 glc ; encoding: [0xff,0x0f,0x09,0xdd,0x02,0x02,0x80,0x00]
+0xff,0x0f,0x09,0xdd,0x02,0x02,0x80,0x00
+
+# GFX90A: flat_atomic_sub a0, v[2:3], a2 offset:4095 glc ; encoding: [0xff,0x0f,0x0d,0xdd,0x02,0x02,0x80,0x00]
+0xff,0x0f,0x0d,0xdd,0x02,0x02,0x80,0x00
+
+# GFX90A: flat_atomic_smin a0, v[2:3], a2 offset:4095 glc ; encoding: [0xff,0x0f,0x11,0xdd,0x02,0x02,0x80,0x00]
+0xff,0x0f,0x11,0xdd,0x02,0x02,0x80,0x00
+
+# GFX90A: flat_atomic_umin a0, v[2:3], a2 offset:4095 glc ; encoding: [0xff,0x0f,0x15,0xdd,0x02,0x02,0x80,0x00]
+0xff,0x0f,0x15,0xdd,0x02,0x02,0x80,0x00
+
+# GFX90A: flat_atomic_smax a0, v[2:3], a2 offset:4095 glc ; encoding: [0xff,0x0f,0x19,0xdd,0x02,0x02,0x80,0x00]
+0xff,0x0f,0x19,0xdd,0x02,0x02,0x80,0x00
+
+# GFX90A: flat_atomic_umax a0, v[2:3], a2 offset:4095 glc ; encoding: [0xff,0x0f,0x1d,0xdd,0x02,0x02,0x80,0x00]
+0xff,0x0f,0x1d,0xdd,0x02,0x02,0x80,0x00
+
+# GFX90A: flat_atomic_and a0, v[2:3], a2 offset:4095 glc ; encoding: [0xff,0x0f,0x21,0xdd,0x02,0x02,0x80,0x00]
+0xff,0x0f,0x21,0xdd,0x02,0x02,0x80,0x00
+
+# GFX90A: flat_atomic_or a0, v[2:3], a2 offset:4095 glc ; encoding: [0xff,0x0f,0x25,0xdd,0x02,0x02,0x80,0x00]
+0xff,0x0f,0x25,0xdd,0x02,0x02,0x80,0x00
+
+# GFX90A: flat_atomic_xor a0, v[2:3], a2 offset:4095 glc ; encoding: [0xff,0x0f,0x29,0xdd,0x02,0x02,0x80,0x00]
+0xff,0x0f,0x29,0xdd,0x02,0x02,0x80,0x00
+
+# GFX90A: flat_atomic_inc a0, v[2:3], a2 offset:4095 glc ; encoding: [0xff,0x0f,0x2d,0xdd,0x02,0x02,0x80,0x00]
+0xff,0x0f,0x2d,0xdd,0x02,0x02,0x80,0x00
+
+# GFX90A: flat_atomic_dec a0, v[2:3], a2 offset:4095 glc ; encoding: [0xff,0x0f,0x31,0xdd,0x02,0x02,0x80,0x00]
+0xff,0x0f,0x31,0xdd,0x02,0x02,0x80,0x00
+
+# GFX90A: flat_atomic_swap_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0x81,0xdd,0x02,0x02,0x80,0x00]
+0xff,0x0f,0x81,0xdd,0x02,0x02,0x80,0x00
+
+# GFX90A: flat_atomic_cmpswap_x2 a[0:1], v[2:3], a[2:5] offset:4095 glc ; encoding: [0xff,0x0f,0x85,0xdd,0x02,0x02,0x80,0x00]
+0xff,0x0f,0x85,0xdd,0x02,0x02,0x80,0x00
+
+# GFX90A: flat_atomic_add_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0x89,0xdd,0x02,0x02,0x80,0x00]
+0xff,0x0f,0x89,0xdd,0x02,0x02,0x80,0x00
+
+# GFX90A: flat_atomic_sub_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0x8d,0xdd,0x02,0x02,0x80,0x00]
+0xff,0x0f,0x8d,0xdd,0x02,0x02,0x80,0x00
+
+# GFX90A: flat_atomic_smin_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0x91,0xdd,0x02,0x02,0x80,0x00]
+0xff,0x0f,0x91,0xdd,0x02,0x02,0x80,0x00
+
+# GFX90A: flat_atomic_umin_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0x95,0xdd,0x02,0x02,0x80,0x00]
+0xff,0x0f,0x95,0xdd,0x02,0x02,0x80,0x00
+
+# GFX90A: flat_atomic_smax_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0x99,0xdd,0x02,0x02,0x80,0x00]
+0xff,0x0f,0x99,0xdd,0x02,0x02,0x80,0x00
+
+# GFX90A: flat_atomic_umax_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0x9d,0xdd,0x02,0x02,0x80,0x00]
+0xff,0x0f,0x9d,0xdd,0x02,0x02,0x80,0x00
+
+# GFX90A: flat_atomic_and_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0xa1,0xdd,0x02,0x02,0x80,0x00]
+0xff,0x0f,0xa1,0xdd,0x02,0x02,0x80,0x00
+
+# GFX90A: flat_atomic_or_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0xa5,0xdd,0x02,0x02,0x80,0x00]
+0xff,0x0f,0xa5,0xdd,0x02,0x02,0x80,0x00
+
+# GFX90A: flat_atomic_xor_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0xa9,0xdd,0x02,0x02,0x80,0x00]
+0xff,0x0f,0xa9,0xdd,0x02,0x02,0x80,0x00
+
+# GFX90A: flat_atomic_inc_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0xad,0xdd,0x02,0x02,0x80,0x00]
+0xff,0x0f,0xad,0xdd,0x02,0x02,0x80,0x00
+
+# GFX90A: flat_atomic_dec_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0xb1,0xdd,0x02,0x02,0x80,0x00]
+0xff,0x0f,0xb1,0xdd,0x02,0x02,0x80,0x00
+
+# GFX90A: flat_atomic_swap v[2:3], a2 offset:4095 ; encoding: [0xff,0x0f,0x00,0xdd,0x02,0x02,0x80,0x00]
+0xff,0x0f,0x00,0xdd,0x02,0x02,0x80,0x00
+
+# GFX90A: flat_atomic_cmpswap v[2:3], a[2:3] offset:4095 ; encoding: [0xff,0x0f,0x04,0xdd,0x02,0x02,0x80,0x00]
+0xff,0x0f,0x04,0xdd,0x02,0x02,0x80,0x00
+
+# GFX90A: flat_atomic_add v[2:3], a2 offset:4095 ; encoding: [0xff,0x0f,0x08,0xdd,0x02,0x02,0x80,0x00]
+0xff,0x0f,0x08,0xdd,0x02,0x02,0x80,0x00
+
+# GFX90A: flat_atomic_sub v[2:3], a2 offset:4095 ; encoding: [0xff,0x0f,0x0c,0xdd,0x02,0x02,0x80,0x00]
+0xff,0x0f,0x0c,0xdd,0x02,0x02,0x80,0x00
+
+# GFX90A: flat_atomic_smin v[2:3], a2 offset:4095 ; encoding: [0xff,0x0f,0x10,0xdd,0x02,0x02,0x80,0x00]
+0xff,0x0f,0x10,0xdd,0x02,0x02,0x80,0x00
+
+# GFX90A: flat_atomic_umin v[2:3], a2 offset:4095 ; encoding: [0xff,0x0f,0x14,0xdd,0x02,0x02,0x80,0x00]
+0xff,0x0f,0x14,0xdd,0x02,0x02,0x80,0x00
+
+# GFX90A: flat_atomic_smax v[2:3], a2 offset:4095 ; encoding: [0xff,0x0f,0x18,0xdd,0x02,0x02,0x80,0x00]
+0xff,0x0f,0x18,0xdd,0x02,0x02,0x80,0x00
+
+# GFX90A: flat_atomic_umax v[2:3], a2 offset:4095 ; encoding: [0xff,0x0f,0x1c,0xdd,0x02,0x02,0x80,0x00]
+0xff,0x0f,0x1c,0xdd,0x02,0x02,0x80,0x00
+
+# GFX90A: flat_atomic_and v[2:3], a2 offset:4095 ; encoding: [0xff,0x0f,0x20,0xdd,0x02,0x02,0x80,0x00]
+0xff,0x0f,0x20,0xdd,0x02,0x02,0x80,0x00
+
+# GFX90A: flat_atomic_or v[2:3], a2 offset:4095 ; encoding: [0xff,0x0f,0x24,0xdd,0x02,0x02,0x80,0x00]
+0xff,0x0f,0x24,0xdd,0x02,0x02,0x80,0x00
+
+# GFX90A: flat_atomic_xor v[2:3], a2 offset:4095 ; encoding: [0xff,0x0f,0x28,0xdd,0x02,0x02,0x80,0x00]
+0xff,0x0f,0x28,0xdd,0x02,0x02,0x80,0x00
+
+# GFX90A: flat_atomic_inc v[2:3], a2 offset:4095 ; encoding: [0xff,0x0f,0x2c,0xdd,0x02,0x02,0x80,0x00]
+0xff,0x0f,0x2c,0xdd,0x02,0x02,0x80,0x00
+
+# GFX90A: flat_atomic_dec v[2:3], a2 offset:4095 ; encoding: [0xff,0x0f,0x30,0xdd,0x02,0x02,0x80,0x00]
+0xff,0x0f,0x30,0xdd,0x02,0x02,0x80,0x00
+
+# GFX90A: flat_atomic_swap_x2 v[2:3], a[2:3] offset:4095 ; encoding: [0xff,0x0f,0x80,0xdd,0x02,0x02,0x80,0x00]
+0xff,0x0f,0x80,0xdd,0x02,0x02,0x80,0x00
+
+# GFX90A: flat_atomic_cmpswap_x2 v[2:3], a[2:5] offset:4095 ; encoding: [0xff,0x0f,0x84,0xdd,0x02,0x02,0x80,0x00]
+0xff,0x0f,0x84,0xdd,0x02,0x02,0x80,0x00
+
+# GFX90A: flat_atomic_add_x2 v[2:3], a[2:3] offset:4095 ; encoding: [0xff,0x0f,0x88,0xdd,0x02,0x02,0x80,0x00]
+0xff,0x0f,0x88,0xdd,0x02,0x02,0x80,0x00
+
+# GFX90A: flat_atomic_sub_x2 v[2:3], a[2:3] offset:4095 ; encoding: [0xff,0x0f,0x8c,0xdd,0x02,0x02,0x80,0x00]
+0xff,0x0f,0x8c,0xdd,0x02,0x02,0x80,0x00
+
+# GFX90A: flat_atomic_smin_x2 v[2:3], a[2:3] offset:4095 ; encoding: [0xff,0x0f,0x90,0xdd,0x02,0x02,0x80,0x00]
+0xff,0x0f,0x90,0xdd,0x02,0x02,0x80,0x00
+
+# GFX90A: flat_atomic_umin_x2 v[2:3], a[2:3] offset:4095 ; encoding: [0xff,0x0f,0x94,0xdd,0x02,0x02,0x80,0x00]
+0xff,0x0f,0x94,0xdd,0x02,0x02,0x80,0x00
+
+# GFX90A: flat_atomic_smax_x2 v[2:3], a[2:3] offset:4095 ; encoding: [0xff,0x0f,0x98,0xdd,0x02,0x02,0x80,0x00]
+0xff,0x0f,0x98,0xdd,0x02,0x02,0x80,0x00
+
+# GFX90A: flat_atomic_umax_x2 v[2:3], a[2:3] offset:4095 ; encoding: [0xff,0x0f,0x9c,0xdd,0x02,0x02,0x80,0x00]
+0xff,0x0f,0x9c,0xdd,0x02,0x02,0x80,0x00
+
+# GFX90A: flat_atomic_and_x2 v[2:3], a[2:3] offset:4095 ; encoding: [0xff,0x0f,0xa0,0xdd,0x02,0x02,0x80,0x00]
+0xff,0x0f,0xa0,0xdd,0x02,0x02,0x80,0x00
+
+# GFX90A: flat_atomic_or_x2 v[2:3], a[2:3] offset:4095 ; encoding: [0xff,0x0f,0xa4,0xdd,0x02,0x02,0x80,0x00]
+0xff,0x0f,0xa4,0xdd,0x02,0x02,0x80,0x00
+
+# GFX90A: flat_atomic_xor_x2 v[2:3], a[2:3] offset:4095 ; encoding: [0xff,0x0f,0xa8,0xdd,0x02,0x02,0x80,0x00]
+0xff,0x0f,0xa8,0xdd,0x02,0x02,0x80,0x00
+
+# GFX90A: flat_atomic_inc_x2 v[2:3], a[2:3] offset:4095 ; encoding: [0xff,0x0f,0xac,0xdd,0x02,0x02,0x80,0x00]
+0xff,0x0f,0xac,0xdd,0x02,0x02,0x80,0x00
+
+# GFX90A: flat_atomic_dec_x2 v[2:3], a[2:3] offset:4095 ; encoding: [0xff,0x0f,0xb0,0xdd,0x02,0x02,0x80,0x00]
+0xff,0x0f,0xb0,0xdd,0x02,0x02,0x80,0x00
+
+# GFX90A: global_load_ubyte a5, v[2:3], off offset:-1 ; encoding: [0xff,0x9f,0x40,0xdc,0x02,0x00,0xff,0x05]
+0xff,0x9f,0x40,0xdc,0x02,0x00,0xff,0x05
+
+# GFX90A: global_load_ubyte a255, v[2:3], off offset:-1 ; encoding: [0xff,0x9f,0x40,0xdc,0x02,0x00,0xff,0xff]
+0xff,0x9f,0x40,0xdc,0x02,0x00,0xff,0xff
+
+# GFX90A: global_load_ubyte a5, v[2:3], off ; encoding: [0x00,0x80,0x40,0xdc,0x02,0x00,0xff,0x05]
+0x00,0x80,0x40,0xdc,0x02,0x00,0xff,0x05
+
+# GFX90A: global_load_sbyte a5, v[2:3], off offset:-1 ; encoding: [0xff,0x9f,0x44,0xdc,0x02,0x00,0xff,0x05]
+0xff,0x9f,0x44,0xdc,0x02,0x00,0xff,0x05
+
+# GFX90A: global_load_sbyte a255, v[2:3], off offset:-1 ; encoding: [0xff,0x9f,0x44,0xdc,0x02,0x00,0xff,0xff]
+0xff,0x9f,0x44,0xdc,0x02,0x00,0xff,0xff
+
+# GFX90A: global_load_sbyte a5, v[2:3], off ; encoding: [0x00,0x80,0x44,0xdc,0x02,0x00,0xff,0x05]
+0x00,0x80,0x44,0xdc,0x02,0x00,0xff,0x05
+
+# GFX90A: global_load_ushort a5, v[2:3], off offset:-1 ; encoding: [0xff,0x9f,0x48,0xdc,0x02,0x00,0xff,0x05]
+0xff,0x9f,0x48,0xdc,0x02,0x00,0xff,0x05
+
+# GFX90A: global_load_ushort a255, v[2:3], off offset:-1 ; encoding: [0xff,0x9f,0x48,0xdc,0x02,0x00,0xff,0xff]
+0xff,0x9f,0x48,0xdc,0x02,0x00,0xff,0xff
+
+# GFX90A: global_load_ushort a5, v[2:3], off ; encoding: [0x00,0x80,0x48,0xdc,0x02,0x00,0xff,0x05]
+0x00,0x80,0x48,0xdc,0x02,0x00,0xff,0x05
+
+# GFX90A: global_load_sshort a5, v[2:3], off offset:-1 ; encoding: [0xff,0x9f,0x4c,0xdc,0x02,0x00,0xff,0x05]
+0xff,0x9f,0x4c,0xdc,0x02,0x00,0xff,0x05
+
+# GFX90A: global_load_sshort a255, v[2:3], off offset:-1 ; encoding: [0xff,0x9f,0x4c,0xdc,0x02,0x00,0xff,0xff]
+0xff,0x9f,0x4c,0xdc,0x02,0x00,0xff,0xff
+
+# GFX90A: global_load_sshort a5, v[2:3], off ; encoding: [0x00,0x80,0x4c,0xdc,0x02,0x00,0xff,0x05]
+0x00,0x80,0x4c,0xdc,0x02,0x00,0xff,0x05
+
+# GFX90A: global_load_dword a5, v[2:3], off offset:-1 ; encoding: [0xff,0x9f,0x50,0xdc,0x02,0x00,0xff,0x05]
+0xff,0x9f,0x50,0xdc,0x02,0x00,0xff,0x05
+
+# GFX90A: global_load_dword a255, v[2:3], off offset:-1 ; encoding: [0xff,0x9f,0x50,0xdc,0x02,0x00,0xff,0xff]
+0xff,0x9f,0x50,0xdc,0x02,0x00,0xff,0xff
+
+# GFX90A: global_load_dword a5, v[2:3], off ; encoding: [0x00,0x80,0x50,0xdc,0x02,0x00,0xff,0x05]
+0x00,0x80,0x50,0xdc,0x02,0x00,0xff,0x05
+
+# GFX90A: global_load_dwordx2 a[6:7], v[2:3], off offset:-1 ; encoding: [0xff,0x9f,0x54,0xdc,0x02,0x00,0xff,0x06]
+0xff,0x9f,0x54,0xdc,0x02,0x00,0xff,0x06
+
+# GFX90A: global_load_dwordx2 a[254:255], v[2:3], off offset:-1 ; encoding: [0xff,0x9f,0x54,0xdc,0x02,0x00,0xff,0xfe]
+0xff,0x9f,0x54,0xdc,0x02,0x00,0xff,0xfe
+
+# GFX90A: global_load_dwordx2 a[6:7], v[2:3], off ; encoding: [0x00,0x80,0x54,0xdc,0x02,0x00,0xff,0x06]
+0x00,0x80,0x54,0xdc,0x02,0x00,0xff,0x06
+
+# GFX90A: global_load_dwordx3 a[6:8], v[2:3], off offset:-1 ; encoding: [0xff,0x9f,0x58,0xdc,0x02,0x00,0xff,0x06]
+0xff,0x9f,0x58,0xdc,0x02,0x00,0xff,0x06
+
+# GFX90A: global_load_dwordx3 a[252:254], v[2:3], off offset:-1 ; encoding: [0xff,0x9f,0x58,0xdc,0x02,0x00,0xff,0xfc]
+0xff,0x9f,0x58,0xdc,0x02,0x00,0xff,0xfc
+
+# GFX90A: global_load_dwordx3 a[6:8], v[2:3], off ; encoding: [0x00,0x80,0x58,0xdc,0x02,0x00,0xff,0x06]
+0x00,0x80,0x58,0xdc,0x02,0x00,0xff,0x06
+
+# GFX90A: global_load_dwordx4 a[6:9], v[2:3], off offset:-1 ; encoding: [0xff,0x9f,0x5c,0xdc,0x02,0x00,0xff,0x06]
+0xff,0x9f,0x5c,0xdc,0x02,0x00,0xff,0x06
+
+# GFX90A: global_load_dwordx4 a[252:255], v[2:3], off offset:-1 ; encoding: [0xff,0x9f,0x5c,0xdc,0x02,0x00,0xff,0xfc]
+0xff,0x9f,0x5c,0xdc,0x02,0x00,0xff,0xfc
+
+# GFX90A: global_load_dwordx4 a[6:9], v[2:3], off ; encoding: [0x00,0x80,0x5c,0xdc,0x02,0x00,0xff,0x06]
+0x00,0x80,0x5c,0xdc,0x02,0x00,0xff,0x06
+
+# GFX90A: global_store_byte v[2:3], a2, off offset:-1 ; encoding: [0xff,0x9f,0x60,0xdc,0x02,0x02,0xff,0x00]
+0xff,0x9f,0x60,0xdc,0x02,0x02,0xff,0x00
+
+# GFX90A: global_store_byte v[2:3], a255, off offset:-1 ; encoding: [0xff,0x9f,0x60,0xdc,0x02,0xff,0xff,0x00]
+0xff,0x9f,0x60,0xdc,0x02,0xff,0xff,0x00
+
+# GFX90A: global_store_byte v[2:3], a2, off ; encoding: [0x00,0x80,0x60,0xdc,0x02,0x02,0xff,0x00]
+0x00,0x80,0x60,0xdc,0x02,0x02,0xff,0x00
+
+# GFX90A: global_store_byte_d16_hi v[2:3], a2, off offset:-1 ; encoding: [0xff,0x9f,0x64,0xdc,0x02,0x02,0xff,0x00]
+0xff,0x9f,0x64,0xdc,0x02,0x02,0xff,0x00
+
+# GFX90A: global_store_byte_d16_hi v[2:3], a255, off offset:-1 ; encoding: [0xff,0x9f,0x64,0xdc,0x02,0xff,0xff,0x00]
+0xff,0x9f,0x64,0xdc,0x02,0xff,0xff,0x00
+
+# GFX90A: global_store_byte_d16_hi v[2:3], a2, off ; encoding: [0x00,0x80,0x64,0xdc,0x02,0x02,0xff,0x00]
+0x00,0x80,0x64,0xdc,0x02,0x02,0xff,0x00
+
+# GFX90A: global_store_short v[2:3], a2, off offset:-1 ; encoding: [0xff,0x9f,0x68,0xdc,0x02,0x02,0xff,0x00]
+0xff,0x9f,0x68,0xdc,0x02,0x02,0xff,0x00
+
+# GFX90A: global_store_short v[2:3], a255, off offset:-1 ; encoding: [0xff,0x9f,0x68,0xdc,0x02,0xff,0xff,0x00]
+0xff,0x9f,0x68,0xdc,0x02,0xff,0xff,0x00
+
+# GFX90A: global_store_short v[2:3], a2, off ; encoding: [0x00,0x80,0x68,0xdc,0x02,0x02,0xff,0x00]
+0x00,0x80,0x68,0xdc,0x02,0x02,0xff,0x00
+
+# GFX90A: global_store_short_d16_hi v[2:3], a2, off offset:-1 ; encoding: [0xff,0x9f,0x6c,0xdc,0x02,0x02,0xff,0x00]
+0xff,0x9f,0x6c,0xdc,0x02,0x02,0xff,0x00
+
+# GFX90A: global_store_short_d16_hi v[2:3], a255, off offset:-1 ; encoding: [0xff,0x9f,0x6c,0xdc,0x02,0xff,0xff,0x00]
+0xff,0x9f,0x6c,0xdc,0x02,0xff,0xff,0x00
+
+# GFX90A: global_store_short_d16_hi v[2:3], a2, off ; encoding: [0x00,0x80,0x6c,0xdc,0x02,0x02,0xff,0x00]
+0x00,0x80,0x6c,0xdc,0x02,0x02,0xff,0x00
+
+# GFX90A: global_store_dword v[2:3], a2, off offset:-1 ; encoding: [0xff,0x9f,0x70,0xdc,0x02,0x02,0xff,0x00]
+0xff,0x9f,0x70,0xdc,0x02,0x02,0xff,0x00
+
+# GFX90A: global_store_dword v[2:3], a255, off offset:-1 ; encoding: [0xff,0x9f,0x70,0xdc,0x02,0xff,0xff,0x00]
+0xff,0x9f,0x70,0xdc,0x02,0xff,0xff,0x00
+
+# GFX90A: global_store_dword v[2:3], a2, off ; encoding: [0x00,0x80,0x70,0xdc,0x02,0x02,0xff,0x00]
+0x00,0x80,0x70,0xdc,0x02,0x02,0xff,0x00
+
+# GFX90A: global_store_dwordx2 v[2:3], a[2:3], off offset:-1 ; encoding: [0xff,0x9f,0x74,0xdc,0x02,0x02,0xff,0x00]
+0xff,0x9f,0x74,0xdc,0x02,0x02,0xff,0x00
+
+# GFX90A: global_store_dwordx2 v[2:3], a[254:255], off offset:-1 ; encoding: [0xff,0x9f,0x74,0xdc,0x02,0xfe,0xff,0x00]
+0xff,0x9f,0x74,0xdc,0x02,0xfe,0xff,0x00
+
+# GFX90A: global_store_dwordx2 v[2:3], a[2:3], off ; encoding: [0x00,0x80,0x74,0xdc,0x02,0x02,0xff,0x00]
+0x00,0x80,0x74,0xdc,0x02,0x02,0xff,0x00
+
+# GFX90A: global_store_dwordx3 v[2:3], a[2:4], off offset:-1 ; encoding: [0xff,0x9f,0x78,0xdc,0x02,0x02,0xff,0x00]
+0xff,0x9f,0x78,0xdc,0x02,0x02,0xff,0x00
+
+# GFX90A: global_store_dwordx3 v[2:3], a[252:254], off offset:-1 ; encoding: [0xff,0x9f,0x78,0xdc,0x02,0xfc,0xff,0x00]
+0xff,0x9f,0x78,0xdc,0x02,0xfc,0xff,0x00
+
+# GFX90A: global_store_dwordx3 v[2:3], a[2:4], off ; encoding: [0x00,0x80,0x78,0xdc,0x02,0x02,0xff,0x00]
+0x00,0x80,0x78,0xdc,0x02,0x02,0xff,0x00
+
+# GFX90A: global_store_dwordx4 v[2:3], a[2:5], off offset:-1 ; encoding: [0xff,0x9f,0x7c,0xdc,0x02,0x02,0xff,0x00]
+0xff,0x9f,0x7c,0xdc,0x02,0x02,0xff,0x00
+
+# GFX90A: global_store_dwordx4 v[2:3], a[252:255], off offset:-1 ; encoding: [0xff,0x9f,0x7c,0xdc,0x02,0xfc,0xff,0x00]
+0xff,0x9f,0x7c,0xdc,0x02,0xfc,0xff,0x00
+
+# GFX90A: global_store_dwordx4 v[2:3], a[2:5], off ; encoding: [0x00,0x80,0x7c,0xdc,0x02,0x02,0xff,0x00]
+0x00,0x80,0x7c,0xdc,0x02,0x02,0xff,0x00
+
+# GFX90A: global_load_ubyte_d16 a5, v[2:3], off offset:-1 ; encoding: [0xff,0x9f,0x80,0xdc,0x02,0x00,0xff,0x05]
+0xff,0x9f,0x80,0xdc,0x02,0x00,0xff,0x05
+
+# GFX90A: global_load_ubyte_d16 a255, v[2:3], off offset:-1 ; encoding: [0xff,0x9f,0x80,0xdc,0x02,0x00,0xff,0xff]
+0xff,0x9f,0x80,0xdc,0x02,0x00,0xff,0xff
+
+# GFX90A: global_load_ubyte_d16 a5, v[2:3], off ; encoding: [0x00,0x80,0x80,0xdc,0x02,0x00,0xff,0x05]
+0x00,0x80,0x80,0xdc,0x02,0x00,0xff,0x05
+
+# GFX90A: global_load_ubyte_d16_hi a5, v[2:3], off offset:-1 ; encoding: [0xff,0x9f,0x84,0xdc,0x02,0x00,0xff,0x05]
+0xff,0x9f,0x84,0xdc,0x02,0x00,0xff,0x05
+
+# GFX90A: global_load_ubyte_d16_hi a255, v[2:3], off offset:-1 ; encoding: [0xff,0x9f,0x84,0xdc,0x02,0x00,0xff,0xff]
+0xff,0x9f,0x84,0xdc,0x02,0x00,0xff,0xff
+
+# GFX90A: global_load_ubyte_d16_hi a5, v[2:3], off ; encoding: [0x00,0x80,0x84,0xdc,0x02,0x00,0xff,0x05]
+0x00,0x80,0x84,0xdc,0x02,0x00,0xff,0x05
+
+# GFX90A: global_load_sbyte_d16 a5, v[2:3], off offset:-1 ; encoding: [0xff,0x9f,0x88,0xdc,0x02,0x00,0xff,0x05]
+0xff,0x9f,0x88,0xdc,0x02,0x00,0xff,0x05
+
+# GFX90A: global_load_sbyte_d16 a255, v[2:3], off offset:-1 ; encoding: [0xff,0x9f,0x88,0xdc,0x02,0x00,0xff,0xff]
+0xff,0x9f,0x88,0xdc,0x02,0x00,0xff,0xff
+
+# GFX90A: global_load_sbyte_d16 a5, v[2:3], off ; encoding: [0x00,0x80,0x88,0xdc,0x02,0x00,0xff,0x05]
+0x00,0x80,0x88,0xdc,0x02,0x00,0xff,0x05
+
+# GFX90A: global_load_sbyte_d16_hi a5, v[2:3], off offset:-1 ; encoding: [0xff,0x9f,0x8c,0xdc,0x02,0x00,0xff,0x05]
+0xff,0x9f,0x8c,0xdc,0x02,0x00,0xff,0x05
+
+# GFX90A: global_load_sbyte_d16_hi a255, v[2:3], off offset:-1 ; encoding: [0xff,0x9f,0x8c,0xdc,0x02,0x00,0xff,0xff]
+0xff,0x9f,0x8c,0xdc,0x02,0x00,0xff,0xff
+
+# GFX90A: global_load_sbyte_d16_hi a5, v[2:3], off ; encoding: [0x00,0x80,0x8c,0xdc,0x02,0x00,0xff,0x05]
+0x00,0x80,0x8c,0xdc,0x02,0x00,0xff,0x05
+
+# GFX90A: global_load_short_d16 a5, v[2:3], off offset:-1 ; encoding: [0xff,0x9f,0x90,0xdc,0x02,0x00,0xff,0x05]
+0xff,0x9f,0x90,0xdc,0x02,0x00,0xff,0x05
+
+# GFX90A: global_load_short_d16 a255, v[2:3], off offset:-1 ; encoding: [0xff,0x9f,0x90,0xdc,0x02,0x00,0xff,0xff]
+0xff,0x9f,0x90,0xdc,0x02,0x00,0xff,0xff
+
+# GFX90A: global_load_short_d16 a5, v[2:3], off ; encoding: [0x00,0x80,0x90,0xdc,0x02,0x00,0xff,0x05]
+0x00,0x80,0x90,0xdc,0x02,0x00,0xff,0x05
+
+# GFX90A: global_load_short_d16_hi a5, v[2:3], off offset:-1 ; encoding: [0xff,0x9f,0x94,0xdc,0x02,0x00,0xff,0x05]
+0xff,0x9f,0x94,0xdc,0x02,0x00,0xff,0x05
+
+# GFX90A: global_load_short_d16_hi a255, v[2:3], off offset:-1 ; encoding: [0xff,0x9f,0x94,0xdc,0x02,0x00,0xff,0xff]
+0xff,0x9f,0x94,0xdc,0x02,0x00,0xff,0xff
+
+# GFX90A: global_load_short_d16_hi a5, v[2:3], off ; encoding: [0x00,0x80,0x94,0xdc,0x02,0x00,0xff,0x05]
+0x00,0x80,0x94,0xdc,0x02,0x00,0xff,0x05
+
+# GFX90A: global_atomic_swap a1, v[2:3], a2, off glc ; encoding: [0x00,0x80,0x01,0xdd,0x02,0x02,0xff,0x01]
+0x00,0x80,0x01,0xdd,0x02,0x02,0xff,0x01
+
+# GFX90A: global_atomic_cmpswap a1, v[2:3], a[2:3], off glc ; encoding: [0x00,0x80,0x05,0xdd,0x02,0x02,0xff,0x01]
+0x00,0x80,0x05,0xdd,0x02,0x02,0xff,0x01
+
+# GFX90A: global_atomic_add a1, v[2:3], a2, off glc ; encoding: [0x00,0x80,0x09,0xdd,0x02,0x02,0xff,0x01]
+0x00,0x80,0x09,0xdd,0x02,0x02,0xff,0x01
+
+# GFX90A: global_atomic_sub a1, v[2:3], a2, off glc ; encoding: [0x00,0x80,0x0d,0xdd,0x02,0x02,0xff,0x01]
+0x00,0x80,0x0d,0xdd,0x02,0x02,0xff,0x01
+
+# GFX90A: global_atomic_smin a1, v[2:3], a2, off glc ; encoding: [0x00,0x80,0x11,0xdd,0x02,0x02,0xff,0x01]
+0x00,0x80,0x11,0xdd,0x02,0x02,0xff,0x01
+
+# GFX90A: global_atomic_umin a1, v[2:3], a2, off glc ; encoding: [0x00,0x80,0x15,0xdd,0x02,0x02,0xff,0x01]
+0x00,0x80,0x15,0xdd,0x02,0x02,0xff,0x01
+
+# GFX90A: global_atomic_smax a1, v[2:3], a2, off glc ; encoding: [0x00,0x80,0x19,0xdd,0x02,0x02,0xff,0x01]
+0x00,0x80,0x19,0xdd,0x02,0x02,0xff,0x01
+
+# GFX90A: global_atomic_umax a1, v[2:3], a2, off glc ; encoding: [0x00,0x80,0x1d,0xdd,0x02,0x02,0xff,0x01]
+0x00,0x80,0x1d,0xdd,0x02,0x02,0xff,0x01
+
+# GFX90A: global_atomic_and a1, v[2:3], a2, off glc ; encoding: [0x00,0x80,0x21,0xdd,0x02,0x02,0xff,0x01]
+0x00,0x80,0x21,0xdd,0x02,0x02,0xff,0x01
+
+# GFX90A: global_atomic_or a1, v[2:3], a2, off glc ; encoding: [0x00,0x80,0x25,0xdd,0x02,0x02,0xff,0x01]
+0x00,0x80,0x25,0xdd,0x02,0x02,0xff,0x01
+
+# GFX90A: global_atomic_xor a1, v[2:3], a2, off glc ; encoding: [0x00,0x80,0x29,0xdd,0x02,0x02,0xff,0x01]
+0x00,0x80,0x29,0xdd,0x02,0x02,0xff,0x01
+
+# GFX90A: global_atomic_inc a1, v[2:3], a2, off glc ; encoding: [0x00,0x80,0x2d,0xdd,0x02,0x02,0xff,0x01]
+0x00,0x80,0x2d,0xdd,0x02,0x02,0xff,0x01
+
+# GFX90A: global_atomic_dec a1, v[2:3], a2, off glc ; encoding: [0x00,0x80,0x31,0xdd,0x02,0x02,0xff,0x01]
+0x00,0x80,0x31,0xdd,0x02,0x02,0xff,0x01
+
+# GFX90A: global_atomic_swap_x2 a[2:3], v[2:3], a[2:3], off glc ; encoding: [0x00,0x80,0x81,0xdd,0x02,0x02,0xff,0x02]
+0x00,0x80,0x81,0xdd,0x02,0x02,0xff,0x02
+
+# GFX90A: global_atomic_cmpswap_x2 a[2:3], v[2:3], a[2:5], off glc ; encoding: [0x00,0x80,0x85,0xdd,0x02,0x02,0xff,0x02]
+0x00,0x80,0x85,0xdd,0x02,0x02,0xff,0x02
+
+# GFX90A: global_atomic_add_x2 a[2:3], v[2:3], a[2:3], off glc ; encoding: [0x00,0x80,0x89,0xdd,0x02,0x02,0xff,0x02]
+0x00,0x80,0x89,0xdd,0x02,0x02,0xff,0x02
+
+# GFX90A: global_atomic_sub_x2 a[2:3], v[2:3], a[2:3], off glc ; encoding: [0x00,0x80,0x8d,0xdd,0x02,0x02,0xff,0x02]
+0x00,0x80,0x8d,0xdd,0x02,0x02,0xff,0x02
+
+# GFX90A: global_atomic_smin_x2 a[2:3], v[2:3], a[2:3], off glc ; encoding: [0x00,0x80,0x91,0xdd,0x02,0x02,0xff,0x02]
+0x00,0x80,0x91,0xdd,0x02,0x02,0xff,0x02
+
+# GFX90A: global_atomic_umin_x2 a[2:3], v[2:3], a[2:3], off glc ; encoding: [0x00,0x80,0x95,0xdd,0x02,0x02,0xff,0x02]
+0x00,0x80,0x95,0xdd,0x02,0x02,0xff,0x02
+
+# GFX90A: global_atomic_smax_x2 a[2:3], v[2:3], a[2:3], off glc ; encoding: [0x00,0x80,0x99,0xdd,0x02,0x02,0xff,0x02]
+0x00,0x80,0x99,0xdd,0x02,0x02,0xff,0x02
+
+# GFX90A: global_atomic_umax_x2 a[2:3], v[2:3], a[2:3], off glc ; encoding: [0x00,0x80,0x9d,0xdd,0x02,0x02,0xff,0x02]
+0x00,0x80,0x9d,0xdd,0x02,0x02,0xff,0x02
+
+# GFX90A: global_atomic_and_x2 a[2:3], v[2:3], a[2:3], off glc ; encoding: [0x00,0x80,0xa1,0xdd,0x02,0x02,0xff,0x02]
+0x00,0x80,0xa1,0xdd,0x02,0x02,0xff,0x02
+
+# GFX90A: global_atomic_or_x2 a[2:3], v[2:3], a[2:3], off glc ; encoding: [0x00,0x80,0xa5,0xdd,0x02,0x02,0xff,0x02]
+0x00,0x80,0xa5,0xdd,0x02,0x02,0xff,0x02
+
+# GFX90A: global_atomic_xor_x2 a[2:3], v[2:3], a[2:3], off glc ; encoding: [0x00,0x80,0xa9,0xdd,0x02,0x02,0xff,0x02]
+0x00,0x80,0xa9,0xdd,0x02,0x02,0xff,0x02
+
+# GFX90A: global_atomic_inc_x2 a[2:3], v[2:3], a[2:3], off glc ; encoding: [0x00,0x80,0xad,0xdd,0x02,0x02,0xff,0x02]
+0x00,0x80,0xad,0xdd,0x02,0x02,0xff,0x02
+
+# GFX90A: global_atomic_dec_x2 a[2:3], v[2:3], a[2:3], off glc ; encoding: [0x00,0x80,0xb1,0xdd,0x02,0x02,0xff,0x02]
+0x00,0x80,0xb1,0xdd,0x02,0x02,0xff,0x02
+
+# GFX90A: global_atomic_swap v[2:3], a2, off ; encoding: [0x00,0x80,0x00,0xdd,0x02,0x02,0xff,0x00]
+0x00,0x80,0x00,0xdd,0x02,0x02,0xff,0x00
+
+# GFX90A: global_atomic_cmpswap v[2:3], a[2:3], off ; encoding: [0x00,0x80,0x04,0xdd,0x02,0x02,0xff,0x00]
+0x00,0x80,0x04,0xdd,0x02,0x02,0xff,0x00
+
+# GFX90A: global_atomic_add v[2:3], a2, off ; encoding: [0x00,0x80,0x08,0xdd,0x02,0x02,0xff,0x00]
+0x00,0x80,0x08,0xdd,0x02,0x02,0xff,0x00
+
+# GFX90A: global_atomic_sub v[2:3], a2, off ; encoding: [0x00,0x80,0x0c,0xdd,0x02,0x02,0xff,0x00]
+0x00,0x80,0x0c,0xdd,0x02,0x02,0xff,0x00
+
+# GFX90A: global_atomic_smin v[2:3], a2, off ; encoding: [0x00,0x80,0x10,0xdd,0x02,0x02,0xff,0x00]
+0x00,0x80,0x10,0xdd,0x02,0x02,0xff,0x00
+
+# GFX90A: global_atomic_umin v[2:3], a2, off ; encoding: [0x00,0x80,0x14,0xdd,0x02,0x02,0xff,0x00]
+0x00,0x80,0x14,0xdd,0x02,0x02,0xff,0x00
+
+# GFX90A: global_atomic_smax v[2:3], a2, off ; encoding: [0x00,0x80,0x18,0xdd,0x02,0x02,0xff,0x00]
+0x00,0x80,0x18,0xdd,0x02,0x02,0xff,0x00
+
+# GFX90A: global_atomic_umax v[2:3], a2, off ; encoding: [0x00,0x80,0x1c,0xdd,0x02,0x02,0xff,0x00]
+0x00,0x80,0x1c,0xdd,0x02,0x02,0xff,0x00
+
+# GFX90A: global_atomic_and v[2:3], a2, off ; encoding: [0x00,0x80,0x20,0xdd,0x02,0x02,0xff,0x00]
+0x00,0x80,0x20,0xdd,0x02,0x02,0xff,0x00
+
+# GFX90A: global_atomic_or v[2:3], a2, off ; encoding: [0x00,0x80,0x24,0xdd,0x02,0x02,0xff,0x00]
+0x00,0x80,0x24,0xdd,0x02,0x02,0xff,0x00
+
+# GFX90A: global_atomic_xor v[2:3], a2, off ; encoding: [0x00,0x80,0x28,0xdd,0x02,0x02,0xff,0x00]
+0x00,0x80,0x28,0xdd,0x02,0x02,0xff,0x00
+
+# GFX90A: global_atomic_inc v[2:3], a2, off ; encoding: [0x00,0x80,0x2c,0xdd,0x02,0x02,0xff,0x00]
+0x00,0x80,0x2c,0xdd,0x02,0x02,0xff,0x00
+
+# GFX90A: global_atomic_dec v[2:3], a2, off ; encoding: [0x00,0x80,0x30,0xdd,0x02,0x02,0xff,0x00]
+0x00,0x80,0x30,0xdd,0x02,0x02,0xff,0x00
+
+# GFX90A: global_atomic_swap_x2 v[2:3], a[2:3], off ; encoding: [0x00,0x80,0x80,0xdd,0x02,0x02,0xff,0x00]
+0x00,0x80,0x80,0xdd,0x02,0x02,0xff,0x00
+
+# GFX90A: global_atomic_cmpswap_x2 v[2:3], a[2:5], off ; encoding: [0x00,0x80,0x84,0xdd,0x02,0x02,0xff,0x00]
+0x00,0x80,0x84,0xdd,0x02,0x02,0xff,0x00
+
+# GFX90A: global_atomic_add_x2 v[2:3], a[2:3], off ; encoding: [0x00,0x80,0x88,0xdd,0x02,0x02,0xff,0x00]
+0x00,0x80,0x88,0xdd,0x02,0x02,0xff,0x00
+
+# GFX90A: global_atomic_sub_x2 v[2:3], a[2:3], off ; encoding: [0x00,0x80,0x8c,0xdd,0x02,0x02,0xff,0x00]
+0x00,0x80,0x8c,0xdd,0x02,0x02,0xff,0x00
+
+# GFX90A: global_atomic_smin_x2 v[2:3], a[2:3], off ; encoding: [0x00,0x80,0x90,0xdd,0x02,0x02,0xff,0x00]
+0x00,0x80,0x90,0xdd,0x02,0x02,0xff,0x00
+
+# GFX90A: global_atomic_umin_x2 v[2:3], a[2:3], off ; encoding: [0x00,0x80,0x94,0xdd,0x02,0x02,0xff,0x00]
+0x00,0x80,0x94,0xdd,0x02,0x02,0xff,0x00
+
+# GFX90A: global_atomic_smax_x2 v[2:3], a[2:3], off ; encoding: [0x00,0x80,0x98,0xdd,0x02,0x02,0xff,0x00]
+0x00,0x80,0x98,0xdd,0x02,0x02,0xff,0x00
+
+# GFX90A: global_atomic_umax_x2 v[2:3], a[2:3], off ; encoding: [0x00,0x80,0x9c,0xdd,0x02,0x02,0xff,0x00]
+0x00,0x80,0x9c,0xdd,0x02,0x02,0xff,0x00
+
+# GFX90A: global_atomic_and_x2 v[2:3], a[2:3], off ; encoding: [0x00,0x80,0xa0,0xdd,0x02,0x02,0xff,0x00]
+0x00,0x80,0xa0,0xdd,0x02,0x02,0xff,0x00
+
+# GFX90A: global_atomic_or_x2 v[2:3], a[2:3], off ; encoding: [0x00,0x80,0xa4,0xdd,0x02,0x02,0xff,0x00]
+0x00,0x80,0xa4,0xdd,0x02,0x02,0xff,0x00
+
+# GFX90A: global_atomic_xor_x2 v[2:3], a[2:3], off ; encoding: [0x00,0x80,0xa8,0xdd,0x02,0x02,0xff,0x00]
+0x00,0x80,0xa8,0xdd,0x02,0x02,0xff,0x00
+
+# GFX90A: global_atomic_inc_x2 v[2:3], a[2:3], off ; encoding: [0x00,0x80,0xac,0xdd,0x02,0x02,0xff,0x00]
+0x00,0x80,0xac,0xdd,0x02,0x02,0xff,0x00
+
+# GFX90A: global_atomic_dec_x2 v[2:3], a[2:3], off ; encoding: [0x00,0x80,0xb0,0xdd,0x02,0x02,0xff,0x00]
+0x00,0x80,0xb0,0xdd,0x02,0x02,0xff,0x00
+
+# GFX90A: scratch_load_ubyte a5, off, s2 offset:-1 ; encoding: [0xff,0x5f,0x40,0xdc,0x00,0x00,0x82,0x05]
+0xff,0x5f,0x40,0xdc,0x00,0x00,0x82,0x05
+
+# GFX90A: scratch_load_ubyte a255, off, s2 offset:-1 ; encoding: [0xff,0x5f,0x40,0xdc,0x00,0x00,0x82,0xff]
+0xff,0x5f,0x40,0xdc,0x00,0x00,0x82,0xff
+
+# GFX90A: scratch_load_ubyte a5, off, s101 offset:-1 ; encoding: [0xff,0x5f,0x40,0xdc,0x00,0x00,0xe5,0x05]
+0xff,0x5f,0x40,0xdc,0x00,0x00,0xe5,0x05
+
+# GFX90A: scratch_load_ubyte a5, off, flat_scratch_lo offset:-1 ; encoding: [0xff,0x5f,0x40,0xdc,0x00,0x00,0xe6,0x05]
+0xff,0x5f,0x40,0xdc,0x00,0x00,0xe6,0x05
+
+# GFX90A: scratch_load_ubyte a5, off, flat_scratch_hi offset:-1 ; encoding: [0xff,0x5f,0x40,0xdc,0x00,0x00,0xe7,0x05]
+0xff,0x5f,0x40,0xdc,0x00,0x00,0xe7,0x05
+
+# GFX90A: scratch_load_ubyte a5, off, vcc_lo offset:-1 ; encoding: [0xff,0x5f,0x40,0xdc,0x00,0x00,0xea,0x05]
+0xff,0x5f,0x40,0xdc,0x00,0x00,0xea,0x05
+
+# GFX90A: scratch_load_ubyte a5, off, vcc_hi offset:-1 ; encoding: [0xff,0x5f,0x40,0xdc,0x00,0x00,0xeb,0x05]
+0xff,0x5f,0x40,0xdc,0x00,0x00,0xeb,0x05
+
+# GFX90A: scratch_load_ubyte a5, v0, off offset:-1 ; encoding: [0xff,0x5f,0x40,0xdc,0x00,0x00,0xff,0x05]
+0xff,0x5f,0x40,0xdc,0x00,0x00,0xff,0x05
+
+# GFX90A: scratch_load_ubyte a5, off, s2  ; encoding: [0x00,0x40,0x40,0xdc,0x00,0x00,0x82,0x05]
+0x00,0x40,0x40,0xdc,0x00,0x00,0x82,0x05
+
+# GFX90A: scratch_load_ubyte a5, off, s2  ; encoding: [0x00,0x40,0x40,0xdc,0x00,0x00,0x82,0x05]
+0x00,0x40,0x40,0xdc,0x00,0x00,0x82,0x05
+
+# GFX90A: scratch_load_ubyte a5, off, s2 offset:4095 ; encoding: [0xff,0x4f,0x40,0xdc,0x00,0x00,0x82,0x05]
+0xff,0x4f,0x40,0xdc,0x00,0x00,0x82,0x05
+
+# GFX90A: scratch_load_ubyte a5, off, s2 offset:-4096 ; encoding: [0x00,0x50,0x40,0xdc,0x00,0x00,0x82,0x05]
+0x00,0x50,0x40,0xdc,0x00,0x00,0x82,0x05
+
+# GFX90A: scratch_load_ubyte a5, off, s2 offset:-1 glc ; encoding: [0xff,0x5f,0x41,0xdc,0x00,0x00,0x82,0x05]
+0xff,0x5f,0x41,0xdc,0x00,0x00,0x82,0x05
+
+# GFX90A: scratch_load_ubyte a5, off, s2 offset:-1 slc ; encoding: [0xff,0x5f,0x42,0xdc,0x00,0x00,0x82,0x05]
+0xff,0x5f,0x42,0xdc,0x00,0x00,0x82,0x05
+
+# GFX90A: scratch_load_sbyte a5, off, s2 offset:-1 ; encoding: [0xff,0x5f,0x44,0xdc,0x00,0x00,0x82,0x05]
+0xff,0x5f,0x44,0xdc,0x00,0x00,0x82,0x05
+
+# GFX90A: scratch_load_sbyte a255, off, s2 offset:-1 ; encoding: [0xff,0x5f,0x44,0xdc,0x00,0x00,0x82,0xff]
+0xff,0x5f,0x44,0xdc,0x00,0x00,0x82,0xff
+
+# GFX90A: scratch_load_sbyte a5, off, s101 offset:-1 ; encoding: [0xff,0x5f,0x44,0xdc,0x00,0x00,0xe5,0x05]
+0xff,0x5f,0x44,0xdc,0x00,0x00,0xe5,0x05
+
+# GFX90A: scratch_load_sbyte a5, off, flat_scratch_lo offset:-1 ; encoding: [0xff,0x5f,0x44,0xdc,0x00,0x00,0xe6,0x05]
+0xff,0x5f,0x44,0xdc,0x00,0x00,0xe6,0x05
+
+# GFX90A: scratch_load_sbyte a5, off, flat_scratch_hi offset:-1 ; encoding: [0xff,0x5f,0x44,0xdc,0x00,0x00,0xe7,0x05]
+0xff,0x5f,0x44,0xdc,0x00,0x00,0xe7,0x05
+
+# GFX90A: scratch_load_sbyte a5, off, vcc_lo offset:-1 ; encoding: [0xff,0x5f,0x44,0xdc,0x00,0x00,0xea,0x05]
+0xff,0x5f,0x44,0xdc,0x00,0x00,0xea,0x05
+
+# GFX90A: scratch_load_sbyte a5, off, vcc_hi offset:-1 ; encoding: [0xff,0x5f,0x44,0xdc,0x00,0x00,0xeb,0x05]
+0xff,0x5f,0x44,0xdc,0x00,0x00,0xeb,0x05
+
+# GFX90A: scratch_load_sbyte a5, v0, off offset:-1 ; encoding: [0xff,0x5f,0x44,0xdc,0x00,0x00,0xff,0x05]
+0xff,0x5f,0x44,0xdc,0x00,0x00,0xff,0x05
+
+# GFX90A: scratch_load_sbyte a5, off, s2  ; encoding: [0x00,0x40,0x44,0xdc,0x00,0x00,0x82,0x05]
+0x00,0x40,0x44,0xdc,0x00,0x00,0x82,0x05
+
+# GFX90A: scratch_load_sbyte a5, off, s2  ; encoding: [0x00,0x40,0x44,0xdc,0x00,0x00,0x82,0x05]
+0x00,0x40,0x44,0xdc,0x00,0x00,0x82,0x05
+
+# GFX90A: scratch_load_sbyte a5, off, s2 offset:4095 ; encoding: [0xff,0x4f,0x44,0xdc,0x00,0x00,0x82,0x05]
+0xff,0x4f,0x44,0xdc,0x00,0x00,0x82,0x05
+
+# GFX90A: scratch_load_sbyte a5, off, s2 offset:-4096 ; encoding: [0x00,0x50,0x44,0xdc,0x00,0x00,0x82,0x05]
+0x00,0x50,0x44,0xdc,0x00,0x00,0x82,0x05
+
+# GFX90A: scratch_load_sbyte a5, off, s2 offset:-1 glc ; encoding: [0xff,0x5f,0x45,0xdc,0x00,0x00,0x82,0x05]
+0xff,0x5f,0x45,0xdc,0x00,0x00,0x82,0x05
+
+# GFX90A: scratch_load_sbyte a5, off, s2 offset:-1 slc ; encoding: [0xff,0x5f,0x46,0xdc,0x00,0x00,0x82,0x05]
+0xff,0x5f,0x46,0xdc,0x00,0x00,0x82,0x05
+
+# GFX90A: scratch_load_ushort a5, off, s2 offset:-1 ; encoding: [0xff,0x5f,0x48,0xdc,0x00,0x00,0x82,0x05]
+0xff,0x5f,0x48,0xdc,0x00,0x00,0x82,0x05
+
+# GFX90A: scratch_load_ushort a255, off, s2 offset:-1 ; encoding: [0xff,0x5f,0x48,0xdc,0x00,0x00,0x82,0xff]
+0xff,0x5f,0x48,0xdc,0x00,0x00,0x82,0xff
+
+# GFX90A: scratch_load_ushort a5, off, s101 offset:-1 ; encoding: [0xff,0x5f,0x48,0xdc,0x00,0x00,0xe5,0x05]
+0xff,0x5f,0x48,0xdc,0x00,0x00,0xe5,0x05
+
+# GFX90A: scratch_load_ushort a5, off, flat_scratch_lo offset:-1 ; encoding: [0xff,0x5f,0x48,0xdc,0x00,0x00,0xe6,0x05]
+0xff,0x5f,0x48,0xdc,0x00,0x00,0xe6,0x05
+
+# GFX90A: scratch_load_ushort a5, off, flat_scratch_hi offset:-1 ; encoding: [0xff,0x5f,0x48,0xdc,0x00,0x00,0xe7,0x05]
+0xff,0x5f,0x48,0xdc,0x00,0x00,0xe7,0x05
+
+# GFX90A: scratch_load_ushort a5, off, vcc_lo offset:-1 ; encoding: [0xff,0x5f,0x48,0xdc,0x00,0x00,0xea,0x05]
+0xff,0x5f,0x48,0xdc,0x00,0x00,0xea,0x05
+
+# GFX90A: scratch_load_ushort a5, off, vcc_hi offset:-1 ; encoding: [0xff,0x5f,0x48,0xdc,0x00,0x00,0xeb,0x05]
+0xff,0x5f,0x48,0xdc,0x00,0x00,0xeb,0x05
+
+# GFX90A: scratch_load_ushort a5, v0, off offset:-1 ; encoding: [0xff,0x5f,0x48,0xdc,0x00,0x00,0xff,0x05]
+0xff,0x5f,0x48,0xdc,0x00,0x00,0xff,0x05
+
+# GFX90A: scratch_load_ushort a5, off, s2 ; encoding: [0x00,0x40,0x48,0xdc,0x00,0x00,0x82,0x05]
+0x00,0x40,0x48,0xdc,0x00,0x00,0x82,0x05
+
+# GFX90A: scratch_load_ushort a5, off, s2 ; encoding: [0x00,0x40,0x48,0xdc,0x00,0x00,0x82,0x05]
+0x00,0x40,0x48,0xdc,0x00,0x00,0x82,0x05
+
+# GFX90A: scratch_load_ushort a5, off, s2 offset:4095 ; encoding: [0xff,0x4f,0x48,0xdc,0x00,0x00,0x82,0x05]
+0xff,0x4f,0x48,0xdc,0x00,0x00,0x82,0x05
+
+# GFX90A: scratch_load_ushort a5, off, s2 offset:-4096 ; encoding: [0x00,0x50,0x48,0xdc,0x00,0x00,0x82,0x05]
+0x00,0x50,0x48,0xdc,0x00,0x00,0x82,0x05
+
+# GFX90A: scratch_load_ushort a5, off, s2 offset:-1 glc ; encoding: [0xff,0x5f,0x49,0xdc,0x00,0x00,0x82,0x05]
+0xff,0x5f,0x49,0xdc,0x00,0x00,0x82,0x05
+
+# GFX90A: scratch_load_ushort a5, off, s2 offset:-1 slc ; encoding: [0xff,0x5f,0x4a,0xdc,0x00,0x00,0x82,0x05]
+0xff,0x5f,0x4a,0xdc,0x00,0x00,0x82,0x05
+
+# GFX90A: scratch_load_sshort a5, off, s2 offset:-1 ; encoding: [0xff,0x5f,0x4c,0xdc,0x00,0x00,0x82,0x05]
+0xff,0x5f,0x4c,0xdc,0x00,0x00,0x82,0x05
+
+# GFX90A: scratch_load_sshort a255, off, s2 offset:-1 ; encoding: [0xff,0x5f,0x4c,0xdc,0x00,0x00,0x82,0xff]
+0xff,0x5f,0x4c,0xdc,0x00,0x00,0x82,0xff
+
+# GFX90A: scratch_load_sshort a5, off, s101 offset:-1 ; encoding: [0xff,0x5f,0x4c,0xdc,0x00,0x00,0xe5,0x05]
+0xff,0x5f,0x4c,0xdc,0x00,0x00,0xe5,0x05
+
+# GFX90A: scratch_load_sshort a5, off, flat_scratch_lo offset:-1 ; encoding: [0xff,0x5f,0x4c,0xdc,0x00,0x00,0xe6,0x05]
+0xff,0x5f,0x4c,0xdc,0x00,0x00,0xe6,0x05
+
+# GFX90A: scratch_load_sshort a5, off, flat_scratch_hi offset:-1 ; encoding: [0xff,0x5f,0x4c,0xdc,0x00,0x00,0xe7,0x05]
+0xff,0x5f,0x4c,0xdc,0x00,0x00,0xe7,0x05
+
+# GFX90A: scratch_load_sshort a5, off, vcc_lo offset:-1 ; encoding: [0xff,0x5f,0x4c,0xdc,0x00,0x00,0xea,0x05]
+0xff,0x5f,0x4c,0xdc,0x00,0x00,0xea,0x05
+
+# GFX90A: scratch_load_sshort a5, off, vcc_hi offset:-1 ; encoding: [0xff,0x5f,0x4c,0xdc,0x00,0x00,0xeb,0x05]
+0xff,0x5f,0x4c,0xdc,0x00,0x00,0xeb,0x05
+
+# GFX90A: scratch_load_sshort a5, v0, off offset:-1 ; encoding: [0xff,0x5f,0x4c,0xdc,0x00,0x00,0xff,0x05]
+0xff,0x5f,0x4c,0xdc,0x00,0x00,0xff,0x05
+
+# GFX90A: scratch_load_sshort a5, off, s2 ; encoding: [0x00,0x40,0x4c,0xdc,0x00,0x00,0x82,0x05]
+0x00,0x40,0x4c,0xdc,0x00,0x00,0x82,0x05
+
+# GFX90A: scratch_load_sshort a5, off, s2 ; encoding: [0x00,0x40,0x4c,0xdc,0x00,0x00,0x82,0x05]
+0x00,0x40,0x4c,0xdc,0x00,0x00,0x82,0x05
+
+# GFX90A: scratch_load_sshort a5, off, s2 offset:4095 ; encoding: [0xff,0x4f,0x4c,0xdc,0x00,0x00,0x82,0x05]
+0xff,0x4f,0x4c,0xdc,0x00,0x00,0x82,0x05
+
+# GFX90A: scratch_load_sshort a5, off, s2 offset:-4096 ; encoding: [0x00,0x50,0x4c,0xdc,0x00,0x00,0x82,0x05]
+0x00,0x50,0x4c,0xdc,0x00,0x00,0x82,0x05
+
+# GFX90A: scratch_load_sshort a5, off, s2 offset:-1 glc ; encoding: [0xff,0x5f,0x4d,0xdc,0x00,0x00,0x82,0x05]
+0xff,0x5f,0x4d,0xdc,0x00,0x00,0x82,0x05
+
+# GFX90A: scratch_load_sshort a5, off, s2 offset:-1 slc ; encoding: [0xff,0x5f,0x4e,0xdc,0x00,0x00,0x82,0x05]
+0xff,0x5f,0x4e,0xdc,0x00,0x00,0x82,0x05
+
+# GFX90A: scratch_load_dword a5, off, s2 offset:-1 ; encoding: [0xff,0x5f,0x50,0xdc,0x00,0x00,0x82,0x05]
+0xff,0x5f,0x50,0xdc,0x00,0x00,0x82,0x05
+
+# GFX90A: scratch_load_dword a255, off, s2 offset:-1 ; encoding: [0xff,0x5f,0x50,0xdc,0x00,0x00,0x82,0xff]
+0xff,0x5f,0x50,0xdc,0x00,0x00,0x82,0xff
+
+# GFX90A: scratch_load_dword a5, off, s101 offset:-1 ; encoding: [0xff,0x5f,0x50,0xdc,0x00,0x00,0xe5,0x05]
+0xff,0x5f,0x50,0xdc,0x00,0x00,0xe5,0x05
+
+# GFX90A: scratch_load_dword a5, off, flat_scratch_lo offset:-1 ; encoding: [0xff,0x5f,0x50,0xdc,0x00,0x00,0xe6,0x05]
+0xff,0x5f,0x50,0xdc,0x00,0x00,0xe6,0x05
+
+# GFX90A: scratch_load_dword a5, off, flat_scratch_hi offset:-1 ; encoding: [0xff,0x5f,0x50,0xdc,0x00,0x00,0xe7,0x05]
+0xff,0x5f,0x50,0xdc,0x00,0x00,0xe7,0x05
+
+# GFX90A: scratch_load_dword a5, off, vcc_lo offset:-1 ; encoding: [0xff,0x5f,0x50,0xdc,0x00,0x00,0xea,0x05]
+0xff,0x5f,0x50,0xdc,0x00,0x00,0xea,0x05
+
+# GFX90A: scratch_load_dword a5, off, vcc_hi offset:-1 ; encoding: [0xff,0x5f,0x50,0xdc,0x00,0x00,0xeb,0x05]
+0xff,0x5f,0x50,0xdc,0x00,0x00,0xeb,0x05
+
+# GFX90A: scratch_load_dword a5, v0, off offset:-1 ; encoding: [0xff,0x5f,0x50,0xdc,0x00,0x00,0xff,0x05]
+0xff,0x5f,0x50,0xdc,0x00,0x00,0xff,0x05
+
+# GFX90A: scratch_load_dword a5, off, s2  ; encoding: [0x00,0x40,0x50,0xdc,0x00,0x00,0x82,0x05]
+0x00,0x40,0x50,0xdc,0x00,0x00,0x82,0x05
+
+# GFX90A: scratch_load_dword a5, off, s2  ; encoding: [0x00,0x40,0x50,0xdc,0x00,0x00,0x82,0x05]
+0x00,0x40,0x50,0xdc,0x00,0x00,0x82,0x05
+
+# GFX90A: scratch_load_dword a5, off, s2 offset:4095 ; encoding: [0xff,0x4f,0x50,0xdc,0x00,0x00,0x82,0x05]
+0xff,0x4f,0x50,0xdc,0x00,0x00,0x82,0x05
+
+# GFX90A: scratch_load_dword a5, off, s2 offset:-4096 ; encoding: [0x00,0x50,0x50,0xdc,0x00,0x00,0x82,0x05]
+0x00,0x50,0x50,0xdc,0x00,0x00,0x82,0x05
+
+# GFX90A: scratch_load_dword a5, off, s2 offset:-1 glc ; encoding: [0xff,0x5f,0x51,0xdc,0x00,0x00,0x82,0x05]
+0xff,0x5f,0x51,0xdc,0x00,0x00,0x82,0x05
+
+# GFX90A: scratch_load_dword a5, off, s2 offset:-1 slc ; encoding: [0xff,0x5f,0x52,0xdc,0x00,0x00,0x82,0x05]
+0xff,0x5f,0x52,0xdc,0x00,0x00,0x82,0x05
+
+# GFX90A: scratch_load_dwordx2 a[6:7], off, s2 offset:-1 ; encoding: [0xff,0x5f,0x54,0xdc,0x00,0x00,0x82,0x06]
+0xff,0x5f,0x54,0xdc,0x00,0x00,0x82,0x06
+
+# GFX90A: scratch_load_dwordx2 a[254:255], off, s2 offset:-1 ; encoding: [0xff,0x5f,0x54,0xdc,0x00,0x00,0x82,0xfe]
+0xff,0x5f,0x54,0xdc,0x00,0x00,0x82,0xfe
+
+# GFX90A: scratch_load_dwordx2 a[6:7], off, s101 offset:-1 ; encoding: [0xff,0x5f,0x54,0xdc,0x00,0x00,0xe5,0x06]
+0xff,0x5f,0x54,0xdc,0x00,0x00,0xe5,0x06
+
+# GFX90A: scratch_load_dwordx2 a[6:7], off, flat_scratch_lo offset:-1 ; encoding: [0xff,0x5f,0x54,0xdc,0x00,0x00,0xe6,0x06]
+0xff,0x5f,0x54,0xdc,0x00,0x00,0xe6,0x06
+
+# GFX90A: scratch_load_dwordx2 a[6:7], off, flat_scratch_hi offset:-1 ; encoding: [0xff,0x5f,0x54,0xdc,0x00,0x00,0xe7,0x06]
+0xff,0x5f,0x54,0xdc,0x00,0x00,0xe7,0x06
+
+# GFX90A: scratch_load_dwordx2 a[6:7], off, vcc_lo offset:-1 ; encoding: [0xff,0x5f,0x54,0xdc,0x00,0x00,0xea,0x06]
+0xff,0x5f,0x54,0xdc,0x00,0x00,0xea,0x06
+
+# GFX90A: scratch_load_dwordx2 a[6:7], off, vcc_hi offset:-1 ; encoding: [0xff,0x5f,0x54,0xdc,0x00,0x00,0xeb,0x06]
+0xff,0x5f,0x54,0xdc,0x00,0x00,0xeb,0x06
+
+# GFX90A: scratch_load_dwordx2 a[6:7], v0, off offset:-1 ; encoding: [0xff,0x5f,0x54,0xdc,0x00,0x00,0xff,0x06]
+0xff,0x5f,0x54,0xdc,0x00,0x00,0xff,0x06
+
+# GFX90A: scratch_load_dwordx2 a[6:7], off, s2 ; encoding: [0x00,0x40,0x54,0xdc,0x00,0x00,0x82,0x06]
+0x00,0x40,0x54,0xdc,0x00,0x00,0x82,0x06
+
+# GFX90A: scratch_load_dwordx2 a[6:7], off, s2 ; encoding: [0x00,0x40,0x54,0xdc,0x00,0x00,0x82,0x06]
+0x00,0x40,0x54,0xdc,0x00,0x00,0x82,0x06
+
+# GFX90A: scratch_load_dwordx2 a[6:7], off, s2 offset:4095 ; encoding: [0xff,0x4f,0x54,0xdc,0x00,0x00,0x82,0x06]
+0xff,0x4f,0x54,0xdc,0x00,0x00,0x82,0x06
+
+# GFX90A: scratch_load_dwordx2 a[6:7], off, s2 offset:-4096 ; encoding: [0x00,0x50,0x54,0xdc,0x00,0x00,0x82,0x06]
+0x00,0x50,0x54,0xdc,0x00,0x00,0x82,0x06
+
+# GFX90A: scratch_load_dwordx2 a[6:7], off, s2 offset:-1 glc ; encoding: [0xff,0x5f,0x55,0xdc,0x00,0x00,0x82,0x06]
+0xff,0x5f,0x55,0xdc,0x00,0x00,0x82,0x06
+
+# GFX90A: scratch_load_dwordx2 a[6:7], off, s2 offset:-1 slc ; encoding: [0xff,0x5f,0x56,0xdc,0x00,0x00,0x82,0x06]
+0xff,0x5f,0x56,0xdc,0x00,0x00,0x82,0x06
+
+# GFX90A: scratch_load_dwordx3 a[6:8], off, s2 offset:-1 ; encoding: [0xff,0x5f,0x58,0xdc,0x00,0x00,0x82,0x06]
+0xff,0x5f,0x58,0xdc,0x00,0x00,0x82,0x06
+
+# GFX90A: scratch_load_dwordx3 a[252:254], off, s2 offset:-1 ; encoding: [0xff,0x5f,0x58,0xdc,0x00,0x00,0x82,0xfc]
+0xff,0x5f,0x58,0xdc,0x00,0x00,0x82,0xfc
+
+# GFX90A: scratch_load_dwordx3 a[6:8], off, s101 offset:-1 ; encoding: [0xff,0x5f,0x58,0xdc,0x00,0x00,0xe5,0x06]
+0xff,0x5f,0x58,0xdc,0x00,0x00,0xe5,0x06
+
+# GFX90A: scratch_load_dwordx3 a[6:8], off, flat_scratch_lo offset:-1 ; encoding: [0xff,0x5f,0x58,0xdc,0x00,0x00,0xe6,0x06]
+0xff,0x5f,0x58,0xdc,0x00,0x00,0xe6,0x06
+
+# GFX90A: scratch_load_dwordx3 a[6:8], off, flat_scratch_hi offset:-1 ; encoding: [0xff,0x5f,0x58,0xdc,0x00,0x00,0xe7,0x06]
+0xff,0x5f,0x58,0xdc,0x00,0x00,0xe7,0x06
+
+# GFX90A: scratch_load_dwordx3 a[6:8], off, vcc_lo offset:-1 ; encoding: [0xff,0x5f,0x58,0xdc,0x00,0x00,0xea,0x06]
+0xff,0x5f,0x58,0xdc,0x00,0x00,0xea,0x06
+
+# GFX90A: scratch_load_dwordx3 a[6:8], off, vcc_hi offset:-1 ; encoding: [0xff,0x5f,0x58,0xdc,0x00,0x00,0xeb,0x06]
+0xff,0x5f,0x58,0xdc,0x00,0x00,0xeb,0x06
+
+# GFX90A: scratch_load_dwordx3 a[6:8], v0, off offset:-1 ; encoding: [0xff,0x5f,0x58,0xdc,0x00,0x00,0xff,0x06]
+0xff,0x5f,0x58,0xdc,0x00,0x00,0xff,0x06
+
+# GFX90A: scratch_load_dwordx3 a[6:8], off, s2 ; encoding: [0x00,0x40,0x58,0xdc,0x00,0x00,0x82,0x06]
+0x00,0x40,0x58,0xdc,0x00,0x00,0x82,0x06
+
+# GFX90A: scratch_load_dwordx3 a[6:8], off, s2 ; encoding: [0x00,0x40,0x58,0xdc,0x00,0x00,0x82,0x06]
+0x00,0x40,0x58,0xdc,0x00,0x00,0x82,0x06
+
+# GFX90A: scratch_load_dwordx3 a[6:8], off, s2 offset:4095 ; encoding: [0xff,0x4f,0x58,0xdc,0x00,0x00,0x82,0x06]
+0xff,0x4f,0x58,0xdc,0x00,0x00,0x82,0x06
+
+# GFX90A: scratch_load_dwordx3 a[6:8], off, s2 offset:-4096 ; encoding: [0x00,0x50,0x58,0xdc,0x00,0x00,0x82,0x06]
+0x00,0x50,0x58,0xdc,0x00,0x00,0x82,0x06
+
+# GFX90A: scratch_load_dwordx3 a[6:8], off, s2 offset:-1 glc ; encoding: [0xff,0x5f,0x59,0xdc,0x00,0x00,0x82,0x06]
+0xff,0x5f,0x59,0xdc,0x00,0x00,0x82,0x06
+
+# GFX90A: scratch_load_dwordx3 a[6:8], off, s2 offset:-1 slc ; encoding: [0xff,0x5f,0x5a,0xdc,0x00,0x00,0x82,0x06]
+0xff,0x5f,0x5a,0xdc,0x00,0x00,0x82,0x06
+
+# GFX90A: scratch_load_dwordx4 a[6:9], off, s2 offset:-1 ; encoding: [0xff,0x5f,0x5c,0xdc,0x00,0x00,0x82,0x06]
+0xff,0x5f,0x5c,0xdc,0x00,0x00,0x82,0x06
+
+# GFX90A: scratch_load_dwordx4 a[252:255], off, s2 offset:-1 ; encoding: [0xff,0x5f,0x5c,0xdc,0x00,0x00,0x82,0xfc]
+0xff,0x5f,0x5c,0xdc,0x00,0x00,0x82,0xfc
+
+# GFX90A: scratch_load_dwordx4 a[6:9], off, s101 offset:-1 ; encoding: [0xff,0x5f,0x5c,0xdc,0x00,0x00,0xe5,0x06]
+0xff,0x5f,0x5c,0xdc,0x00,0x00,0xe5,0x06
+
+# GFX90A: scratch_load_dwordx4 a[6:9], off, flat_scratch_lo offset:-1 ; encoding: [0xff,0x5f,0x5c,0xdc,0x00,0x00,0xe6,0x06]
+0xff,0x5f,0x5c,0xdc,0x00,0x00,0xe6,0x06
+
+# GFX90A: scratch_load_dwordx4 a[6:9], off, flat_scratch_hi offset:-1 ; encoding: [0xff,0x5f,0x5c,0xdc,0x00,0x00,0xe7,0x06]
+0xff,0x5f,0x5c,0xdc,0x00,0x00,0xe7,0x06
+
+# GFX90A: scratch_load_dwordx4 a[6:9], off, vcc_lo offset:-1 ; encoding: [0xff,0x5f,0x5c,0xdc,0x00,0x00,0xea,0x06]
+0xff,0x5f,0x5c,0xdc,0x00,0x00,0xea,0x06
+
+# GFX90A: scratch_load_dwordx4 a[6:9], off, vcc_hi offset:-1 ; encoding: [0xff,0x5f,0x5c,0xdc,0x00,0x00,0xeb,0x06]
+0xff,0x5f,0x5c,0xdc,0x00,0x00,0xeb,0x06
+
+# GFX90A: scratch_load_dwordx4 a[6:9], v0, off offset:-1 ; encoding: [0xff,0x5f,0x5c,0xdc,0x00,0x00,0xff,0x06]
+0xff,0x5f,0x5c,0xdc,0x00,0x00,0xff,0x06
+
+# GFX90A: scratch_load_dwordx4 a[6:9], off, s2 ; encoding: [0x00,0x40,0x5c,0xdc,0x00,0x00,0x82,0x06]
+0x00,0x40,0x5c,0xdc,0x00,0x00,0x82,0x06
+
+# GFX90A: scratch_load_dwordx4 a[6:9], off, s2 ; encoding: [0x00,0x40,0x5c,0xdc,0x00,0x00,0x82,0x06]
+0x00,0x40,0x5c,0xdc,0x00,0x00,0x82,0x06
+
+# GFX90A: scratch_load_dwordx4 a[6:9], off, s2 offset:4095 ; encoding: [0xff,0x4f,0x5c,0xdc,0x00,0x00,0x82,0x06]
+0xff,0x4f,0x5c,0xdc,0x00,0x00,0x82,0x06
+
+# GFX90A: scratch_load_dwordx4 a[6:9], off, s2 offset:-4096 ; encoding: [0x00,0x50,0x5c,0xdc,0x00,0x00,0x82,0x06]
+0x00,0x50,0x5c,0xdc,0x00,0x00,0x82,0x06
+
+# GFX90A: scratch_load_dwordx4 a[6:9], off, s2 offset:-1 glc ; encoding: [0xff,0x5f,0x5d,0xdc,0x00,0x00,0x82,0x06]
+0xff,0x5f,0x5d,0xdc,0x00,0x00,0x82,0x06
+
+# GFX90A: scratch_load_dwordx4 a[6:9], off, s2 offset:-1 slc ; encoding: [0xff,0x5f,0x5e,0xdc,0x00,0x00,0x82,0x06]
+0xff,0x5f,0x5e,0xdc,0x00,0x00,0x82,0x06
+
+# GFX90A: scratch_store_byte off, a2, s3 offset:-1 ; encoding: [0xff,0x5f,0x60,0xdc,0x00,0x02,0x83,0x00]
+0xff,0x5f,0x60,0xdc,0x00,0x02,0x83,0x00
+
+# GFX90A: scratch_store_byte off, a255, s3 offset:-1 ; encoding: [0xff,0x5f,0x60,0xdc,0x00,0xff,0x83,0x00]
+0xff,0x5f,0x60,0xdc,0x00,0xff,0x83,0x00
+
+# GFX90A: scratch_store_byte off, a2, s101 offset:-1 ; encoding: [0xff,0x5f,0x60,0xdc,0x00,0x02,0xe5,0x00]
+0xff,0x5f,0x60,0xdc,0x00,0x02,0xe5,0x00
+
+# GFX90A: scratch_store_byte off, a2, flat_scratch_lo offset:-1 ; encoding: [0xff,0x5f,0x60,0xdc,0x00,0x02,0xe6,0x00]
+0xff,0x5f,0x60,0xdc,0x00,0x02,0xe6,0x00
+
+# GFX90A: scratch_store_byte off, a2, flat_scratch_hi offset:-1 ; encoding: [0xff,0x5f,0x60,0xdc,0x00,0x02,0xe7,0x00]
+0xff,0x5f,0x60,0xdc,0x00,0x02,0xe7,0x00
+
+# GFX90A: scratch_store_byte off, a2, vcc_lo offset:-1 ; encoding: [0xff,0x5f,0x60,0xdc,0x00,0x02,0xea,0x00]
+0xff,0x5f,0x60,0xdc,0x00,0x02,0xea,0x00
+
+# GFX90A: scratch_store_byte off, a2, vcc_hi offset:-1 ; encoding: [0xff,0x5f,0x60,0xdc,0x00,0x02,0xeb,0x00]
+0xff,0x5f,0x60,0xdc,0x00,0x02,0xeb,0x00
+
+# GFX90A: scratch_store_byte v0, a2, off offset:-1 ; encoding: [0xff,0x5f,0x60,0xdc,0x00,0x02,0xff,0x00]
+0xff,0x5f,0x60,0xdc,0x00,0x02,0xff,0x00
+
+# GFX90A: scratch_store_byte off, a2, s3  ; encoding: [0x00,0x40,0x60,0xdc,0x00,0x02,0x83,0x00]
+0x00,0x40,0x60,0xdc,0x00,0x02,0x83,0x00
+
+# GFX90A: scratch_store_byte off, a2, s3  ; encoding: [0x00,0x40,0x60,0xdc,0x00,0x02,0x83,0x00]
+0x00,0x40,0x60,0xdc,0x00,0x02,0x83,0x00
+
+# GFX90A: scratch_store_byte off, a2, s3 offset:4095 ; encoding: [0xff,0x4f,0x60,0xdc,0x00,0x02,0x83,0x00]
+0xff,0x4f,0x60,0xdc,0x00,0x02,0x83,0x00
+
+# GFX90A: scratch_store_byte off, a2, s3 offset:-4096 ; encoding: [0x00,0x50,0x60,0xdc,0x00,0x02,0x83,0x00]
+0x00,0x50,0x60,0xdc,0x00,0x02,0x83,0x00
+
+# GFX90A: scratch_store_byte off, a2, s3 offset:-1 glc ; encoding: [0xff,0x5f,0x61,0xdc,0x00,0x02,0x83,0x00]
+0xff,0x5f,0x61,0xdc,0x00,0x02,0x83,0x00
+
+# GFX90A: scratch_store_byte off, a2, s3 offset:-1 slc ; encoding: [0xff,0x5f,0x62,0xdc,0x00,0x02,0x83,0x00]
+0xff,0x5f,0x62,0xdc,0x00,0x02,0x83,0x00
+
+# GFX90A: scratch_store_byte_d16_hi off, a2, s3 offset:-1 ; encoding: [0xff,0x5f,0x64,0xdc,0x00,0x02,0x83,0x00]
+0xff,0x5f,0x64,0xdc,0x00,0x02,0x83,0x00
+
+# GFX90A: scratch_store_byte_d16_hi off, a255, s3 offset:-1 ; encoding: [0xff,0x5f,0x64,0xdc,0x00,0xff,0x83,0x00]
+0xff,0x5f,0x64,0xdc,0x00,0xff,0x83,0x00
+
+# GFX90A: scratch_store_byte_d16_hi off, a2, s101 offset:-1 ; encoding: [0xff,0x5f,0x64,0xdc,0x00,0x02,0xe5,0x00]
+0xff,0x5f,0x64,0xdc,0x00,0x02,0xe5,0x00
+
+# GFX90A: scratch_store_byte_d16_hi off, a2, flat_scratch_lo offset:-1 ; encoding: [0xff,0x5f,0x64,0xdc,0x00,0x02,0xe6,0x00]
+0xff,0x5f,0x64,0xdc,0x00,0x02,0xe6,0x00
+
+# GFX90A: scratch_store_byte_d16_hi off, a2, flat_scratch_hi offset:-1 ; encoding: [0xff,0x5f,0x64,0xdc,0x00,0x02,0xe7,0x00]
+0xff,0x5f,0x64,0xdc,0x00,0x02,0xe7,0x00
+
+# GFX90A: scratch_store_byte_d16_hi off, a2, vcc_lo offset:-1 ; encoding: [0xff,0x5f,0x64,0xdc,0x00,0x02,0xea,0x00]
+0xff,0x5f,0x64,0xdc,0x00,0x02,0xea,0x00
+
+# GFX90A: scratch_store_byte_d16_hi off, a2, vcc_hi offset:-1 ; encoding: [0xff,0x5f,0x64,0xdc,0x00,0x02,0xeb,0x00]
+0xff,0x5f,0x64,0xdc,0x00,0x02,0xeb,0x00
+
+# GFX90A: scratch_store_byte_d16_hi v0, a2, off offset:-1 ; encoding: [0xff,0x5f,0x64,0xdc,0x00,0x02,0xff,0x00]
+0xff,0x5f,0x64,0xdc,0x00,0x02,0xff,0x00
+
+# GFX90A: scratch_store_byte_d16_hi off, a2, s3 ; encoding: [0x00,0x40,0x64,0xdc,0x00,0x02,0x83,0x00]
+0x00,0x40,0x64,0xdc,0x00,0x02,0x83,0x00
+
+# GFX90A: scratch_store_byte_d16_hi off, a2, s3 ; encoding: [0x00,0x40,0x64,0xdc,0x00,0x02,0x83,0x00]
+0x00,0x40,0x64,0xdc,0x00,0x02,0x83,0x00
+
+# GFX90A: scratch_store_byte_d16_hi off, a2, s3 offset:4095 ; encoding: [0xff,0x4f,0x64,0xdc,0x00,0x02,0x83,0x00]
+0xff,0x4f,0x64,0xdc,0x00,0x02,0x83,0x00
+
+# GFX90A: scratch_store_byte_d16_hi off, a2, s3 offset:-4096 ; encoding: [0x00,0x50,0x64,0xdc,0x00,0x02,0x83,0x00]
+0x00,0x50,0x64,0xdc,0x00,0x02,0x83,0x00
+
+# GFX90A: scratch_store_byte_d16_hi off, a2, s3 offset:-1 glc ; encoding: [0xff,0x5f,0x65,0xdc,0x00,0x02,0x83,0x00]
+0xff,0x5f,0x65,0xdc,0x00,0x02,0x83,0x00
+
+# GFX90A: scratch_store_byte_d16_hi off, a2, s3 offset:-1 slc ; encoding: [0xff,0x5f,0x66,0xdc,0x00,0x02,0x83,0x00]
+0xff,0x5f,0x66,0xdc,0x00,0x02,0x83,0x00
+
+# GFX90A: scratch_store_short off, a2, s3 offset:-1 ; encoding: [0xff,0x5f,0x68,0xdc,0x00,0x02,0x83,0x00]
+0xff,0x5f,0x68,0xdc,0x00,0x02,0x83,0x00
+
+# GFX90A: scratch_store_short off, a255, s3 offset:-1 ; encoding: [0xff,0x5f,0x68,0xdc,0x00,0xff,0x83,0x00]
+0xff,0x5f,0x68,0xdc,0x00,0xff,0x83,0x00
+
+# GFX90A: scratch_store_short off, a2, s101 offset:-1 ; encoding: [0xff,0x5f,0x68,0xdc,0x00,0x02,0xe5,0x00]
+0xff,0x5f,0x68,0xdc,0x00,0x02,0xe5,0x00
+
+# GFX90A: scratch_store_short off, a2, flat_scratch_lo offset:-1 ; encoding: [0xff,0x5f,0x68,0xdc,0x00,0x02,0xe6,0x00]
+0xff,0x5f,0x68,0xdc,0x00,0x02,0xe6,0x00
+
+# GFX90A: scratch_store_short off, a2, flat_scratch_hi offset:-1 ; encoding: [0xff,0x5f,0x68,0xdc,0x00,0x02,0xe7,0x00]
+0xff,0x5f,0x68,0xdc,0x00,0x02,0xe7,0x00
+
+# GFX90A: scratch_store_short off, a2, vcc_lo offset:-1 ; encoding: [0xff,0x5f,0x68,0xdc,0x00,0x02,0xea,0x00]
+0xff,0x5f,0x68,0xdc,0x00,0x02,0xea,0x00
+
+# GFX90A: scratch_store_short off, a2, vcc_hi offset:-1 ; encoding: [0xff,0x5f,0x68,0xdc,0x00,0x02,0xeb,0x00]
+0xff,0x5f,0x68,0xdc,0x00,0x02,0xeb,0x00
+
+# GFX90A: scratch_store_short v0, a2, off offset:-1 ; encoding: [0xff,0x5f,0x68,0xdc,0x00,0x02,0xff,0x00]
+0xff,0x5f,0x68,0xdc,0x00,0x02,0xff,0x00
+
+# GFX90A: scratch_store_short off, a2, s3 ; encoding: [0x00,0x40,0x68,0xdc,0x00,0x02,0x83,0x00]
+0x00,0x40,0x68,0xdc,0x00,0x02,0x83,0x00
+
+# GFX90A: scratch_store_short off, a2, s3 ; encoding: [0x00,0x40,0x68,0xdc,0x00,0x02,0x83,0x00]
+0x00,0x40,0x68,0xdc,0x00,0x02,0x83,0x00
+
+# GFX90A: scratch_store_short off, a2, s3 offset:4095 ; encoding: [0xff,0x4f,0x68,0xdc,0x00,0x02,0x83,0x00]
+0xff,0x4f,0x68,0xdc,0x00,0x02,0x83,0x00
+
+# GFX90A: scratch_store_short off, a2, s3 offset:-4096 ; encoding: [0x00,0x50,0x68,0xdc,0x00,0x02,0x83,0x00]
+0x00,0x50,0x68,0xdc,0x00,0x02,0x83,0x00
+
+# GFX90A: scratch_store_short off, a2, s3 offset:-1 glc ; encoding: [0xff,0x5f,0x69,0xdc,0x00,0x02,0x83,0x00]
+0xff,0x5f,0x69,0xdc,0x00,0x02,0x83,0x00
+
+# GFX90A: scratch_store_short off, a2, s3 offset:-1 slc ; encoding: [0xff,0x5f,0x6a,0xdc,0x00,0x02,0x83,0x00]
+0xff,0x5f,0x6a,0xdc,0x00,0x02,0x83,0x00
+
+# GFX90A: scratch_store_short_d16_hi off, a2, s3 offset:-1 ; encoding: [0xff,0x5f,0x6c,0xdc,0x00,0x02,0x83,0x00]
+0xff,0x5f,0x6c,0xdc,0x00,0x02,0x83,0x00
+
+# GFX90A: scratch_store_short_d16_hi off, a255, s3 offset:-1 ; encoding: [0xff,0x5f,0x6c,0xdc,0x00,0xff,0x83,0x00]
+0xff,0x5f,0x6c,0xdc,0x00,0xff,0x83,0x00
+
+# GFX90A: scratch_store_short_d16_hi off, a2, s101 offset:-1 ; encoding: [0xff,0x5f,0x6c,0xdc,0x00,0x02,0xe5,0x00]
+0xff,0x5f,0x6c,0xdc,0x00,0x02,0xe5,0x00
+
+# GFX90A: scratch_store_short_d16_hi off, a2, flat_scratch_lo offset:-1 ; encoding: [0xff,0x5f,0x6c,0xdc,0x00,0x02,0xe6,0x00]
+0xff,0x5f,0x6c,0xdc,0x00,0x02,0xe6,0x00
+
+# GFX90A: scratch_store_short_d16_hi off, a2, flat_scratch_hi offset:-1 ; encoding: [0xff,0x5f,0x6c,0xdc,0x00,0x02,0xe7,0x00]
+0xff,0x5f,0x6c,0xdc,0x00,0x02,0xe7,0x00
+
+# GFX90A: scratch_store_short_d16_hi off, a2, vcc_lo offset:-1 ; encoding: [0xff,0x5f,0x6c,0xdc,0x00,0x02,0xea,0x00]
+0xff,0x5f,0x6c,0xdc,0x00,0x02,0xea,0x00
+
+# GFX90A: scratch_store_short_d16_hi off, a2, vcc_hi offset:-1 ; encoding: [0xff,0x5f,0x6c,0xdc,0x00,0x02,0xeb,0x00]
+0xff,0x5f,0x6c,0xdc,0x00,0x02,0xeb,0x00
+
+# GFX90A: scratch_store_short_d16_hi v0, a2, off offset:-1 ; encoding: [0xff,0x5f,0x6c,0xdc,0x00,0x02,0xff,0x00]
+0xff,0x5f,0x6c,0xdc,0x00,0x02,0xff,0x00
+
+# GFX90A: scratch_store_short_d16_hi off, a2, s3 ; encoding: [0x00,0x40,0x6c,0xdc,0x00,0x02,0x83,0x00]
+0x00,0x40,0x6c,0xdc,0x00,0x02,0x83,0x00
+
+# GFX90A: scratch_store_short_d16_hi off, a2, s3 ; encoding: [0x00,0x40,0x6c,0xdc,0x00,0x02,0x83,0x00]
+0x00,0x40,0x6c,0xdc,0x00,0x02,0x83,0x00
+
+# GFX90A: scratch_store_short_d16_hi off, a2, s3 offset:4095 ; encoding: [0xff,0x4f,0x6c,0xdc,0x00,0x02,0x83,0x00]
+0xff,0x4f,0x6c,0xdc,0x00,0x02,0x83,0x00
+
+# GFX90A: scratch_store_short_d16_hi off, a2, s3 offset:-4096 ; encoding: [0x00,0x50,0x6c,0xdc,0x00,0x02,0x83,0x00]
+0x00,0x50,0x6c,0xdc,0x00,0x02,0x83,0x00
+
+# GFX90A: scratch_store_short_d16_hi off, a2, s3 offset:-1 glc ; encoding: [0xff,0x5f,0x6d,0xdc,0x00,0x02,0x83,0x00]
+0xff,0x5f,0x6d,0xdc,0x00,0x02,0x83,0x00
+
+# GFX90A: scratch_store_short_d16_hi off, a2, s3 offset:-1 slc ; encoding: [0xff,0x5f,0x6e,0xdc,0x00,0x02,0x83,0x00]
+0xff,0x5f,0x6e,0xdc,0x00,0x02,0x83,0x00
+
+# GFX90A: scratch_store_dword off, a2, s3 offset:-1 ; encoding: [0xff,0x5f,0x70,0xdc,0x00,0x02,0x83,0x00]
+0xff,0x5f,0x70,0xdc,0x00,0x02,0x83,0x00
+
+# GFX90A: scratch_store_dword off, a255, s3 offset:-1 ; encoding: [0xff,0x5f,0x70,0xdc,0x00,0xff,0x83,0x00]
+0xff,0x5f,0x70,0xdc,0x00,0xff,0x83,0x00
+
+# GFX90A: scratch_store_dword off, a2, s101 offset:-1 ; encoding: [0xff,0x5f,0x70,0xdc,0x00,0x02,0xe5,0x00]
+0xff,0x5f,0x70,0xdc,0x00,0x02,0xe5,0x00
+
+# GFX90A: scratch_store_dword off, a2, flat_scratch_lo offset:-1 ; encoding: [0xff,0x5f,0x70,0xdc,0x00,0x02,0xe6,0x00]
+0xff,0x5f,0x70,0xdc,0x00,0x02,0xe6,0x00
+
+# GFX90A: scratch_store_dword off, a2, flat_scratch_hi offset:-1 ; encoding: [0xff,0x5f,0x70,0xdc,0x00,0x02,0xe7,0x00]
+0xff,0x5f,0x70,0xdc,0x00,0x02,0xe7,0x00
+
+# GFX90A: scratch_store_dword off, a2, vcc_lo offset:-1 ; encoding: [0xff,0x5f,0x70,0xdc,0x00,0x02,0xea,0x00]
+0xff,0x5f,0x70,0xdc,0x00,0x02,0xea,0x00
+
+# GFX90A: scratch_store_dword off, a2, vcc_hi offset:-1 ; encoding: [0xff,0x5f,0x70,0xdc,0x00,0x02,0xeb,0x00]
+0xff,0x5f,0x70,0xdc,0x00,0x02,0xeb,0x00
+
+# GFX90A: scratch_store_dword v0, a2, off offset:-1 ; encoding: [0xff,0x5f,0x70,0xdc,0x00,0x02,0xff,0x00]
+0xff,0x5f,0x70,0xdc,0x00,0x02,0xff,0x00
+
+# GFX90A: scratch_store_dword off, a2, s3 ; encoding: [0x00,0x40,0x70,0xdc,0x00,0x02,0x83,0x00]
+0x00,0x40,0x70,0xdc,0x00,0x02,0x83,0x00
+
+# GFX90A: scratch_store_dword off, a2, s3 ; encoding: [0x00,0x40,0x70,0xdc,0x00,0x02,0x83,0x00]
+0x00,0x40,0x70,0xdc,0x00,0x02,0x83,0x00
+
+# GFX90A: scratch_store_dword off, a2, s3 offset:4095 ; encoding: [0xff,0x4f,0x70,0xdc,0x00,0x02,0x83,0x00]
+0xff,0x4f,0x70,0xdc,0x00,0x02,0x83,0x00
+
+# GFX90A: scratch_store_dword off, a2, s3 offset:-4096 ; encoding: [0x00,0x50,0x70,0xdc,0x00,0x02,0x83,0x00]
+0x00,0x50,0x70,0xdc,0x00,0x02,0x83,0x00
+
+# GFX90A: scratch_store_dword off, a2, s3 offset:-1 glc ; encoding: [0xff,0x5f,0x71,0xdc,0x00,0x02,0x83,0x00]
+0xff,0x5f,0x71,0xdc,0x00,0x02,0x83,0x00
+
+# GFX90A: scratch_store_dword off, a2, s3 offset:-1 slc ; encoding: [0xff,0x5f,0x72,0xdc,0x00,0x02,0x83,0x00]
+0xff,0x5f,0x72,0xdc,0x00,0x02,0x83,0x00
+
+# GFX90A: scratch_store_dwordx2 off, a[2:3], s3 offset:-1 ; encoding: [0xff,0x5f,0x74,0xdc,0x00,0x02,0x83,0x00]
+0xff,0x5f,0x74,0xdc,0x00,0x02,0x83,0x00
+
+# GFX90A: scratch_store_dwordx2 off, a[254:255], s3 offset:-1 ; encoding: [0xff,0x5f,0x74,0xdc,0x00,0xfe,0x83,0x00]
+0xff,0x5f,0x74,0xdc,0x00,0xfe,0x83,0x00
+
+# GFX90A: scratch_store_dwordx2 off, a[2:3], s101 offset:-1 ; encoding: [0xff,0x5f,0x74,0xdc,0x00,0x02,0xe5,0x00]
+0xff,0x5f,0x74,0xdc,0x00,0x02,0xe5,0x00
+
+# GFX90A: scratch_store_dwordx2 off, a[2:3], flat_scratch_lo offset:-1 ; encoding: [0xff,0x5f,0x74,0xdc,0x00,0x02,0xe6,0x00]
+0xff,0x5f,0x74,0xdc,0x00,0x02,0xe6,0x00
+
+# GFX90A: scratch_store_dwordx2 off, a[2:3], flat_scratch_hi offset:-1 ; encoding: [0xff,0x5f,0x74,0xdc,0x00,0x02,0xe7,0x00]
+0xff,0x5f,0x74,0xdc,0x00,0x02,0xe7,0x00
+
+# GFX90A: scratch_store_dwordx2 off, a[2:3], vcc_lo offset:-1 ; encoding: [0xff,0x5f,0x74,0xdc,0x00,0x02,0xea,0x00]
+0xff,0x5f,0x74,0xdc,0x00,0x02,0xea,0x00
+
+# GFX90A: scratch_store_dwordx2 off, a[2:3], vcc_hi offset:-1 ; encoding: [0xff,0x5f,0x74,0xdc,0x00,0x02,0xeb,0x00]
+0xff,0x5f,0x74,0xdc,0x00,0x02,0xeb,0x00
+
+# GFX90A: scratch_store_dwordx2 v0, a[2:3], off offset:-1 ; encoding: [0xff,0x5f,0x74,0xdc,0x00,0x02,0xff,0x00]
+0xff,0x5f,0x74,0xdc,0x00,0x02,0xff,0x00
+
+# GFX90A: scratch_store_dwordx2 off, a[2:3], s3 ; encoding: [0x00,0x40,0x74,0xdc,0x00,0x02,0x83,0x00]
+0x00,0x40,0x74,0xdc,0x00,0x02,0x83,0x00
+
+# GFX90A: scratch_store_dwordx2 off, a[2:3], s3 ; encoding: [0x00,0x40,0x74,0xdc,0x00,0x02,0x83,0x00]
+0x00,0x40,0x74,0xdc,0x00,0x02,0x83,0x00
+
+# GFX90A: scratch_store_dwordx2 off, a[2:3], s3 offset:4095 ; encoding: [0xff,0x4f,0x74,0xdc,0x00,0x02,0x83,0x00]
+0xff,0x4f,0x74,0xdc,0x00,0x02,0x83,0x00
+
+# GFX90A: scratch_store_dwordx2 off, a[2:3], s3 offset:-4096 ; encoding: [0x00,0x50,0x74,0xdc,0x00,0x02,0x83,0x00]
+0x00,0x50,0x74,0xdc,0x00,0x02,0x83,0x00
+
+# GFX90A: scratch_store_dwordx2 off, a[2:3], s3 offset:-1 glc ; encoding: [0xff,0x5f,0x75,0xdc,0x00,0x02,0x83,0x00]
+0xff,0x5f,0x75,0xdc,0x00,0x02,0x83,0x00
+
+# GFX90A: scratch_store_dwordx2 off, a[2:3], s3 offset:-1 slc ; encoding: [0xff,0x5f,0x76,0xdc,0x00,0x02,0x83,0x00]
+0xff,0x5f,0x76,0xdc,0x00,0x02,0x83,0x00
+
+# GFX90A: scratch_store_dwordx3 off, a[2:4], s3 offset:-1 ; encoding: [0xff,0x5f,0x78,0xdc,0x00,0x02,0x83,0x00]
+0xff,0x5f,0x78,0xdc,0x00,0x02,0x83,0x00
+
+# GFX90A: scratch_store_dwordx3 off, a[252:254], s3 offset:-1 ; encoding: [0xff,0x5f,0x78,0xdc,0x00,0xfc,0x83,0x00]
+0xff,0x5f,0x78,0xdc,0x00,0xfc,0x83,0x00
+
+# GFX90A: scratch_store_dwordx3 off, a[2:4], s101 offset:-1 ; encoding: [0xff,0x5f,0x78,0xdc,0x00,0x02,0xe5,0x00]
+0xff,0x5f,0x78,0xdc,0x00,0x02,0xe5,0x00
+
+# GFX90A: scratch_store_dwordx3 off, a[2:4], flat_scratch_lo offset:-1 ; encoding: [0xff,0x5f,0x78,0xdc,0x00,0x02,0xe6,0x00]
+0xff,0x5f,0x78,0xdc,0x00,0x02,0xe6,0x00
+
+# GFX90A: scratch_store_dwordx3 off, a[2:4], flat_scratch_hi offset:-1 ; encoding: [0xff,0x5f,0x78,0xdc,0x00,0x02,0xe7,0x00]
+0xff,0x5f,0x78,0xdc,0x00,0x02,0xe7,0x00
+
+# GFX90A: scratch_store_dwordx3 off, a[2:4], vcc_lo offset:-1 ; encoding: [0xff,0x5f,0x78,0xdc,0x00,0x02,0xea,0x00]
+0xff,0x5f,0x78,0xdc,0x00,0x02,0xea,0x00
+
+# GFX90A: scratch_store_dwordx3 off, a[2:4], vcc_hi offset:-1 ; encoding: [0xff,0x5f,0x78,0xdc,0x00,0x02,0xeb,0x00]
+0xff,0x5f,0x78,0xdc,0x00,0x02,0xeb,0x00
+
+# GFX90A: scratch_store_dwordx3 v0, a[2:4], off offset:-1 ; encoding: [0xff,0x5f,0x78,0xdc,0x00,0x02,0xff,0x00]
+0xff,0x5f,0x78,0xdc,0x00,0x02,0xff,0x00
+
+# GFX90A: scratch_store_dwordx3 off, a[2:4], s3 ; encoding: [0x00,0x40,0x78,0xdc,0x00,0x02,0x83,0x00]
+0x00,0x40,0x78,0xdc,0x00,0x02,0x83,0x00
+
+# GFX90A: scratch_store_dwordx3 off, a[2:4], s3 ; encoding: [0x00,0x40,0x78,0xdc,0x00,0x02,0x83,0x00]
+0x00,0x40,0x78,0xdc,0x00,0x02,0x83,0x00
+
+# GFX90A: scratch_store_dwordx3 off, a[2:4], s3 offset:4095 ; encoding: [0xff,0x4f,0x78,0xdc,0x00,0x02,0x83,0x00]
+0xff,0x4f,0x78,0xdc,0x00,0x02,0x83,0x00
+
+# GFX90A: scratch_store_dwordx3 off, a[2:4], s3 offset:-4096 ; encoding: [0x00,0x50,0x78,0xdc,0x00,0x02,0x83,0x00]
+0x00,0x50,0x78,0xdc,0x00,0x02,0x83,0x00
+
+# GFX90A: scratch_store_dwordx3 off, a[2:4], s3 offset:-1 glc ; encoding: [0xff,0x5f,0x79,0xdc,0x00,0x02,0x83,0x00]
+0xff,0x5f,0x79,0xdc,0x00,0x02,0x83,0x00
+
+# GFX90A: scratch_store_dwordx3 off, a[2:4], s3 offset:-1 slc ; encoding: [0xff,0x5f,0x7a,0xdc,0x00,0x02,0x83,0x00]
+0xff,0x5f,0x7a,0xdc,0x00,0x02,0x83,0x00
+
+# GFX90A: scratch_store_dwordx4 off, a[2:5], s3 offset:-1 ; encoding: [0xff,0x5f,0x7c,0xdc,0x00,0x02,0x83,0x00]
+0xff,0x5f,0x7c,0xdc,0x00,0x02,0x83,0x00
+
+# GFX90A: scratch_store_dwordx4 off, a[252:255], s3 offset:-1 ; encoding: [0xff,0x5f,0x7c,0xdc,0x00,0xfc,0x83,0x00]
+0xff,0x5f,0x7c,0xdc,0x00,0xfc,0x83,0x00
+
+# GFX90A: scratch_store_dwordx4 off, a[2:5], s101 offset:-1 ; encoding: [0xff,0x5f,0x7c,0xdc,0x00,0x02,0xe5,0x00]
+0xff,0x5f,0x7c,0xdc,0x00,0x02,0xe5,0x00
+
+# GFX90A: scratch_store_dwordx4 off, a[2:5], flat_scratch_lo offset:-1 ; encoding: [0xff,0x5f,0x7c,0xdc,0x00,0x02,0xe6,0x00]
+0xff,0x5f,0x7c,0xdc,0x00,0x02,0xe6,0x00
+
+# GFX90A: scratch_store_dwordx4 off, a[2:5], flat_scratch_hi offset:-1 ; encoding: [0xff,0x5f,0x7c,0xdc,0x00,0x02,0xe7,0x00]
+0xff,0x5f,0x7c,0xdc,0x00,0x02,0xe7,0x00
+
+# GFX90A: scratch_store_dwordx4 off, a[2:5], vcc_lo offset:-1 ; encoding: [0xff,0x5f,0x7c,0xdc,0x00,0x02,0xea,0x00]
+0xff,0x5f,0x7c,0xdc,0x00,0x02,0xea,0x00
+
+# GFX90A: scratch_store_dwordx4 off, a[2:5], vcc_hi offset:-1 ; encoding: [0xff,0x5f,0x7c,0xdc,0x00,0x02,0xeb,0x00]
+0xff,0x5f,0x7c,0xdc,0x00,0x02,0xeb,0x00
+
+# GFX90A: scratch_store_dwordx4 v0, a[2:5], off offset:-1 ; encoding: [0xff,0x5f,0x7c,0xdc,0x00,0x02,0xff,0x00]
+0xff,0x5f,0x7c,0xdc,0x00,0x02,0xff,0x00
+
+# GFX90A: scratch_store_dwordx4 off, a[2:5], s3 ; encoding: [0x00,0x40,0x7c,0xdc,0x00,0x02,0x83,0x00]
+0x00,0x40,0x7c,0xdc,0x00,0x02,0x83,0x00
+
+# GFX90A: scratch_store_dwordx4 off, a[2:5], s3 ; encoding: [0x00,0x40,0x7c,0xdc,0x00,0x02,0x83,0x00]
+0x00,0x40,0x7c,0xdc,0x00,0x02,0x83,0x00
+
+# GFX90A: scratch_store_dwordx4 off, a[2:5], s3 offset:4095 ; encoding: [0xff,0x4f,0x7c,0xdc,0x00,0x02,0x83,0x00]
+0xff,0x4f,0x7c,0xdc,0x00,0x02,0x83,0x00
+
+# GFX90A: scratch_store_dwordx4 off, a[2:5], s3 offset:-4096 ; encoding: [0x00,0x50,0x7c,0xdc,0x00,0x02,0x83,0x00]
+0x00,0x50,0x7c,0xdc,0x00,0x02,0x83,0x00
+
+# GFX90A: scratch_store_dwordx4 off, a[2:5], s3 offset:-1 glc ; encoding: [0xff,0x5f,0x7d,0xdc,0x00,0x02,0x83,0x00]
+0xff,0x5f,0x7d,0xdc,0x00,0x02,0x83,0x00
+
+# GFX90A: scratch_store_dwordx4 off, a[2:5], s3 offset:-1 slc ; encoding: [0xff,0x5f,0x7e,0xdc,0x00,0x02,0x83,0x00]
+0xff,0x5f,0x7e,0xdc,0x00,0x02,0x83,0x00
+
+# GFX90A: scratch_load_ubyte_d16 a5, off, s2 offset:-1 ; encoding: [0xff,0x5f,0x80,0xdc,0x00,0x00,0x82,0x05]
+0xff,0x5f,0x80,0xdc,0x00,0x00,0x82,0x05
+
+# GFX90A: scratch_load_ubyte_d16 a255, off, s2 offset:-1 ; encoding: [0xff,0x5f,0x80,0xdc,0x00,0x00,0x82,0xff]
+0xff,0x5f,0x80,0xdc,0x00,0x00,0x82,0xff
+
+# GFX90A: scratch_load_ubyte_d16 a5, off, s101 offset:-1 ; encoding: [0xff,0x5f,0x80,0xdc,0x00,0x00,0xe5,0x05]
+0xff,0x5f,0x80,0xdc,0x00,0x00,0xe5,0x05
+
+# GFX90A: scratch_load_ubyte_d16 a5, off, flat_scratch_lo offset:-1 ; encoding: [0xff,0x5f,0x80,0xdc,0x00,0x00,0xe6,0x05]
+0xff,0x5f,0x80,0xdc,0x00,0x00,0xe6,0x05
+
+# GFX90A: scratch_load_ubyte_d16 a5, off, flat_scratch_hi offset:-1 ; encoding: [0xff,0x5f,0x80,0xdc,0x00,0x00,0xe7,0x05]
+0xff,0x5f,0x80,0xdc,0x00,0x00,0xe7,0x05
+
+# GFX90A: scratch_load_ubyte_d16 a5, off, vcc_lo offset:-1 ; encoding: [0xff,0x5f,0x80,0xdc,0x00,0x00,0xea,0x05]
+0xff,0x5f,0x80,0xdc,0x00,0x00,0xea,0x05
+
+# GFX90A: scratch_load_ubyte_d16 a5, off, vcc_hi offset:-1 ; encoding: [0xff,0x5f,0x80,0xdc,0x00,0x00,0xeb,0x05]
+0xff,0x5f,0x80,0xdc,0x00,0x00,0xeb,0x05
+
+# GFX90A: scratch_load_ubyte_d16 a5, v0, off offset:-1 ; encoding: [0xff,0x5f,0x80,0xdc,0x00,0x00,0xff,0x05]
+0xff,0x5f,0x80,0xdc,0x00,0x00,0xff,0x05
+
+# GFX90A: scratch_load_ubyte_d16 a5, off, s2 ; encoding: [0x00,0x40,0x80,0xdc,0x00,0x00,0x82,0x05]
+0x00,0x40,0x80,0xdc,0x00,0x00,0x82,0x05
+
+# GFX90A: scratch_load_ubyte_d16 a5, off, s2 ; encoding: [0x00,0x40,0x80,0xdc,0x00,0x00,0x82,0x05]
+0x00,0x40,0x80,0xdc,0x00,0x00,0x82,0x05
+
+# GFX90A: scratch_load_ubyte_d16 a5, off, s2 offset:4095 ; encoding: [0xff,0x4f,0x80,0xdc,0x00,0x00,0x82,0x05]
+0xff,0x4f,0x80,0xdc,0x00,0x00,0x82,0x05
+
+# GFX90A: scratch_load_ubyte_d16 a5, off, s2 offset:-4096 ; encoding: [0x00,0x50,0x80,0xdc,0x00,0x00,0x82,0x05]
+0x00,0x50,0x80,0xdc,0x00,0x00,0x82,0x05
+
+# GFX90A: scratch_load_ubyte_d16 a5, off, s2 offset:-1 glc ; encoding: [0xff,0x5f,0x81,0xdc,0x00,0x00,0x82,0x05]
+0xff,0x5f,0x81,0xdc,0x00,0x00,0x82,0x05
+
+# GFX90A: scratch_load_ubyte_d16 a5, off, s2 offset:-1 slc ; encoding: [0xff,0x5f,0x82,0xdc,0x00,0x00,0x82,0x05]
+0xff,0x5f,0x82,0xdc,0x00,0x00,0x82,0x05
+
+# GFX90A: scratch_load_ubyte_d16_hi a5, off, s2 offset:-1 ; encoding: [0xff,0x5f,0x84,0xdc,0x00,0x00,0x82,0x05]
+0xff,0x5f,0x84,0xdc,0x00,0x00,0x82,0x05
+
+# GFX90A: scratch_load_ubyte_d16_hi a255, off, s2 offset:-1 ; encoding: [0xff,0x5f,0x84,0xdc,0x00,0x00,0x82,0xff]
+0xff,0x5f,0x84,0xdc,0x00,0x00,0x82,0xff
+
+# GFX90A: scratch_load_ubyte_d16_hi a5, off, s101 offset:-1 ; encoding: [0xff,0x5f,0x84,0xdc,0x00,0x00,0xe5,0x05]
+0xff,0x5f,0x84,0xdc,0x00,0x00,0xe5,0x05
+
+# GFX90A: scratch_load_ubyte_d16_hi a5, off, flat_scratch_lo offset:-1 ; encoding: [0xff,0x5f,0x84,0xdc,0x00,0x00,0xe6,0x05]
+0xff,0x5f,0x84,0xdc,0x00,0x00,0xe6,0x05
+
+# GFX90A: scratch_load_ubyte_d16_hi a5, off, flat_scratch_hi offset:-1 ; encoding: [0xff,0x5f,0x84,0xdc,0x00,0x00,0xe7,0x05]
+0xff,0x5f,0x84,0xdc,0x00,0x00,0xe7,0x05
+
+# GFX90A: scratch_load_ubyte_d16_hi a5, off, vcc_lo offset:-1 ; encoding: [0xff,0x5f,0x84,0xdc,0x00,0x00,0xea,0x05]
+0xff,0x5f,0x84,0xdc,0x00,0x00,0xea,0x05
+
+# GFX90A: scratch_load_ubyte_d16_hi a5, off, vcc_hi offset:-1 ; encoding: [0xff,0x5f,0x84,0xdc,0x00,0x00,0xeb,0x05]
+0xff,0x5f,0x84,0xdc,0x00,0x00,0xeb,0x05
+
+# GFX90A: scratch_load_ubyte_d16_hi a5, v0, off offset:-1 ; encoding: [0xff,0x5f,0x84,0xdc,0x00,0x00,0xff,0x05]
+0xff,0x5f,0x84,0xdc,0x00,0x00,0xff,0x05
+
+# GFX90A: scratch_load_ubyte_d16_hi a5, off, s2 ; encoding: [0x00,0x40,0x84,0xdc,0x00,0x00,0x82,0x05]
+0x00,0x40,0x84,0xdc,0x00,0x00,0x82,0x05
+
+# GFX90A: scratch_load_ubyte_d16_hi a5, off, s2 ; encoding: [0x00,0x40,0x84,0xdc,0x00,0x00,0x82,0x05]
+0x00,0x40,0x84,0xdc,0x00,0x00,0x82,0x05
+
+# GFX90A: scratch_load_ubyte_d16_hi a5, off, s2 offset:4095 ; encoding: [0xff,0x4f,0x84,0xdc,0x00,0x00,0x82,0x05]
+0xff,0x4f,0x84,0xdc,0x00,0x00,0x82,0x05
+
+# GFX90A: scratch_load_ubyte_d16_hi a5, off, s2 offset:-4096 ; encoding: [0x00,0x50,0x84,0xdc,0x00,0x00,0x82,0x05]
+0x00,0x50,0x84,0xdc,0x00,0x00,0x82,0x05
+
+# GFX90A: scratch_load_ubyte_d16_hi a5, off, s2 offset:-1 glc ; encoding: [0xff,0x5f,0x85,0xdc,0x00,0x00,0x82,0x05]
+0xff,0x5f,0x85,0xdc,0x00,0x00,0x82,0x05
+
+# GFX90A: scratch_load_ubyte_d16_hi a5, off, s2 offset:-1 slc ; encoding: [0xff,0x5f,0x86,0xdc,0x00,0x00,0x82,0x05]
+0xff,0x5f,0x86,0xdc,0x00,0x00,0x82,0x05
+
+# GFX90A: scratch_load_sbyte_d16 a5, off, s2 offset:-1 ; encoding: [0xff,0x5f,0x88,0xdc,0x00,0x00,0x82,0x05]
+0xff,0x5f,0x88,0xdc,0x00,0x00,0x82,0x05
+
+# GFX90A: scratch_load_sbyte_d16 a255, off, s2 offset:-1 ; encoding: [0xff,0x5f,0x88,0xdc,0x00,0x00,0x82,0xff]
+0xff,0x5f,0x88,0xdc,0x00,0x00,0x82,0xff
+
+# GFX90A: scratch_load_sbyte_d16 a5, off, s101 offset:-1 ; encoding: [0xff,0x5f,0x88,0xdc,0x00,0x00,0xe5,0x05]
+0xff,0x5f,0x88,0xdc,0x00,0x00,0xe5,0x05
+
+# GFX90A: scratch_load_sbyte_d16 a5, off, flat_scratch_lo offset:-1 ; encoding: [0xff,0x5f,0x88,0xdc,0x00,0x00,0xe6,0x05]
+0xff,0x5f,0x88,0xdc,0x00,0x00,0xe6,0x05
+
+# GFX90A: scratch_load_sbyte_d16 a5, off, flat_scratch_hi offset:-1 ; encoding: [0xff,0x5f,0x88,0xdc,0x00,0x00,0xe7,0x05]
+0xff,0x5f,0x88,0xdc,0x00,0x00,0xe7,0x05
+
+# GFX90A: scratch_load_sbyte_d16 a5, off, vcc_lo offset:-1 ; encoding: [0xff,0x5f,0x88,0xdc,0x00,0x00,0xea,0x05]
+0xff,0x5f,0x88,0xdc,0x00,0x00,0xea,0x05
+
+# GFX90A: scratch_load_sbyte_d16 a5, off, vcc_hi offset:-1 ; encoding: [0xff,0x5f,0x88,0xdc,0x00,0x00,0xeb,0x05]
+0xff,0x5f,0x88,0xdc,0x00,0x00,0xeb,0x05
+
+# GFX90A: scratch_load_sbyte_d16 a5, v0, off offset:-1 ; encoding: [0xff,0x5f,0x88,0xdc,0x00,0x00,0xff,0x05]
+0xff,0x5f,0x88,0xdc,0x00,0x00,0xff,0x05
+
+# GFX90A: scratch_load_sbyte_d16 a5, off, s2 ; encoding: [0x00,0x40,0x88,0xdc,0x00,0x00,0x82,0x05]
+0x00,0x40,0x88,0xdc,0x00,0x00,0x82,0x05
+
+# GFX90A: scratch_load_sbyte_d16 a5, off, s2 ; encoding: [0x00,0x40,0x88,0xdc,0x00,0x00,0x82,0x05]
+0x00,0x40,0x88,0xdc,0x00,0x00,0x82,0x05
+
+# GFX90A: scratch_load_sbyte_d16 a5, off, s2 offset:4095 ; encoding: [0xff,0x4f,0x88,0xdc,0x00,0x00,0x82,0x05]
+0xff,0x4f,0x88,0xdc,0x00,0x00,0x82,0x05
+
+# GFX90A: scratch_load_sbyte_d16 a5, off, s2 offset:-4096 ; encoding: [0x00,0x50,0x88,0xdc,0x00,0x00,0x82,0x05]
+0x00,0x50,0x88,0xdc,0x00,0x00,0x82,0x05
+
+# GFX90A: scratch_load_sbyte_d16 a5, off, s2 offset:-1 glc ; encoding: [0xff,0x5f,0x89,0xdc,0x00,0x00,0x82,0x05]
+0xff,0x5f,0x89,0xdc,0x00,0x00,0x82,0x05
+
+# GFX90A: scratch_load_sbyte_d16 a5, off, s2 offset:-1 slc ; encoding: [0xff,0x5f,0x8a,0xdc,0x00,0x00,0x82,0x05]
+0xff,0x5f,0x8a,0xdc,0x00,0x00,0x82,0x05
+
+# GFX90A: scratch_load_sbyte_d16_hi a5, off, s2 offset:-1 ; encoding: [0xff,0x5f,0x8c,0xdc,0x00,0x00,0x82,0x05]
+0xff,0x5f,0x8c,0xdc,0x00,0x00,0x82,0x05
+
+# GFX90A: scratch_load_sbyte_d16_hi a255, off, s2 offset:-1 ; encoding: [0xff,0x5f,0x8c,0xdc,0x00,0x00,0x82,0xff]
+0xff,0x5f,0x8c,0xdc,0x00,0x00,0x82,0xff
+
+# GFX90A: scratch_load_sbyte_d16_hi a5, off, s101 offset:-1 ; encoding: [0xff,0x5f,0x8c,0xdc,0x00,0x00,0xe5,0x05]
+0xff,0x5f,0x8c,0xdc,0x00,0x00,0xe5,0x05
+
+# GFX90A: scratch_load_sbyte_d16_hi a5, off, flat_scratch_lo offset:-1 ; encoding: [0xff,0x5f,0x8c,0xdc,0x00,0x00,0xe6,0x05]
+0xff,0x5f,0x8c,0xdc,0x00,0x00,0xe6,0x05
+
+# GFX90A: scratch_load_sbyte_d16_hi a5, off, flat_scratch_hi offset:-1 ; encoding: [0xff,0x5f,0x8c,0xdc,0x00,0x00,0xe7,0x05]
+0xff,0x5f,0x8c,0xdc,0x00,0x00,0xe7,0x05
+
+# GFX90A: scratch_load_sbyte_d16_hi a5, off, vcc_lo offset:-1 ; encoding: [0xff,0x5f,0x8c,0xdc,0x00,0x00,0xea,0x05]
+0xff,0x5f,0x8c,0xdc,0x00,0x00,0xea,0x05
+
+# GFX90A: scratch_load_sbyte_d16_hi a5, off, vcc_hi offset:-1 ; encoding: [0xff,0x5f,0x8c,0xdc,0x00,0x00,0xeb,0x05]
+0xff,0x5f,0x8c,0xdc,0x00,0x00,0xeb,0x05
+
+# GFX90A: scratch_load_sbyte_d16_hi a5, v0, off offset:-1 ; encoding: [0xff,0x5f,0x8c,0xdc,0x00,0x00,0xff,0x05]
+0xff,0x5f,0x8c,0xdc,0x00,0x00,0xff,0x05
+
+# GFX90A: scratch_load_sbyte_d16_hi a5, off, s2 ; encoding: [0x00,0x40,0x8c,0xdc,0x00,0x00,0x82,0x05]
+0x00,0x40,0x8c,0xdc,0x00,0x00,0x82,0x05
+
+# GFX90A: scratch_load_sbyte_d16_hi a5, off, s2 ; encoding: [0x00,0x40,0x8c,0xdc,0x00,0x00,0x82,0x05]
+0x00,0x40,0x8c,0xdc,0x00,0x00,0x82,0x05
+
+# GFX90A: scratch_load_sbyte_d16_hi a5, off, s2 offset:4095 ; encoding: [0xff,0x4f,0x8c,0xdc,0x00,0x00,0x82,0x05]
+0xff,0x4f,0x8c,0xdc,0x00,0x00,0x82,0x05
+
+# GFX90A: scratch_load_sbyte_d16_hi a5, off, s2 offset:-4096 ; encoding: [0x00,0x50,0x8c,0xdc,0x00,0x00,0x82,0x05]
+0x00,0x50,0x8c,0xdc,0x00,0x00,0x82,0x05
+
+# GFX90A: scratch_load_sbyte_d16_hi a5, off, s2 offset:-1 glc ; encoding: [0xff,0x5f,0x8d,0xdc,0x00,0x00,0x82,0x05]
+0xff,0x5f,0x8d,0xdc,0x00,0x00,0x82,0x05
+
+# GFX90A: scratch_load_sbyte_d16_hi a5, off, s2 offset:-1 slc ; encoding: [0xff,0x5f,0x8e,0xdc,0x00,0x00,0x82,0x05]
+0xff,0x5f,0x8e,0xdc,0x00,0x00,0x82,0x05
+
+# GFX90A: scratch_load_short_d16 a5, off, s2 offset:-1 ; encoding: [0xff,0x5f,0x90,0xdc,0x00,0x00,0x82,0x05]
+0xff,0x5f,0x90,0xdc,0x00,0x00,0x82,0x05
+
+# GFX90A: scratch_load_short_d16 a255, off, s2 offset:-1 ; encoding: [0xff,0x5f,0x90,0xdc,0x00,0x00,0x82,0xff]
+0xff,0x5f,0x90,0xdc,0x00,0x00,0x82,0xff
+
+# GFX90A: scratch_load_short_d16 a5, off, s101 offset:-1 ; encoding: [0xff,0x5f,0x90,0xdc,0x00,0x00,0xe5,0x05]
+0xff,0x5f,0x90,0xdc,0x00,0x00,0xe5,0x05
+
+# GFX90A: scratch_load_short_d16 a5, off, flat_scratch_lo offset:-1 ; encoding: [0xff,0x5f,0x90,0xdc,0x00,0x00,0xe6,0x05]
+0xff,0x5f,0x90,0xdc,0x00,0x00,0xe6,0x05
+
+# GFX90A: scratch_load_short_d16 a5, off, flat_scratch_hi offset:-1 ; encoding: [0xff,0x5f,0x90,0xdc,0x00,0x00,0xe7,0x05]
+0xff,0x5f,0x90,0xdc,0x00,0x00,0xe7,0x05
+
+# GFX90A: scratch_load_short_d16 a5, off, vcc_lo offset:-1 ; encoding: [0xff,0x5f,0x90,0xdc,0x00,0x00,0xea,0x05]
+0xff,0x5f,0x90,0xdc,0x00,0x00,0xea,0x05
+
+# GFX90A: scratch_load_short_d16 a5, off, vcc_hi offset:-1 ; encoding: [0xff,0x5f,0x90,0xdc,0x00,0x00,0xeb,0x05]
+0xff,0x5f,0x90,0xdc,0x00,0x00,0xeb,0x05
+
+# GFX90A: scratch_load_short_d16 a5, v0, off offset:-1 ; encoding: [0xff,0x5f,0x90,0xdc,0x00,0x00,0xff,0x05]
+0xff,0x5f,0x90,0xdc,0x00,0x00,0xff,0x05
+
+# GFX90A: scratch_load_short_d16 a5, off, s2 ; encoding: [0x00,0x40,0x90,0xdc,0x00,0x00,0x82,0x05]
+0x00,0x40,0x90,0xdc,0x00,0x00,0x82,0x05
+
+# GFX90A: scratch_load_short_d16 a5, off, s2 ; encoding: [0x00,0x40,0x90,0xdc,0x00,0x00,0x82,0x05]
+0x00,0x40,0x90,0xdc,0x00,0x00,0x82,0x05
+
+# GFX90A: scratch_load_short_d16 a5, off, s2 offset:4095 ; encoding: [0xff,0x4f,0x90,0xdc,0x00,0x00,0x82,0x05]
+0xff,0x4f,0x90,0xdc,0x00,0x00,0x82,0x05
+
+# GFX90A: scratch_load_short_d16 a5, off, s2 offset:-4096 ; encoding: [0x00,0x50,0x90,0xdc,0x00,0x00,0x82,0x05]
+0x00,0x50,0x90,0xdc,0x00,0x00,0x82,0x05
+
+# GFX90A: scratch_load_short_d16 a5, off, s2 offset:-1 glc ; encoding: [0xff,0x5f,0x91,0xdc,0x00,0x00,0x82,0x05]
+0xff,0x5f,0x91,0xdc,0x00,0x00,0x82,0x05
+
+# GFX90A: scratch_load_short_d16 a5, off, s2 offset:-1 slc ; encoding: [0xff,0x5f,0x92,0xdc,0x00,0x00,0x82,0x05]
+0xff,0x5f,0x92,0xdc,0x00,0x00,0x82,0x05
+
+# GFX90A: scratch_load_short_d16_hi a5, off, s2 offset:-1 ; encoding: [0xff,0x5f,0x94,0xdc,0x00,0x00,0x82,0x05]
+0xff,0x5f,0x94,0xdc,0x00,0x00,0x82,0x05
+
+# GFX90A: scratch_load_short_d16_hi a255, off, s2 offset:-1 ; encoding: [0xff,0x5f,0x94,0xdc,0x00,0x00,0x82,0xff]
+0xff,0x5f,0x94,0xdc,0x00,0x00,0x82,0xff
+
+# GFX90A: scratch_load_short_d16_hi a5, off, s101 offset:-1 ; encoding: [0xff,0x5f,0x94,0xdc,0x00,0x00,0xe5,0x05]
+0xff,0x5f,0x94,0xdc,0x00,0x00,0xe5,0x05
+
+# GFX90A: scratch_load_short_d16_hi a5, off, flat_scratch_lo offset:-1 ; encoding: [0xff,0x5f,0x94,0xdc,0x00,0x00,0xe6,0x05]
+0xff,0x5f,0x94,0xdc,0x00,0x00,0xe6,0x05
+
+# GFX90A: scratch_load_short_d16_hi a5, off, flat_scratch_hi offset:-1 ; encoding: [0xff,0x5f,0x94,0xdc,0x00,0x00,0xe7,0x05]
+0xff,0x5f,0x94,0xdc,0x00,0x00,0xe7,0x05
+
+# GFX90A: scratch_load_short_d16_hi a5, off, vcc_lo offset:-1 ; encoding: [0xff,0x5f,0x94,0xdc,0x00,0x00,0xea,0x05]
+0xff,0x5f,0x94,0xdc,0x00,0x00,0xea,0x05
+
+# GFX90A: scratch_load_short_d16_hi a5, off, vcc_hi offset:-1 ; encoding: [0xff,0x5f,0x94,0xdc,0x00,0x00,0xeb,0x05]
+0xff,0x5f,0x94,0xdc,0x00,0x00,0xeb,0x05
+
+# GFX90A: scratch_load_short_d16_hi a5, v0, off offset:-1 ; encoding: [0xff,0x5f,0x94,0xdc,0x00,0x00,0xff,0x05]
+0xff,0x5f,0x94,0xdc,0x00,0x00,0xff,0x05
+
+# GFX90A: scratch_load_short_d16_hi a5, off, s2 ; encoding: [0x00,0x40,0x94,0xdc,0x00,0x00,0x82,0x05]
+0x00,0x40,0x94,0xdc,0x00,0x00,0x82,0x05
+
+# GFX90A: scratch_load_short_d16_hi a5, off, s2 ; encoding: [0x00,0x40,0x94,0xdc,0x00,0x00,0x82,0x05]
+0x00,0x40,0x94,0xdc,0x00,0x00,0x82,0x05
+
+# GFX90A: scratch_load_short_d16_hi a5, off, s2 offset:4095 ; encoding: [0xff,0x4f,0x94,0xdc,0x00,0x00,0x82,0x05]
+0xff,0x4f,0x94,0xdc,0x00,0x00,0x82,0x05
+
+# GFX90A: scratch_load_short_d16_hi a5, off, s2 offset:-4096 ; encoding: [0x00,0x50,0x94,0xdc,0x00,0x00,0x82,0x05]
+0x00,0x50,0x94,0xdc,0x00,0x00,0x82,0x05
+
+# GFX90A: scratch_load_short_d16_hi a5, off, s2 offset:-1 glc ; encoding: [0xff,0x5f,0x95,0xdc,0x00,0x00,0x82,0x05]
+0xff,0x5f,0x95,0xdc,0x00,0x00,0x82,0x05
+
+# GFX90A: scratch_load_short_d16_hi a5, off, s2 offset:-1 slc ; encoding: [0xff,0x5f,0x96,0xdc,0x00,0x00,0x82,0x05]
+0xff,0x5f,0x96,0xdc,0x00,0x00,0x82,0x05
+
+# GFX90A: buffer_load_format_x a5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x00,0xe0,0x00,0x05,0x82,0x03]
+0xff,0x0f,0x00,0xe0,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_load_format_x a255, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x00,0xe0,0x00,0xff,0x82,0x03]
+0xff,0x0f,0x00,0xe0,0x00,0xff,0x82,0x03
+
+# GFX90A: buffer_load_format_x a5, off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x00,0xe0,0x00,0x05,0x83,0x03]
+0xff,0x0f,0x00,0xe0,0x00,0x05,0x83,0x03
+
+# GFX90A: buffer_load_format_x a5, off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x00,0xe0,0x00,0x05,0x98,0x03]
+0xff,0x0f,0x00,0xe0,0x00,0x05,0x98,0x03
+
+# GFX90A: buffer_load_format_x a5, off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x00,0xe0,0x00,0x05,0x82,0x65]
+0xff,0x0f,0x00,0xe0,0x00,0x05,0x82,0x65
+
+# GFX90A: buffer_load_format_x a5, off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x00,0xe0,0x00,0x05,0x82,0x7c]
+0xff,0x0f,0x00,0xe0,0x00,0x05,0x82,0x7c
+
+# GFX90A: buffer_load_format_x a5, off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x00,0xe0,0x00,0x05,0x82,0x80]
+0xff,0x0f,0x00,0xe0,0x00,0x05,0x82,0x80
+
+# GFX90A: buffer_load_format_x a5, off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x00,0xe0,0x00,0x05,0x82,0xc1]
+0xff,0x0f,0x00,0xe0,0x00,0x05,0x82,0xc1
+
+# GFX90A: buffer_load_format_x a5, off, s[8:11], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x00,0xe0,0x00,0x05,0x82,0xf0]
+0xff,0x0f,0x00,0xe0,0x00,0x05,0x82,0xf0
+
+# GFX90A: buffer_load_format_x a5, off, s[8:11], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x00,0xe0,0x00,0x05,0x82,0xf7]
+0xff,0x0f,0x00,0xe0,0x00,0x05,0x82,0xf7
+
+# GFX90A: buffer_load_format_x a5, v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x00,0xe0,0x00,0x05,0x82,0x03]
+0xff,0x2f,0x00,0xe0,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_load_format_x a5, v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x00,0xe0,0x00,0x05,0x82,0x03]
+0xff,0x1f,0x00,0xe0,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_load_format_x a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x00,0xe0,0x00,0x05,0x82,0x03]
+0x00,0x00,0x00,0xe0,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_load_format_x a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x00,0xe0,0x00,0x05,0x82,0x03]
+0x00,0x00,0x00,0xe0,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_load_format_x a5, off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x00,0xe0,0x00,0x05,0x82,0x03]
+0x07,0x00,0x00,0xe0,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_load_format_x a5, off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x00,0xe0,0x00,0x05,0x82,0x03]
+0xff,0x4f,0x00,0xe0,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_load_format_x a5, off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x02,0xe0,0x00,0x05,0x82,0x03]
+0xff,0x0f,0x02,0xe0,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_load_format_x a5, off, s[8:11], s3 offset:4095 lds ; encoding: [0xff,0x0f,0x01,0xe0,0x00,0x05,0x82,0x03]
+0xff,0x0f,0x01,0xe0,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_load_format_xy a[6:7], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x04,0xe0,0x00,0x06,0x82,0x03]
+0xff,0x0f,0x04,0xe0,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_load_format_xy a[254:255], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x04,0xe0,0x00,0xfe,0x82,0x03]
+0xff,0x0f,0x04,0xe0,0x00,0xfe,0x82,0x03
+
+# GFX90A: buffer_load_format_xy a[6:7], off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x04,0xe0,0x00,0x06,0x83,0x03]
+0xff,0x0f,0x04,0xe0,0x00,0x06,0x83,0x03
+
+# GFX90A: buffer_load_format_xy a[6:7], off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x04,0xe0,0x00,0x06,0x98,0x03]
+0xff,0x0f,0x04,0xe0,0x00,0x06,0x98,0x03
+
+# GFX90A: buffer_load_format_xy a[6:7], off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x04,0xe0,0x00,0x06,0x82,0x65]
+0xff,0x0f,0x04,0xe0,0x00,0x06,0x82,0x65
+
+# GFX90A: buffer_load_format_xy a[6:7], off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x04,0xe0,0x00,0x06,0x82,0x7c]
+0xff,0x0f,0x04,0xe0,0x00,0x06,0x82,0x7c
+
+# GFX90A: buffer_load_format_xy a[6:7], off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x04,0xe0,0x00,0x06,0x82,0x80]
+0xff,0x0f,0x04,0xe0,0x00,0x06,0x82,0x80
+
+# GFX90A: buffer_load_format_xy a[6:7], off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x04,0xe0,0x00,0x06,0x82,0xc1]
+0xff,0x0f,0x04,0xe0,0x00,0x06,0x82,0xc1
+
+# GFX90A: buffer_load_format_xy a[6:7], off, s[8:11], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x04,0xe0,0x00,0x06,0x82,0xf0]
+0xff,0x0f,0x04,0xe0,0x00,0x06,0x82,0xf0
+
+# GFX90A: buffer_load_format_xy a[6:7], off, s[8:11], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x04,0xe0,0x00,0x06,0x82,0xf7]
+0xff,0x0f,0x04,0xe0,0x00,0x06,0x82,0xf7
+
+# GFX90A: buffer_load_format_xy a[6:7], v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x04,0xe0,0x00,0x06,0x82,0x03]
+0xff,0x2f,0x04,0xe0,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_load_format_xy a[6:7], v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x04,0xe0,0x00,0x06,0x82,0x03]
+0xff,0x1f,0x04,0xe0,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_load_format_xy a[6:7], off, s[8:11], s3 ; encoding: [0x00,0x00,0x04,0xe0,0x00,0x06,0x82,0x03]
+0x00,0x00,0x04,0xe0,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_load_format_xy a[6:7], off, s[8:11], s3 ; encoding: [0x00,0x00,0x04,0xe0,0x00,0x06,0x82,0x03]
+0x00,0x00,0x04,0xe0,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_load_format_xy a[6:7], off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x04,0xe0,0x00,0x06,0x82,0x03]
+0x07,0x00,0x04,0xe0,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_load_format_xy a[6:7], off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x04,0xe0,0x00,0x06,0x82,0x03]
+0xff,0x4f,0x04,0xe0,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_load_format_xy a[6:7], off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x06,0xe0,0x00,0x06,0x82,0x03]
+0xff,0x0f,0x06,0xe0,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_load_format_xyz a[6:8], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x08,0xe0,0x00,0x06,0x82,0x03]
+0xff,0x0f,0x08,0xe0,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_load_format_xyz a[252:254], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x08,0xe0,0x00,0xfc,0x82,0x03]
+0xff,0x0f,0x08,0xe0,0x00,0xfc,0x82,0x03
+
+# GFX90A: buffer_load_format_xyz a[6:8], off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x08,0xe0,0x00,0x06,0x83,0x03]
+0xff,0x0f,0x08,0xe0,0x00,0x06,0x83,0x03
+
+# GFX90A: buffer_load_format_xyz a[6:8], off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x08,0xe0,0x00,0x06,0x98,0x03]
+0xff,0x0f,0x08,0xe0,0x00,0x06,0x98,0x03
+
+# GFX90A: buffer_load_format_xyz a[6:8], off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x08,0xe0,0x00,0x06,0x82,0x65]
+0xff,0x0f,0x08,0xe0,0x00,0x06,0x82,0x65
+
+# GFX90A: buffer_load_format_xyz a[6:8], off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x08,0xe0,0x00,0x06,0x82,0x7c]
+0xff,0x0f,0x08,0xe0,0x00,0x06,0x82,0x7c
+
+# GFX90A: buffer_load_format_xyz a[6:8], off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x08,0xe0,0x00,0x06,0x82,0x80]
+0xff,0x0f,0x08,0xe0,0x00,0x06,0x82,0x80
+
+# GFX90A: buffer_load_format_xyz a[6:8], off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x08,0xe0,0x00,0x06,0x82,0xc1]
+0xff,0x0f,0x08,0xe0,0x00,0x06,0x82,0xc1
+
+# GFX90A: buffer_load_format_xyz a[6:8], off, s[8:11], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x08,0xe0,0x00,0x06,0x82,0xf0]
+0xff,0x0f,0x08,0xe0,0x00,0x06,0x82,0xf0
+
+# GFX90A: buffer_load_format_xyz a[6:8], off, s[8:11], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x08,0xe0,0x00,0x06,0x82,0xf7]
+0xff,0x0f,0x08,0xe0,0x00,0x06,0x82,0xf7
+
+# GFX90A: buffer_load_format_xyz a[6:8], v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x08,0xe0,0x00,0x06,0x82,0x03]
+0xff,0x2f,0x08,0xe0,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_load_format_xyz a[6:8], v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x08,0xe0,0x00,0x06,0x82,0x03]
+0xff,0x1f,0x08,0xe0,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_load_format_xyz a[6:8], off, s[8:11], s3 ; encoding: [0x00,0x00,0x08,0xe0,0x00,0x06,0x82,0x03]
+0x00,0x00,0x08,0xe0,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_load_format_xyz a[6:8], off, s[8:11], s3 ; encoding: [0x00,0x00,0x08,0xe0,0x00,0x06,0x82,0x03]
+0x00,0x00,0x08,0xe0,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_load_format_xyz a[6:8], off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x08,0xe0,0x00,0x06,0x82,0x03]
+0x07,0x00,0x08,0xe0,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_load_format_xyz a[6:8], off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x08,0xe0,0x00,0x06,0x82,0x03]
+0xff,0x4f,0x08,0xe0,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_load_format_xyz a[6:8], off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x0a,0xe0,0x00,0x06,0x82,0x03]
+0xff,0x0f,0x0a,0xe0,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_load_format_xyzw a[6:9], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x0c,0xe0,0x00,0x06,0x82,0x03]
+0xff,0x0f,0x0c,0xe0,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_load_format_xyzw a[252:255], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x0c,0xe0,0x00,0xfc,0x82,0x03]
+0xff,0x0f,0x0c,0xe0,0x00,0xfc,0x82,0x03
+
+# GFX90A: buffer_load_format_xyzw a[6:9], off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x0c,0xe0,0x00,0x06,0x83,0x03]
+0xff,0x0f,0x0c,0xe0,0x00,0x06,0x83,0x03
+
+# GFX90A: buffer_load_format_xyzw a[6:9], off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x0c,0xe0,0x00,0x06,0x98,0x03]
+0xff,0x0f,0x0c,0xe0,0x00,0x06,0x98,0x03
+
+# GFX90A: buffer_load_format_xyzw a[6:9], off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x0c,0xe0,0x00,0x06,0x82,0x65]
+0xff,0x0f,0x0c,0xe0,0x00,0x06,0x82,0x65
+
+# GFX90A: buffer_load_format_xyzw a[6:9], off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x0c,0xe0,0x00,0x06,0x82,0x7c]
+0xff,0x0f,0x0c,0xe0,0x00,0x06,0x82,0x7c
+
+# GFX90A: buffer_load_format_xyzw a[6:9], off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x0c,0xe0,0x00,0x06,0x82,0x80]
+0xff,0x0f,0x0c,0xe0,0x00,0x06,0x82,0x80
+
+# GFX90A: buffer_load_format_xyzw a[6:9], off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x0c,0xe0,0x00,0x06,0x82,0xc1]
+0xff,0x0f,0x0c,0xe0,0x00,0x06,0x82,0xc1
+
+# GFX90A: buffer_load_format_xyzw a[6:9], off, s[8:11], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x0c,0xe0,0x00,0x06,0x82,0xf0]
+0xff,0x0f,0x0c,0xe0,0x00,0x06,0x82,0xf0
+
+# GFX90A: buffer_load_format_xyzw a[6:9], off, s[8:11], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x0c,0xe0,0x00,0x06,0x82,0xf7]
+0xff,0x0f,0x0c,0xe0,0x00,0x06,0x82,0xf7
+
+# GFX90A: buffer_load_format_xyzw a[6:9], v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x0c,0xe0,0x00,0x06,0x82,0x03]
+0xff,0x2f,0x0c,0xe0,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_load_format_xyzw a[6:9], v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x0c,0xe0,0x00,0x06,0x82,0x03]
+0xff,0x1f,0x0c,0xe0,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_load_format_xyzw a[6:9], off, s[8:11], s3 ; encoding: [0x00,0x00,0x0c,0xe0,0x00,0x06,0x82,0x03]
+0x00,0x00,0x0c,0xe0,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_load_format_xyzw a[6:9], off, s[8:11], s3 ; encoding: [0x00,0x00,0x0c,0xe0,0x00,0x06,0x82,0x03]
+0x00,0x00,0x0c,0xe0,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_load_format_xyzw a[6:9], off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x0c,0xe0,0x00,0x06,0x82,0x03]
+0x07,0x00,0x0c,0xe0,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_load_format_xyzw a[6:9], off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x0c,0xe0,0x00,0x06,0x82,0x03]
+0xff,0x4f,0x0c,0xe0,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_load_format_xyzw a[6:9], off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x0e,0xe0,0x00,0x06,0x82,0x03]
+0xff,0x0f,0x0e,0xe0,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_store_format_x a1, off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x10,0xe0,0x00,0x01,0x83,0x04]
+0xff,0x0f,0x10,0xe0,0x00,0x01,0x83,0x04
+
+# GFX90A: buffer_store_format_x a255, off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x10,0xe0,0x00,0xff,0x83,0x04]
+0xff,0x0f,0x10,0xe0,0x00,0xff,0x83,0x04
+
+# GFX90A: buffer_store_format_x a1, off, s[16:19], s4 offset:4095 ; encoding: [0xff,0x0f,0x10,0xe0,0x00,0x01,0x84,0x04]
+0xff,0x0f,0x10,0xe0,0x00,0x01,0x84,0x04
+
+# GFX90A: buffer_store_format_x a1, off, s[96:99], s4 offset:4095 ; encoding: [0xff,0x0f,0x10,0xe0,0x00,0x01,0x98,0x04]
+0xff,0x0f,0x10,0xe0,0x00,0x01,0x98,0x04
+
+# GFX90A: buffer_store_format_x a1, off, s[12:15], s101 offset:4095 ; encoding: [0xff,0x0f,0x10,0xe0,0x00,0x01,0x83,0x65]
+0xff,0x0f,0x10,0xe0,0x00,0x01,0x83,0x65
+
+# GFX90A: buffer_store_format_x a1, off, s[12:15], m0 offset:4095 ; encoding: [0xff,0x0f,0x10,0xe0,0x00,0x01,0x83,0x7c]
+0xff,0x0f,0x10,0xe0,0x00,0x01,0x83,0x7c
+
+# GFX90A: buffer_store_format_x a1, off, s[12:15], 0 offset:4095 ; encoding: [0xff,0x0f,0x10,0xe0,0x00,0x01,0x83,0x80]
+0xff,0x0f,0x10,0xe0,0x00,0x01,0x83,0x80
+
+# GFX90A: buffer_store_format_x a1, off, s[12:15], -1 offset:4095 ; encoding: [0xff,0x0f,0x10,0xe0,0x00,0x01,0x83,0xc1]
+0xff,0x0f,0x10,0xe0,0x00,0x01,0x83,0xc1
+
+# GFX90A: buffer_store_format_x a1, off, s[12:15], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x10,0xe0,0x00,0x01,0x83,0xf0]
+0xff,0x0f,0x10,0xe0,0x00,0x01,0x83,0xf0
+
+# GFX90A: buffer_store_format_x a1, off, s[12:15], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x10,0xe0,0x00,0x01,0x83,0xf7]
+0xff,0x0f,0x10,0xe0,0x00,0x01,0x83,0xf7
+
+# GFX90A: buffer_store_format_x a1, v0, s[12:15], s4 idxen offset:4095 ; encoding: [0xff,0x2f,0x10,0xe0,0x00,0x01,0x83,0x04]
+0xff,0x2f,0x10,0xe0,0x00,0x01,0x83,0x04
+
+# GFX90A: buffer_store_format_x a1, v0, s[12:15], s4 offen offset:4095 ; encoding: [0xff,0x1f,0x10,0xe0,0x00,0x01,0x83,0x04]
+0xff,0x1f,0x10,0xe0,0x00,0x01,0x83,0x04
+
+# GFX90A: buffer_store_format_x a1, off, s[12:15], s4 ; encoding: [0x00,0x00,0x10,0xe0,0x00,0x01,0x83,0x04]
+0x00,0x00,0x10,0xe0,0x00,0x01,0x83,0x04
+
+# GFX90A: buffer_store_format_x a1, off, s[12:15], s4 ; encoding: [0x00,0x00,0x10,0xe0,0x00,0x01,0x83,0x04]
+0x00,0x00,0x10,0xe0,0x00,0x01,0x83,0x04
+
+# GFX90A: buffer_store_format_x a1, off, s[12:15], s4 offset:7 ; encoding: [0x07,0x00,0x10,0xe0,0x00,0x01,0x83,0x04]
+0x07,0x00,0x10,0xe0,0x00,0x01,0x83,0x04
+
+# GFX90A: buffer_store_format_x a1, off, s[12:15], s4 offset:4095 glc ; encoding: [0xff,0x4f,0x10,0xe0,0x00,0x01,0x83,0x04]
+0xff,0x4f,0x10,0xe0,0x00,0x01,0x83,0x04
+
+# GFX90A: buffer_store_format_x a1, off, s[12:15], s4 offset:4095 slc ; encoding: [0xff,0x0f,0x12,0xe0,0x00,0x01,0x83,0x04]
+0xff,0x0f,0x12,0xe0,0x00,0x01,0x83,0x04
+
+# GFX90A: buffer_store_format_xy a[2:3], off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x14,0xe0,0x00,0x02,0x83,0x04]
+0xff,0x0f,0x14,0xe0,0x00,0x02,0x83,0x04
+
+# GFX90A: buffer_store_format_xy a[254:255], off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x14,0xe0,0x00,0xfe,0x83,0x04]
+0xff,0x0f,0x14,0xe0,0x00,0xfe,0x83,0x04
+
+# GFX90A: buffer_store_format_xy a[2:3], off, s[16:19], s4 offset:4095 ; encoding: [0xff,0x0f,0x14,0xe0,0x00,0x02,0x84,0x04]
+0xff,0x0f,0x14,0xe0,0x00,0x02,0x84,0x04
+
+# GFX90A: buffer_store_format_xy a[2:3], off, s[96:99], s4 offset:4095 ; encoding: [0xff,0x0f,0x14,0xe0,0x00,0x02,0x98,0x04]
+0xff,0x0f,0x14,0xe0,0x00,0x02,0x98,0x04
+
+# GFX90A: buffer_store_format_xy a[2:3], off, s[12:15], s101 offset:4095 ; encoding: [0xff,0x0f,0x14,0xe0,0x00,0x02,0x83,0x65]
+0xff,0x0f,0x14,0xe0,0x00,0x02,0x83,0x65
+
+# GFX90A: buffer_store_format_xy a[2:3], off, s[12:15], m0 offset:4095 ; encoding: [0xff,0x0f,0x14,0xe0,0x00,0x02,0x83,0x7c]
+0xff,0x0f,0x14,0xe0,0x00,0x02,0x83,0x7c
+
+# GFX90A: buffer_store_format_xy a[2:3], off, s[12:15], 0 offset:4095 ; encoding: [0xff,0x0f,0x14,0xe0,0x00,0x02,0x83,0x80]
+0xff,0x0f,0x14,0xe0,0x00,0x02,0x83,0x80
+
+# GFX90A: buffer_store_format_xy a[2:3], off, s[12:15], -1 offset:4095 ; encoding: [0xff,0x0f,0x14,0xe0,0x00,0x02,0x83,0xc1]
+0xff,0x0f,0x14,0xe0,0x00,0x02,0x83,0xc1
+
+# GFX90A: buffer_store_format_xy a[2:3], off, s[12:15], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x14,0xe0,0x00,0x02,0x83,0xf0]
+0xff,0x0f,0x14,0xe0,0x00,0x02,0x83,0xf0
+
+# GFX90A: buffer_store_format_xy a[2:3], off, s[12:15], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x14,0xe0,0x00,0x02,0x83,0xf7]
+0xff,0x0f,0x14,0xe0,0x00,0x02,0x83,0xf7
+
+# GFX90A: buffer_store_format_xy a[2:3], v0, s[12:15], s4 idxen offset:4095 ; encoding: [0xff,0x2f,0x14,0xe0,0x00,0x02,0x83,0x04]
+0xff,0x2f,0x14,0xe0,0x00,0x02,0x83,0x04
+
+# GFX90A: buffer_store_format_xy a[2:3], v0, s[12:15], s4 offen offset:4095 ; encoding: [0xff,0x1f,0x14,0xe0,0x00,0x02,0x83,0x04]
+0xff,0x1f,0x14,0xe0,0x00,0x02,0x83,0x04
+
+# GFX90A: buffer_store_format_xy a[2:3], off, s[12:15], s4 ; encoding: [0x00,0x00,0x14,0xe0,0x00,0x02,0x83,0x04]
+0x00,0x00,0x14,0xe0,0x00,0x02,0x83,0x04
+
+# GFX90A: buffer_store_format_xy a[2:3], off, s[12:15], s4 ; encoding: [0x00,0x00,0x14,0xe0,0x00,0x02,0x83,0x04]
+0x00,0x00,0x14,0xe0,0x00,0x02,0x83,0x04
+
+# GFX90A: buffer_store_format_xy a[2:3], off, s[12:15], s4 offset:7 ; encoding: [0x07,0x00,0x14,0xe0,0x00,0x02,0x83,0x04]
+0x07,0x00,0x14,0xe0,0x00,0x02,0x83,0x04
+
+# GFX90A: buffer_store_format_xy a[2:3], off, s[12:15], s4 offset:4095 glc ; encoding: [0xff,0x4f,0x14,0xe0,0x00,0x02,0x83,0x04]
+0xff,0x4f,0x14,0xe0,0x00,0x02,0x83,0x04
+
+# GFX90A: buffer_store_format_xy a[2:3], off, s[12:15], s4 offset:4095 slc ; encoding: [0xff,0x0f,0x16,0xe0,0x00,0x02,0x83,0x04]
+0xff,0x0f,0x16,0xe0,0x00,0x02,0x83,0x04
+
+# GFX90A: buffer_store_format_xyz a[2:4], off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x18,0xe0,0x00,0x02,0x83,0x04]
+0xff,0x0f,0x18,0xe0,0x00,0x02,0x83,0x04
+
+# GFX90A: buffer_store_format_xyz a[252:254], off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x18,0xe0,0x00,0xfc,0x83,0x04]
+0xff,0x0f,0x18,0xe0,0x00,0xfc,0x83,0x04
+
+# GFX90A: buffer_store_format_xyz a[2:4], off, s[16:19], s4 offset:4095 ; encoding: [0xff,0x0f,0x18,0xe0,0x00,0x02,0x84,0x04]
+0xff,0x0f,0x18,0xe0,0x00,0x02,0x84,0x04
+
+# GFX90A: buffer_store_format_xyz a[2:4], off, s[96:99], s4 offset:4095 ; encoding: [0xff,0x0f,0x18,0xe0,0x00,0x02,0x98,0x04]
+0xff,0x0f,0x18,0xe0,0x00,0x02,0x98,0x04
+
+# GFX90A: buffer_store_format_xyz a[2:4], off, s[12:15], s101 offset:4095 ; encoding: [0xff,0x0f,0x18,0xe0,0x00,0x02,0x83,0x65]
+0xff,0x0f,0x18,0xe0,0x00,0x02,0x83,0x65
+
+# GFX90A: buffer_store_format_xyz a[2:4], off, s[12:15], m0 offset:4095 ; encoding: [0xff,0x0f,0x18,0xe0,0x00,0x02,0x83,0x7c]
+0xff,0x0f,0x18,0xe0,0x00,0x02,0x83,0x7c
+
+# GFX90A: buffer_store_format_xyz a[2:4], off, s[12:15], 0 offset:4095 ; encoding: [0xff,0x0f,0x18,0xe0,0x00,0x02,0x83,0x80]
+0xff,0x0f,0x18,0xe0,0x00,0x02,0x83,0x80
+
+# GFX90A: buffer_store_format_xyz a[2:4], off, s[12:15], -1 offset:4095 ; encoding: [0xff,0x0f,0x18,0xe0,0x00,0x02,0x83,0xc1]
+0xff,0x0f,0x18,0xe0,0x00,0x02,0x83,0xc1
+
+# GFX90A: buffer_store_format_xyz a[2:4], off, s[12:15], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x18,0xe0,0x00,0x02,0x83,0xf0]
+0xff,0x0f,0x18,0xe0,0x00,0x02,0x83,0xf0
+
+# GFX90A: buffer_store_format_xyz a[2:4], off, s[12:15], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x18,0xe0,0x00,0x02,0x83,0xf7]
+0xff,0x0f,0x18,0xe0,0x00,0x02,0x83,0xf7
+
+# GFX90A: buffer_store_format_xyz a[2:4], v0, s[12:15], s4 idxen offset:4095 ; encoding: [0xff,0x2f,0x18,0xe0,0x00,0x02,0x83,0x04]
+0xff,0x2f,0x18,0xe0,0x00,0x02,0x83,0x04
+
+# GFX90A: buffer_store_format_xyz a[2:4], v0, s[12:15], s4 offen offset:4095 ; encoding: [0xff,0x1f,0x18,0xe0,0x00,0x02,0x83,0x04]
+0xff,0x1f,0x18,0xe0,0x00,0x02,0x83,0x04
+
+# GFX90A: buffer_store_format_xyz a[2:4], off, s[12:15], s4 ; encoding: [0x00,0x00,0x18,0xe0,0x00,0x02,0x83,0x04]
+0x00,0x00,0x18,0xe0,0x00,0x02,0x83,0x04
+
+# GFX90A: buffer_store_format_xyz a[2:4], off, s[12:15], s4 ; encoding: [0x00,0x00,0x18,0xe0,0x00,0x02,0x83,0x04]
+0x00,0x00,0x18,0xe0,0x00,0x02,0x83,0x04
+
+# GFX90A: buffer_store_format_xyz a[2:4], off, s[12:15], s4 offset:7 ; encoding: [0x07,0x00,0x18,0xe0,0x00,0x02,0x83,0x04]
+0x07,0x00,0x18,0xe0,0x00,0x02,0x83,0x04
+
+# GFX90A: buffer_store_format_xyz a[2:4], off, s[12:15], s4 offset:4095 glc ; encoding: [0xff,0x4f,0x18,0xe0,0x00,0x02,0x83,0x04]
+0xff,0x4f,0x18,0xe0,0x00,0x02,0x83,0x04
+
+# GFX90A: buffer_store_format_xyz a[2:4], off, s[12:15], s4 offset:4095 slc ; encoding: [0xff,0x0f,0x1a,0xe0,0x00,0x02,0x83,0x04]
+0xff,0x0f,0x1a,0xe0,0x00,0x02,0x83,0x04
+
+# GFX90A: buffer_store_format_xyzw a[2:5], off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x1c,0xe0,0x00,0x02,0x83,0x04]
+0xff,0x0f,0x1c,0xe0,0x00,0x02,0x83,0x04
+
+# GFX90A: buffer_store_format_xyzw a[252:255], off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x1c,0xe0,0x00,0xfc,0x83,0x04]
+0xff,0x0f,0x1c,0xe0,0x00,0xfc,0x83,0x04
+
+# GFX90A: buffer_store_format_xyzw a[2:5], off, s[16:19], s4 offset:4095 ; encoding: [0xff,0x0f,0x1c,0xe0,0x00,0x02,0x84,0x04]
+0xff,0x0f,0x1c,0xe0,0x00,0x02,0x84,0x04
+
+# GFX90A: buffer_store_format_xyzw a[2:5], off, s[96:99], s4 offset:4095 ; encoding: [0xff,0x0f,0x1c,0xe0,0x00,0x02,0x98,0x04]
+0xff,0x0f,0x1c,0xe0,0x00,0x02,0x98,0x04
+
+# GFX90A: buffer_store_format_xyzw a[2:5], off, s[12:15], s101 offset:4095 ; encoding: [0xff,0x0f,0x1c,0xe0,0x00,0x02,0x83,0x65]
+0xff,0x0f,0x1c,0xe0,0x00,0x02,0x83,0x65
+
+# GFX90A: buffer_store_format_xyzw a[2:5], off, s[12:15], m0 offset:4095 ; encoding: [0xff,0x0f,0x1c,0xe0,0x00,0x02,0x83,0x7c]
+0xff,0x0f,0x1c,0xe0,0x00,0x02,0x83,0x7c
+
+# GFX90A: buffer_store_format_xyzw a[2:5], off, s[12:15], 0 offset:4095 ; encoding: [0xff,0x0f,0x1c,0xe0,0x00,0x02,0x83,0x80]
+0xff,0x0f,0x1c,0xe0,0x00,0x02,0x83,0x80
+
+# GFX90A: buffer_store_format_xyzw a[2:5], off, s[12:15], -1 offset:4095 ; encoding: [0xff,0x0f,0x1c,0xe0,0x00,0x02,0x83,0xc1]
+0xff,0x0f,0x1c,0xe0,0x00,0x02,0x83,0xc1
+
+# GFX90A: buffer_store_format_xyzw a[2:5], off, s[12:15], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x1c,0xe0,0x00,0x02,0x83,0xf0]
+0xff,0x0f,0x1c,0xe0,0x00,0x02,0x83,0xf0
+
+# GFX90A: buffer_store_format_xyzw a[2:5], off, s[12:15], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x1c,0xe0,0x00,0x02,0x83,0xf7]
+0xff,0x0f,0x1c,0xe0,0x00,0x02,0x83,0xf7
+
+# GFX90A: buffer_store_format_xyzw a[2:5], v0, s[12:15], s4 idxen offset:4095 ; encoding: [0xff,0x2f,0x1c,0xe0,0x00,0x02,0x83,0x04]
+0xff,0x2f,0x1c,0xe0,0x00,0x02,0x83,0x04
+
+# GFX90A: buffer_store_format_xyzw a[2:5], v0, s[12:15], s4 offen offset:4095 ; encoding: [0xff,0x1f,0x1c,0xe0,0x00,0x02,0x83,0x04]
+0xff,0x1f,0x1c,0xe0,0x00,0x02,0x83,0x04
+
+# GFX90A: buffer_store_format_xyzw a[2:5], off, s[12:15], s4 ; encoding: [0x00,0x00,0x1c,0xe0,0x00,0x02,0x83,0x04]
+0x00,0x00,0x1c,0xe0,0x00,0x02,0x83,0x04
+
+# GFX90A: buffer_store_format_xyzw a[2:5], off, s[12:15], s4 ; encoding: [0x00,0x00,0x1c,0xe0,0x00,0x02,0x83,0x04]
+0x00,0x00,0x1c,0xe0,0x00,0x02,0x83,0x04
+
+# GFX90A: buffer_store_format_xyzw a[2:5], off, s[12:15], s4 offset:7 ; encoding: [0x07,0x00,0x1c,0xe0,0x00,0x02,0x83,0x04]
+0x07,0x00,0x1c,0xe0,0x00,0x02,0x83,0x04
+
+# GFX90A: buffer_store_format_xyzw a[2:5], off, s[12:15], s4 offset:4095 glc ; encoding: [0xff,0x4f,0x1c,0xe0,0x00,0x02,0x83,0x04]
+0xff,0x4f,0x1c,0xe0,0x00,0x02,0x83,0x04
+
+# GFX90A: buffer_store_format_xyzw a[2:5], off, s[12:15], s4 offset:4095 slc ; encoding: [0xff,0x0f,0x1e,0xe0,0x00,0x02,0x83,0x04]
+0xff,0x0f,0x1e,0xe0,0x00,0x02,0x83,0x04
+
+# GFX90A: buffer_load_format_d16_x a5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x20,0xe0,0x00,0x05,0x82,0x03]
+0xff,0x0f,0x20,0xe0,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_load_format_d16_x a255, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x20,0xe0,0x00,0xff,0x82,0x03]
+0xff,0x0f,0x20,0xe0,0x00,0xff,0x82,0x03
+
+# GFX90A: buffer_load_format_d16_x a5, off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x20,0xe0,0x00,0x05,0x83,0x03]
+0xff,0x0f,0x20,0xe0,0x00,0x05,0x83,0x03
+
+# GFX90A: buffer_load_format_d16_x a5, off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x20,0xe0,0x00,0x05,0x98,0x03]
+0xff,0x0f,0x20,0xe0,0x00,0x05,0x98,0x03
+
+# GFX90A: buffer_load_format_d16_x a5, off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x20,0xe0,0x00,0x05,0x82,0x65]
+0xff,0x0f,0x20,0xe0,0x00,0x05,0x82,0x65
+
+# GFX90A: buffer_load_format_d16_x a5, off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x20,0xe0,0x00,0x05,0x82,0x7c]
+0xff,0x0f,0x20,0xe0,0x00,0x05,0x82,0x7c
+
+# GFX90A: buffer_load_format_d16_x a5, off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x20,0xe0,0x00,0x05,0x82,0x80]
+0xff,0x0f,0x20,0xe0,0x00,0x05,0x82,0x80
+
+# GFX90A: buffer_load_format_d16_x a5, off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x20,0xe0,0x00,0x05,0x82,0xc1]
+0xff,0x0f,0x20,0xe0,0x00,0x05,0x82,0xc1
+
+# GFX90A: buffer_load_format_d16_x a5, off, s[8:11], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x20,0xe0,0x00,0x05,0x82,0xf0]
+0xff,0x0f,0x20,0xe0,0x00,0x05,0x82,0xf0
+
+# GFX90A: buffer_load_format_d16_x a5, off, s[8:11], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x20,0xe0,0x00,0x05,0x82,0xf7]
+0xff,0x0f,0x20,0xe0,0x00,0x05,0x82,0xf7
+
+# GFX90A: buffer_load_format_d16_x a5, v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x20,0xe0,0x00,0x05,0x82,0x03]
+0xff,0x2f,0x20,0xe0,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_load_format_d16_x a5, v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x20,0xe0,0x00,0x05,0x82,0x03]
+0xff,0x1f,0x20,0xe0,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_load_format_d16_x a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x20,0xe0,0x00,0x05,0x82,0x03]
+0x00,0x00,0x20,0xe0,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_load_format_d16_x a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x20,0xe0,0x00,0x05,0x82,0x03]
+0x00,0x00,0x20,0xe0,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_load_format_d16_x a5, off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x20,0xe0,0x00,0x05,0x82,0x03]
+0x07,0x00,0x20,0xe0,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_load_format_d16_x a5, off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x20,0xe0,0x00,0x05,0x82,0x03]
+0xff,0x4f,0x20,0xe0,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_load_format_d16_x a5, off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x22,0xe0,0x00,0x05,0x82,0x03]
+0xff,0x0f,0x22,0xe0,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_load_format_d16_xy a5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x24,0xe0,0x00,0x05,0x82,0x03]
+0xff,0x0f,0x24,0xe0,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_load_format_d16_xy a255, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x24,0xe0,0x00,0xff,0x82,0x03]
+0xff,0x0f,0x24,0xe0,0x00,0xff,0x82,0x03
+
+# GFX90A: buffer_load_format_d16_xy a5, off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x24,0xe0,0x00,0x05,0x83,0x03]
+0xff,0x0f,0x24,0xe0,0x00,0x05,0x83,0x03
+
+# GFX90A: buffer_load_format_d16_xy a5, off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x24,0xe0,0x00,0x05,0x98,0x03]
+0xff,0x0f,0x24,0xe0,0x00,0x05,0x98,0x03
+
+# GFX90A: buffer_load_format_d16_xy a5, off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x24,0xe0,0x00,0x05,0x82,0x65]
+0xff,0x0f,0x24,0xe0,0x00,0x05,0x82,0x65
+
+# GFX90A: buffer_load_format_d16_xy a5, off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x24,0xe0,0x00,0x05,0x82,0x7c]
+0xff,0x0f,0x24,0xe0,0x00,0x05,0x82,0x7c
+
+# GFX90A: buffer_load_format_d16_xy a5, off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x24,0xe0,0x00,0x05,0x82,0x80]
+0xff,0x0f,0x24,0xe0,0x00,0x05,0x82,0x80
+
+# GFX90A: buffer_load_format_d16_xy a5, off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x24,0xe0,0x00,0x05,0x82,0xc1]
+0xff,0x0f,0x24,0xe0,0x00,0x05,0x82,0xc1
+
+# GFX90A: buffer_load_format_d16_xy a5, off, s[8:11], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x24,0xe0,0x00,0x05,0x82,0xf0]
+0xff,0x0f,0x24,0xe0,0x00,0x05,0x82,0xf0
+
+# GFX90A: buffer_load_format_d16_xy a5, off, s[8:11], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x24,0xe0,0x00,0x05,0x82,0xf7]
+0xff,0x0f,0x24,0xe0,0x00,0x05,0x82,0xf7
+
+# GFX90A: buffer_load_format_d16_xy a5, v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x24,0xe0,0x00,0x05,0x82,0x03]
+0xff,0x2f,0x24,0xe0,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_load_format_d16_xy a5, v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x24,0xe0,0x00,0x05,0x82,0x03]
+0xff,0x1f,0x24,0xe0,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_load_format_d16_xy a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x24,0xe0,0x00,0x05,0x82,0x03]
+0x00,0x00,0x24,0xe0,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_load_format_d16_xy a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x24,0xe0,0x00,0x05,0x82,0x03]
+0x00,0x00,0x24,0xe0,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_load_format_d16_xy a5, off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x24,0xe0,0x00,0x05,0x82,0x03]
+0x07,0x00,0x24,0xe0,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_load_format_d16_xy a5, off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x24,0xe0,0x00,0x05,0x82,0x03]
+0xff,0x4f,0x24,0xe0,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_load_format_d16_xy a5, off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x26,0xe0,0x00,0x05,0x82,0x03]
+0xff,0x0f,0x26,0xe0,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_load_format_d16_xyz a[6:7], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x28,0xe0,0x00,0x06,0x82,0x03]
+0xff,0x0f,0x28,0xe0,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_load_format_d16_xyz a[254:255], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x28,0xe0,0x00,0xfe,0x82,0x03]
+0xff,0x0f,0x28,0xe0,0x00,0xfe,0x82,0x03
+
+# GFX90A: buffer_load_format_d16_xyz a[6:7], off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x28,0xe0,0x00,0x06,0x83,0x03]
+0xff,0x0f,0x28,0xe0,0x00,0x06,0x83,0x03
+
+# GFX90A: buffer_load_format_d16_xyz a[6:7], off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x28,0xe0,0x00,0x06,0x98,0x03]
+0xff,0x0f,0x28,0xe0,0x00,0x06,0x98,0x03
+
+# GFX90A: buffer_load_format_d16_xyz a[6:7], off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x28,0xe0,0x00,0x06,0x82,0x65]
+0xff,0x0f,0x28,0xe0,0x00,0x06,0x82,0x65
+
+# GFX90A: buffer_load_format_d16_xyz a[6:7], off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x28,0xe0,0x00,0x06,0x82,0x7c]
+0xff,0x0f,0x28,0xe0,0x00,0x06,0x82,0x7c
+
+# GFX90A: buffer_load_format_d16_xyz a[6:7], off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x28,0xe0,0x00,0x06,0x82,0x80]
+0xff,0x0f,0x28,0xe0,0x00,0x06,0x82,0x80
+
+# GFX90A: buffer_load_format_d16_xyz a[6:7], off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x28,0xe0,0x00,0x06,0x82,0xc1]
+0xff,0x0f,0x28,0xe0,0x00,0x06,0x82,0xc1
+
+# GFX90A: buffer_load_format_d16_xyz a[6:7], off, s[8:11], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x28,0xe0,0x00,0x06,0x82,0xf0]
+0xff,0x0f,0x28,0xe0,0x00,0x06,0x82,0xf0
+
+# GFX90A: buffer_load_format_d16_xyz a[6:7], off, s[8:11], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x28,0xe0,0x00,0x06,0x82,0xf7]
+0xff,0x0f,0x28,0xe0,0x00,0x06,0x82,0xf7
+
+# GFX90A: buffer_load_format_d16_xyz a[6:7], v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x28,0xe0,0x00,0x06,0x82,0x03]
+0xff,0x2f,0x28,0xe0,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_load_format_d16_xyz a[6:7], v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x28,0xe0,0x00,0x06,0x82,0x03]
+0xff,0x1f,0x28,0xe0,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_load_format_d16_xyz a[6:7], off, s[8:11], s3 ; encoding: [0x00,0x00,0x28,0xe0,0x00,0x06,0x82,0x03]
+0x00,0x00,0x28,0xe0,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_load_format_d16_xyz a[6:7], off, s[8:11], s3 ; encoding: [0x00,0x00,0x28,0xe0,0x00,0x06,0x82,0x03]
+0x00,0x00,0x28,0xe0,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_load_format_d16_xyz a[6:7], off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x28,0xe0,0x00,0x06,0x82,0x03]
+0x07,0x00,0x28,0xe0,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_load_format_d16_xyz a[6:7], off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x28,0xe0,0x00,0x06,0x82,0x03]
+0xff,0x4f,0x28,0xe0,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_load_format_d16_xyz a[6:7], off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x2a,0xe0,0x00,0x06,0x82,0x03]
+0xff,0x0f,0x2a,0xe0,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_load_format_d16_xyzw a[6:7], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x2c,0xe0,0x00,0x06,0x82,0x03]
+0xff,0x0f,0x2c,0xe0,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_load_format_d16_xyzw a[254:255], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x2c,0xe0,0x00,0xfe,0x82,0x03]
+0xff,0x0f,0x2c,0xe0,0x00,0xfe,0x82,0x03
+
+# GFX90A: buffer_load_format_d16_xyzw a[6:7], off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x2c,0xe0,0x00,0x06,0x83,0x03]
+0xff,0x0f,0x2c,0xe0,0x00,0x06,0x83,0x03
+
+# GFX90A: buffer_load_format_d16_xyzw a[6:7], off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x2c,0xe0,0x00,0x06,0x98,0x03]
+0xff,0x0f,0x2c,0xe0,0x00,0x06,0x98,0x03
+
+# GFX90A: buffer_load_format_d16_xyzw a[6:7], off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x2c,0xe0,0x00,0x06,0x82,0x65]
+0xff,0x0f,0x2c,0xe0,0x00,0x06,0x82,0x65
+
+# GFX90A: buffer_load_format_d16_xyzw a[6:7], off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x2c,0xe0,0x00,0x06,0x82,0x7c]
+0xff,0x0f,0x2c,0xe0,0x00,0x06,0x82,0x7c
+
+# GFX90A: buffer_load_format_d16_xyzw a[6:7], off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x2c,0xe0,0x00,0x06,0x82,0x80]
+0xff,0x0f,0x2c,0xe0,0x00,0x06,0x82,0x80
+
+# GFX90A: buffer_load_format_d16_xyzw a[6:7], off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x2c,0xe0,0x00,0x06,0x82,0xc1]
+0xff,0x0f,0x2c,0xe0,0x00,0x06,0x82,0xc1
+
+# GFX90A: buffer_load_format_d16_xyzw a[6:7], off, s[8:11], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x2c,0xe0,0x00,0x06,0x82,0xf0]
+0xff,0x0f,0x2c,0xe0,0x00,0x06,0x82,0xf0
+
+# GFX90A: buffer_load_format_d16_xyzw a[6:7], off, s[8:11], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x2c,0xe0,0x00,0x06,0x82,0xf7]
+0xff,0x0f,0x2c,0xe0,0x00,0x06,0x82,0xf7
+
+# GFX90A: buffer_load_format_d16_xyzw a[6:7], v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x2c,0xe0,0x00,0x06,0x82,0x03]
+0xff,0x2f,0x2c,0xe0,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_load_format_d16_xyzw a[6:7], v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x2c,0xe0,0x00,0x06,0x82,0x03]
+0xff,0x1f,0x2c,0xe0,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_load_format_d16_xyzw a[6:7], off, s[8:11], s3 ; encoding: [0x00,0x00,0x2c,0xe0,0x00,0x06,0x82,0x03]
+0x00,0x00,0x2c,0xe0,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_load_format_d16_xyzw a[6:7], off, s[8:11], s3 ; encoding: [0x00,0x00,0x2c,0xe0,0x00,0x06,0x82,0x03]
+0x00,0x00,0x2c,0xe0,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_load_format_d16_xyzw a[6:7], off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x2c,0xe0,0x00,0x06,0x82,0x03]
+0x07,0x00,0x2c,0xe0,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_load_format_d16_xyzw a[6:7], off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x2c,0xe0,0x00,0x06,0x82,0x03]
+0xff,0x4f,0x2c,0xe0,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_load_format_d16_xyzw a[6:7], off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x2e,0xe0,0x00,0x06,0x82,0x03]
+0xff,0x0f,0x2e,0xe0,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_store_format_d16_x a1, off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x30,0xe0,0x00,0x01,0x83,0x04]
+0xff,0x0f,0x30,0xe0,0x00,0x01,0x83,0x04
+
+# GFX90A: buffer_store_format_d16_x a255, off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x30,0xe0,0x00,0xff,0x83,0x04]
+0xff,0x0f,0x30,0xe0,0x00,0xff,0x83,0x04
+
+# GFX90A: buffer_store_format_d16_x a1, off, s[16:19], s4 offset:4095 ; encoding: [0xff,0x0f,0x30,0xe0,0x00,0x01,0x84,0x04]
+0xff,0x0f,0x30,0xe0,0x00,0x01,0x84,0x04
+
+# GFX90A: buffer_store_format_d16_x a1, off, s[96:99], s4 offset:4095 ; encoding: [0xff,0x0f,0x30,0xe0,0x00,0x01,0x98,0x04]
+0xff,0x0f,0x30,0xe0,0x00,0x01,0x98,0x04
+
+# GFX90A: buffer_store_format_d16_x a1, off, s[12:15], s101 offset:4095 ; encoding: [0xff,0x0f,0x30,0xe0,0x00,0x01,0x83,0x65]
+0xff,0x0f,0x30,0xe0,0x00,0x01,0x83,0x65
+
+# GFX90A: buffer_store_format_d16_x a1, off, s[12:15], m0 offset:4095 ; encoding: [0xff,0x0f,0x30,0xe0,0x00,0x01,0x83,0x7c]
+0xff,0x0f,0x30,0xe0,0x00,0x01,0x83,0x7c
+
+# GFX90A: buffer_store_format_d16_x a1, off, s[12:15], 0 offset:4095 ; encoding: [0xff,0x0f,0x30,0xe0,0x00,0x01,0x83,0x80]
+0xff,0x0f,0x30,0xe0,0x00,0x01,0x83,0x80
+
+# GFX90A: buffer_store_format_d16_x a1, off, s[12:15], -1 offset:4095 ; encoding: [0xff,0x0f,0x30,0xe0,0x00,0x01,0x83,0xc1]
+0xff,0x0f,0x30,0xe0,0x00,0x01,0x83,0xc1
+
+# GFX90A: buffer_store_format_d16_x a1, off, s[12:15], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x30,0xe0,0x00,0x01,0x83,0xf0]
+0xff,0x0f,0x30,0xe0,0x00,0x01,0x83,0xf0
+
+# GFX90A: buffer_store_format_d16_x a1, off, s[12:15], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x30,0xe0,0x00,0x01,0x83,0xf7]
+0xff,0x0f,0x30,0xe0,0x00,0x01,0x83,0xf7
+
+# GFX90A: buffer_store_format_d16_x a1, v0, s[12:15], s4 idxen offset:4095 ; encoding: [0xff,0x2f,0x30,0xe0,0x00,0x01,0x83,0x04]
+0xff,0x2f,0x30,0xe0,0x00,0x01,0x83,0x04
+
+# GFX90A: buffer_store_format_d16_x a1, v0, s[12:15], s4 offen offset:4095 ; encoding: [0xff,0x1f,0x30,0xe0,0x00,0x01,0x83,0x04]
+0xff,0x1f,0x30,0xe0,0x00,0x01,0x83,0x04
+
+# GFX90A: buffer_store_format_d16_x a1, off, s[12:15], s4 ; encoding: [0x00,0x00,0x30,0xe0,0x00,0x01,0x83,0x04]
+0x00,0x00,0x30,0xe0,0x00,0x01,0x83,0x04
+
+# GFX90A: buffer_store_format_d16_x a1, off, s[12:15], s4 ; encoding: [0x00,0x00,0x30,0xe0,0x00,0x01,0x83,0x04]
+0x00,0x00,0x30,0xe0,0x00,0x01,0x83,0x04
+
+# GFX90A: buffer_store_format_d16_x a1, off, s[12:15], s4 offset:7 ; encoding: [0x07,0x00,0x30,0xe0,0x00,0x01,0x83,0x04]
+0x07,0x00,0x30,0xe0,0x00,0x01,0x83,0x04
+
+# GFX90A: buffer_store_format_d16_x a1, off, s[12:15], s4 offset:4095 glc ; encoding: [0xff,0x4f,0x30,0xe0,0x00,0x01,0x83,0x04]
+0xff,0x4f,0x30,0xe0,0x00,0x01,0x83,0x04
+
+# GFX90A: buffer_store_format_d16_x a1, off, s[12:15], s4 offset:4095 slc ; encoding: [0xff,0x0f,0x32,0xe0,0x00,0x01,0x83,0x04]
+0xff,0x0f,0x32,0xe0,0x00,0x01,0x83,0x04
+
+# GFX90A: buffer_store_format_d16_xy a1, off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x34,0xe0,0x00,0x01,0x83,0x04]
+0xff,0x0f,0x34,0xe0,0x00,0x01,0x83,0x04
+
+# GFX90A: buffer_store_format_d16_xy a255, off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x34,0xe0,0x00,0xff,0x83,0x04]
+0xff,0x0f,0x34,0xe0,0x00,0xff,0x83,0x04
+
+# GFX90A: buffer_store_format_d16_xy a1, off, s[16:19], s4 offset:4095 ; encoding: [0xff,0x0f,0x34,0xe0,0x00,0x01,0x84,0x04]
+0xff,0x0f,0x34,0xe0,0x00,0x01,0x84,0x04
+
+# GFX90A: buffer_store_format_d16_xy a1, off, s[96:99], s4 offset:4095 ; encoding: [0xff,0x0f,0x34,0xe0,0x00,0x01,0x98,0x04]
+0xff,0x0f,0x34,0xe0,0x00,0x01,0x98,0x04
+
+# GFX90A: buffer_store_format_d16_xy a1, off, s[12:15], s101 offset:4095 ; encoding: [0xff,0x0f,0x34,0xe0,0x00,0x01,0x83,0x65]
+0xff,0x0f,0x34,0xe0,0x00,0x01,0x83,0x65
+
+# GFX90A: buffer_store_format_d16_xy a1, off, s[12:15], m0 offset:4095 ; encoding: [0xff,0x0f,0x34,0xe0,0x00,0x01,0x83,0x7c]
+0xff,0x0f,0x34,0xe0,0x00,0x01,0x83,0x7c
+
+# GFX90A: buffer_store_format_d16_xy a1, off, s[12:15], 0 offset:4095 ; encoding: [0xff,0x0f,0x34,0xe0,0x00,0x01,0x83,0x80]
+0xff,0x0f,0x34,0xe0,0x00,0x01,0x83,0x80
+
+# GFX90A: buffer_store_format_d16_xy a1, off, s[12:15], -1 offset:4095 ; encoding: [0xff,0x0f,0x34,0xe0,0x00,0x01,0x83,0xc1]
+0xff,0x0f,0x34,0xe0,0x00,0x01,0x83,0xc1
+
+# GFX90A: buffer_store_format_d16_xy a1, off, s[12:15], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x34,0xe0,0x00,0x01,0x83,0xf0]
+0xff,0x0f,0x34,0xe0,0x00,0x01,0x83,0xf0
+
+# GFX90A: buffer_store_format_d16_xy a1, off, s[12:15], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x34,0xe0,0x00,0x01,0x83,0xf7]
+0xff,0x0f,0x34,0xe0,0x00,0x01,0x83,0xf7
+
+# GFX90A: buffer_store_format_d16_xy a1, v0, s[12:15], s4 idxen offset:4095 ; encoding: [0xff,0x2f,0x34,0xe0,0x00,0x01,0x83,0x04]
+0xff,0x2f,0x34,0xe0,0x00,0x01,0x83,0x04
+
+# GFX90A: buffer_store_format_d16_xy a1, v0, s[12:15], s4 offen offset:4095 ; encoding: [0xff,0x1f,0x34,0xe0,0x00,0x01,0x83,0x04]
+0xff,0x1f,0x34,0xe0,0x00,0x01,0x83,0x04
+
+# GFX90A: buffer_store_format_d16_xy a1, off, s[12:15], s4 ; encoding: [0x00,0x00,0x34,0xe0,0x00,0x01,0x83,0x04]
+0x00,0x00,0x34,0xe0,0x00,0x01,0x83,0x04
+
+# GFX90A: buffer_store_format_d16_xy a1, off, s[12:15], s4 ; encoding: [0x00,0x00,0x34,0xe0,0x00,0x01,0x83,0x04]
+0x00,0x00,0x34,0xe0,0x00,0x01,0x83,0x04
+
+# GFX90A: buffer_store_format_d16_xy a1, off, s[12:15], s4 offset:7 ; encoding: [0x07,0x00,0x34,0xe0,0x00,0x01,0x83,0x04]
+0x07,0x00,0x34,0xe0,0x00,0x01,0x83,0x04
+
+# GFX90A: buffer_store_format_d16_xy a1, off, s[12:15], s4 offset:4095 glc ; encoding: [0xff,0x4f,0x34,0xe0,0x00,0x01,0x83,0x04]
+0xff,0x4f,0x34,0xe0,0x00,0x01,0x83,0x04
+
+# GFX90A: buffer_store_format_d16_xy a1, off, s[12:15], s4 offset:4095 slc ; encoding: [0xff,0x0f,0x36,0xe0,0x00,0x01,0x83,0x04]
+0xff,0x0f,0x36,0xe0,0x00,0x01,0x83,0x04
+
+# GFX90A: buffer_store_format_d16_xyz a[2:3], off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x38,0xe0,0x00,0x02,0x83,0x04]
+0xff,0x0f,0x38,0xe0,0x00,0x02,0x83,0x04
+
+# GFX90A: buffer_store_format_d16_xyz a[254:255], off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x38,0xe0,0x00,0xfe,0x83,0x04]
+0xff,0x0f,0x38,0xe0,0x00,0xfe,0x83,0x04
+
+# GFX90A: buffer_store_format_d16_xyz a[2:3], off, s[16:19], s4 offset:4095 ; encoding: [0xff,0x0f,0x38,0xe0,0x00,0x02,0x84,0x04]
+0xff,0x0f,0x38,0xe0,0x00,0x02,0x84,0x04
+
+# GFX90A: buffer_store_format_d16_xyz a[2:3], off, s[96:99], s4 offset:4095 ; encoding: [0xff,0x0f,0x38,0xe0,0x00,0x02,0x98,0x04]
+0xff,0x0f,0x38,0xe0,0x00,0x02,0x98,0x04
+
+# GFX90A: buffer_store_format_d16_xyz a[2:3], off, s[12:15], s101 offset:4095 ; encoding: [0xff,0x0f,0x38,0xe0,0x00,0x02,0x83,0x65]
+0xff,0x0f,0x38,0xe0,0x00,0x02,0x83,0x65
+
+# GFX90A: buffer_store_format_d16_xyz a[2:3], off, s[12:15], m0 offset:4095 ; encoding: [0xff,0x0f,0x38,0xe0,0x00,0x02,0x83,0x7c]
+0xff,0x0f,0x38,0xe0,0x00,0x02,0x83,0x7c
+
+# GFX90A: buffer_store_format_d16_xyz a[2:3], off, s[12:15], 0 offset:4095 ; encoding: [0xff,0x0f,0x38,0xe0,0x00,0x02,0x83,0x80]
+0xff,0x0f,0x38,0xe0,0x00,0x02,0x83,0x80
+
+# GFX90A: buffer_store_format_d16_xyz a[2:3], off, s[12:15], -1 offset:4095 ; encoding: [0xff,0x0f,0x38,0xe0,0x00,0x02,0x83,0xc1]
+0xff,0x0f,0x38,0xe0,0x00,0x02,0x83,0xc1
+
+# GFX90A: buffer_store_format_d16_xyz a[2:3], off, s[12:15], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x38,0xe0,0x00,0x02,0x83,0xf0]
+0xff,0x0f,0x38,0xe0,0x00,0x02,0x83,0xf0
+
+# GFX90A: buffer_store_format_d16_xyz a[2:3], off, s[12:15], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x38,0xe0,0x00,0x02,0x83,0xf7]
+0xff,0x0f,0x38,0xe0,0x00,0x02,0x83,0xf7
+
+# GFX90A: buffer_store_format_d16_xyz a[2:3], v0, s[12:15], s4 idxen offset:4095 ; encoding: [0xff,0x2f,0x38,0xe0,0x00,0x02,0x83,0x04]
+0xff,0x2f,0x38,0xe0,0x00,0x02,0x83,0x04
+
+# GFX90A: buffer_store_format_d16_xyz a[2:3], v0, s[12:15], s4 offen offset:4095 ; encoding: [0xff,0x1f,0x38,0xe0,0x00,0x02,0x83,0x04]
+0xff,0x1f,0x38,0xe0,0x00,0x02,0x83,0x04
+
+# GFX90A: buffer_store_format_d16_xyz a[2:3], off, s[12:15], s4 ; encoding: [0x00,0x00,0x38,0xe0,0x00,0x02,0x83,0x04]
+0x00,0x00,0x38,0xe0,0x00,0x02,0x83,0x04
+
+# GFX90A: buffer_store_format_d16_xyz a[2:3], off, s[12:15], s4 ; encoding: [0x00,0x00,0x38,0xe0,0x00,0x02,0x83,0x04]
+0x00,0x00,0x38,0xe0,0x00,0x02,0x83,0x04
+
+# GFX90A: buffer_store_format_d16_xyz a[2:3], off, s[12:15], s4 offset:7 ; encoding: [0x07,0x00,0x38,0xe0,0x00,0x02,0x83,0x04]
+0x07,0x00,0x38,0xe0,0x00,0x02,0x83,0x04
+
+# GFX90A: buffer_store_format_d16_xyz a[2:3], off, s[12:15], s4 offset:4095 glc ; encoding: [0xff,0x4f,0x38,0xe0,0x00,0x02,0x83,0x04]
+0xff,0x4f,0x38,0xe0,0x00,0x02,0x83,0x04
+
+# GFX90A: buffer_store_format_d16_xyz a[2:3], off, s[12:15], s4 offset:4095 slc ; encoding: [0xff,0x0f,0x3a,0xe0,0x00,0x02,0x83,0x04]
+0xff,0x0f,0x3a,0xe0,0x00,0x02,0x83,0x04
+
+# GFX90A: buffer_store_format_d16_xyzw a[2:3], off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x3c,0xe0,0x00,0x02,0x83,0x04]
+0xff,0x0f,0x3c,0xe0,0x00,0x02,0x83,0x04
+
+# GFX90A: buffer_store_format_d16_xyzw a[254:255], off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x3c,0xe0,0x00,0xfe,0x83,0x04]
+0xff,0x0f,0x3c,0xe0,0x00,0xfe,0x83,0x04
+
+# GFX90A: buffer_store_format_d16_xyzw a[2:3], off, s[16:19], s4 offset:4095 ; encoding: [0xff,0x0f,0x3c,0xe0,0x00,0x02,0x84,0x04]
+0xff,0x0f,0x3c,0xe0,0x00,0x02,0x84,0x04
+
+# GFX90A: buffer_store_format_d16_xyzw a[2:3], off, s[96:99], s4 offset:4095 ; encoding: [0xff,0x0f,0x3c,0xe0,0x00,0x02,0x98,0x04]
+0xff,0x0f,0x3c,0xe0,0x00,0x02,0x98,0x04
+
+# GFX90A: buffer_store_format_d16_xyzw a[2:3], off, s[12:15], s101 offset:4095 ; encoding: [0xff,0x0f,0x3c,0xe0,0x00,0x02,0x83,0x65]
+0xff,0x0f,0x3c,0xe0,0x00,0x02,0x83,0x65
+
+# GFX90A: buffer_store_format_d16_xyzw a[2:3], off, s[12:15], m0 offset:4095 ; encoding: [0xff,0x0f,0x3c,0xe0,0x00,0x02,0x83,0x7c]
+0xff,0x0f,0x3c,0xe0,0x00,0x02,0x83,0x7c
+
+# GFX90A: buffer_store_format_d16_xyzw a[2:3], off, s[12:15], 0 offset:4095 ; encoding: [0xff,0x0f,0x3c,0xe0,0x00,0x02,0x83,0x80]
+0xff,0x0f,0x3c,0xe0,0x00,0x02,0x83,0x80
+
+# GFX90A: buffer_store_format_d16_xyzw a[2:3], off, s[12:15], -1 offset:4095 ; encoding: [0xff,0x0f,0x3c,0xe0,0x00,0x02,0x83,0xc1]
+0xff,0x0f,0x3c,0xe0,0x00,0x02,0x83,0xc1
+
+# GFX90A: buffer_store_format_d16_xyzw a[2:3], off, s[12:15], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x3c,0xe0,0x00,0x02,0x83,0xf0]
+0xff,0x0f,0x3c,0xe0,0x00,0x02,0x83,0xf0
+
+# GFX90A: buffer_store_format_d16_xyzw a[2:3], off, s[12:15], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x3c,0xe0,0x00,0x02,0x83,0xf7]
+0xff,0x0f,0x3c,0xe0,0x00,0x02,0x83,0xf7
+
+# GFX90A: buffer_store_format_d16_xyzw a[2:3], v0, s[12:15], s4 idxen offset:4095 ; encoding: [0xff,0x2f,0x3c,0xe0,0x00,0x02,0x83,0x04]
+0xff,0x2f,0x3c,0xe0,0x00,0x02,0x83,0x04
+
+# GFX90A: buffer_store_format_d16_xyzw a[2:3], v0, s[12:15], s4 offen offset:4095 ; encoding: [0xff,0x1f,0x3c,0xe0,0x00,0x02,0x83,0x04]
+0xff,0x1f,0x3c,0xe0,0x00,0x02,0x83,0x04
+
+# GFX90A: buffer_store_format_d16_xyzw a[2:3], off, s[12:15], s4 ; encoding: [0x00,0x00,0x3c,0xe0,0x00,0x02,0x83,0x04]
+0x00,0x00,0x3c,0xe0,0x00,0x02,0x83,0x04
+
+# GFX90A: buffer_store_format_d16_xyzw a[2:3], off, s[12:15], s4 ; encoding: [0x00,0x00,0x3c,0xe0,0x00,0x02,0x83,0x04]
+0x00,0x00,0x3c,0xe0,0x00,0x02,0x83,0x04
+
+# GFX90A: buffer_store_format_d16_xyzw a[2:3], off, s[12:15], s4 offset:7 ; encoding: [0x07,0x00,0x3c,0xe0,0x00,0x02,0x83,0x04]
+0x07,0x00,0x3c,0xe0,0x00,0x02,0x83,0x04
+
+# GFX90A: buffer_store_format_d16_xyzw a[2:3], off, s[12:15], s4 offset:4095 glc ; encoding: [0xff,0x4f,0x3c,0xe0,0x00,0x02,0x83,0x04]
+0xff,0x4f,0x3c,0xe0,0x00,0x02,0x83,0x04
+
+# GFX90A: buffer_store_format_d16_xyzw a[2:3], off, s[12:15], s4 offset:4095 slc ; encoding: [0xff,0x0f,0x3e,0xe0,0x00,0x02,0x83,0x04]
+0xff,0x0f,0x3e,0xe0,0x00,0x02,0x83,0x04
+
+# GFX90A: buffer_load_ubyte a5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x40,0xe0,0x00,0x05,0x82,0x03]
+0xff,0x0f,0x40,0xe0,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_load_ubyte a255, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x40,0xe0,0x00,0xff,0x82,0x03]
+0xff,0x0f,0x40,0xe0,0x00,0xff,0x82,0x03
+
+# GFX90A: buffer_load_ubyte a5, off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x40,0xe0,0x00,0x05,0x83,0x03]
+0xff,0x0f,0x40,0xe0,0x00,0x05,0x83,0x03
+
+# GFX90A: buffer_load_ubyte a5, off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x40,0xe0,0x00,0x05,0x98,0x03]
+0xff,0x0f,0x40,0xe0,0x00,0x05,0x98,0x03
+
+# GFX90A: buffer_load_ubyte a5, off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x40,0xe0,0x00,0x05,0x82,0x65]
+0xff,0x0f,0x40,0xe0,0x00,0x05,0x82,0x65
+
+# GFX90A: buffer_load_ubyte a5, off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x40,0xe0,0x00,0x05,0x82,0x7c]
+0xff,0x0f,0x40,0xe0,0x00,0x05,0x82,0x7c
+
+# GFX90A: buffer_load_ubyte a5, off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x40,0xe0,0x00,0x05,0x82,0x80]
+0xff,0x0f,0x40,0xe0,0x00,0x05,0x82,0x80
+
+# GFX90A: buffer_load_ubyte a5, off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x40,0xe0,0x00,0x05,0x82,0xc1]
+0xff,0x0f,0x40,0xe0,0x00,0x05,0x82,0xc1
+
+# GFX90A: buffer_load_ubyte a5, off, s[8:11], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x40,0xe0,0x00,0x05,0x82,0xf0]
+0xff,0x0f,0x40,0xe0,0x00,0x05,0x82,0xf0
+
+# GFX90A: buffer_load_ubyte a5, off, s[8:11], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x40,0xe0,0x00,0x05,0x82,0xf7]
+0xff,0x0f,0x40,0xe0,0x00,0x05,0x82,0xf7
+
+# GFX90A: buffer_load_ubyte a5, v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x40,0xe0,0x00,0x05,0x82,0x03]
+0xff,0x2f,0x40,0xe0,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_load_ubyte a5, v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x40,0xe0,0x00,0x05,0x82,0x03]
+0xff,0x1f,0x40,0xe0,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_load_ubyte a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x40,0xe0,0x00,0x05,0x82,0x03]
+0x00,0x00,0x40,0xe0,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_load_ubyte a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x40,0xe0,0x00,0x05,0x82,0x03]
+0x00,0x00,0x40,0xe0,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_load_ubyte a5, off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x40,0xe0,0x00,0x05,0x82,0x03]
+0x07,0x00,0x40,0xe0,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_load_ubyte a5, off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x40,0xe0,0x00,0x05,0x82,0x03]
+0xff,0x4f,0x40,0xe0,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_load_ubyte a5, off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x42,0xe0,0x00,0x05,0x82,0x03]
+0xff,0x0f,0x42,0xe0,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_load_ubyte a5, off, s[8:11], s3 offset:4095 lds ; encoding: [0xff,0x0f,0x41,0xe0,0x00,0x05,0x82,0x03]
+0xff,0x0f,0x41,0xe0,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_load_sbyte a5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x44,0xe0,0x00,0x05,0x82,0x03]
+0xff,0x0f,0x44,0xe0,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_load_sbyte a255, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x44,0xe0,0x00,0xff,0x82,0x03]
+0xff,0x0f,0x44,0xe0,0x00,0xff,0x82,0x03
+
+# GFX90A: buffer_load_sbyte a5, off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x44,0xe0,0x00,0x05,0x83,0x03]
+0xff,0x0f,0x44,0xe0,0x00,0x05,0x83,0x03
+
+# GFX90A: buffer_load_sbyte a5, off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x44,0xe0,0x00,0x05,0x98,0x03]
+0xff,0x0f,0x44,0xe0,0x00,0x05,0x98,0x03
+
+# GFX90A: buffer_load_sbyte a5, off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x44,0xe0,0x00,0x05,0x82,0x65]
+0xff,0x0f,0x44,0xe0,0x00,0x05,0x82,0x65
+
+# GFX90A: buffer_load_sbyte a5, off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x44,0xe0,0x00,0x05,0x82,0x7c]
+0xff,0x0f,0x44,0xe0,0x00,0x05,0x82,0x7c
+
+# GFX90A: buffer_load_sbyte a5, off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x44,0xe0,0x00,0x05,0x82,0x80]
+0xff,0x0f,0x44,0xe0,0x00,0x05,0x82,0x80
+
+# GFX90A: buffer_load_sbyte a5, off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x44,0xe0,0x00,0x05,0x82,0xc1]
+0xff,0x0f,0x44,0xe0,0x00,0x05,0x82,0xc1
+
+# GFX90A: buffer_load_sbyte a5, off, s[8:11], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x44,0xe0,0x00,0x05,0x82,0xf0]
+0xff,0x0f,0x44,0xe0,0x00,0x05,0x82,0xf0
+
+# GFX90A: buffer_load_sbyte a5, off, s[8:11], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x44,0xe0,0x00,0x05,0x82,0xf7]
+0xff,0x0f,0x44,0xe0,0x00,0x05,0x82,0xf7
+
+# GFX90A: buffer_load_sbyte a5, v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x44,0xe0,0x00,0x05,0x82,0x03]
+0xff,0x2f,0x44,0xe0,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_load_sbyte a5, v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x44,0xe0,0x00,0x05,0x82,0x03]
+0xff,0x1f,0x44,0xe0,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_load_sbyte a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x44,0xe0,0x00,0x05,0x82,0x03]
+0x00,0x00,0x44,0xe0,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_load_sbyte a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x44,0xe0,0x00,0x05,0x82,0x03]
+0x00,0x00,0x44,0xe0,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_load_sbyte a5, off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x44,0xe0,0x00,0x05,0x82,0x03]
+0x07,0x00,0x44,0xe0,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_load_sbyte a5, off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x44,0xe0,0x00,0x05,0x82,0x03]
+0xff,0x4f,0x44,0xe0,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_load_sbyte a5, off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x46,0xe0,0x00,0x05,0x82,0x03]
+0xff,0x0f,0x46,0xe0,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_load_sbyte a5, off, s[8:11], s3 offset:4095 lds ; encoding: [0xff,0x0f,0x45,0xe0,0x00,0x05,0x82,0x03]
+0xff,0x0f,0x45,0xe0,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_load_ushort a5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x48,0xe0,0x00,0x05,0x82,0x03]
+0xff,0x0f,0x48,0xe0,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_load_ushort a255, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x48,0xe0,0x00,0xff,0x82,0x03]
+0xff,0x0f,0x48,0xe0,0x00,0xff,0x82,0x03
+
+# GFX90A: buffer_load_ushort a5, off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x48,0xe0,0x00,0x05,0x83,0x03]
+0xff,0x0f,0x48,0xe0,0x00,0x05,0x83,0x03
+
+# GFX90A: buffer_load_ushort a5, off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x48,0xe0,0x00,0x05,0x98,0x03]
+0xff,0x0f,0x48,0xe0,0x00,0x05,0x98,0x03
+
+# GFX90A: buffer_load_ushort a5, off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x48,0xe0,0x00,0x05,0x82,0x65]
+0xff,0x0f,0x48,0xe0,0x00,0x05,0x82,0x65
+
+# GFX90A: buffer_load_ushort a5, off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x48,0xe0,0x00,0x05,0x82,0x7c]
+0xff,0x0f,0x48,0xe0,0x00,0x05,0x82,0x7c
+
+# GFX90A: buffer_load_ushort a5, off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x48,0xe0,0x00,0x05,0x82,0x80]
+0xff,0x0f,0x48,0xe0,0x00,0x05,0x82,0x80
+
+# GFX90A: buffer_load_ushort a5, off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x48,0xe0,0x00,0x05,0x82,0xc1]
+0xff,0x0f,0x48,0xe0,0x00,0x05,0x82,0xc1
+
+# GFX90A: buffer_load_ushort a5, off, s[8:11], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x48,0xe0,0x00,0x05,0x82,0xf0]
+0xff,0x0f,0x48,0xe0,0x00,0x05,0x82,0xf0
+
+# GFX90A: buffer_load_ushort a5, off, s[8:11], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x48,0xe0,0x00,0x05,0x82,0xf7]
+0xff,0x0f,0x48,0xe0,0x00,0x05,0x82,0xf7
+
+# GFX90A: buffer_load_ushort a5, v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x48,0xe0,0x00,0x05,0x82,0x03]
+0xff,0x2f,0x48,0xe0,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_load_ushort a5, v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x48,0xe0,0x00,0x05,0x82,0x03]
+0xff,0x1f,0x48,0xe0,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_load_ushort a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x48,0xe0,0x00,0x05,0x82,0x03]
+0x00,0x00,0x48,0xe0,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_load_ushort a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x48,0xe0,0x00,0x05,0x82,0x03]
+0x00,0x00,0x48,0xe0,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_load_ushort a5, off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x48,0xe0,0x00,0x05,0x82,0x03]
+0x07,0x00,0x48,0xe0,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_load_ushort a5, off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x48,0xe0,0x00,0x05,0x82,0x03]
+0xff,0x4f,0x48,0xe0,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_load_ushort a5, off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x4a,0xe0,0x00,0x05,0x82,0x03]
+0xff,0x0f,0x4a,0xe0,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_load_ushort a5, off, s[8:11], s3 offset:4095 lds ; encoding: [0xff,0x0f,0x49,0xe0,0x00,0x05,0x82,0x03]
+0xff,0x0f,0x49,0xe0,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_load_sshort a5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x4c,0xe0,0x00,0x05,0x82,0x03]
+0xff,0x0f,0x4c,0xe0,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_load_sshort a255, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x4c,0xe0,0x00,0xff,0x82,0x03]
+0xff,0x0f,0x4c,0xe0,0x00,0xff,0x82,0x03
+
+# GFX90A: buffer_load_sshort a5, off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x4c,0xe0,0x00,0x05,0x83,0x03]
+0xff,0x0f,0x4c,0xe0,0x00,0x05,0x83,0x03
+
+# GFX90A: buffer_load_sshort a5, off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x4c,0xe0,0x00,0x05,0x98,0x03]
+0xff,0x0f,0x4c,0xe0,0x00,0x05,0x98,0x03
+
+# GFX90A: buffer_load_sshort a5, off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x4c,0xe0,0x00,0x05,0x82,0x65]
+0xff,0x0f,0x4c,0xe0,0x00,0x05,0x82,0x65
+
+# GFX90A: buffer_load_sshort a5, off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x4c,0xe0,0x00,0x05,0x82,0x7c]
+0xff,0x0f,0x4c,0xe0,0x00,0x05,0x82,0x7c
+
+# GFX90A: buffer_load_sshort a5, off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x4c,0xe0,0x00,0x05,0x82,0x80]
+0xff,0x0f,0x4c,0xe0,0x00,0x05,0x82,0x80
+
+# GFX90A: buffer_load_sshort a5, off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x4c,0xe0,0x00,0x05,0x82,0xc1]
+0xff,0x0f,0x4c,0xe0,0x00,0x05,0x82,0xc1
+
+# GFX90A: buffer_load_sshort a5, off, s[8:11], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x4c,0xe0,0x00,0x05,0x82,0xf0]
+0xff,0x0f,0x4c,0xe0,0x00,0x05,0x82,0xf0
+
+# GFX90A: buffer_load_sshort a5, off, s[8:11], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x4c,0xe0,0x00,0x05,0x82,0xf7]
+0xff,0x0f,0x4c,0xe0,0x00,0x05,0x82,0xf7
+
+# GFX90A: buffer_load_sshort a5, v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x4c,0xe0,0x00,0x05,0x82,0x03]
+0xff,0x2f,0x4c,0xe0,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_load_sshort a5, v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x4c,0xe0,0x00,0x05,0x82,0x03]
+0xff,0x1f,0x4c,0xe0,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_load_sshort a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x4c,0xe0,0x00,0x05,0x82,0x03]
+0x00,0x00,0x4c,0xe0,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_load_sshort a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x4c,0xe0,0x00,0x05,0x82,0x03]
+0x00,0x00,0x4c,0xe0,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_load_sshort a5, off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x4c,0xe0,0x00,0x05,0x82,0x03]
+0x07,0x00,0x4c,0xe0,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_load_sshort a5, off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x4c,0xe0,0x00,0x05,0x82,0x03]
+0xff,0x4f,0x4c,0xe0,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_load_sshort a5, off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x4e,0xe0,0x00,0x05,0x82,0x03]
+0xff,0x0f,0x4e,0xe0,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_load_sshort a5, off, s[8:11], s3 offset:4095 lds ; encoding: [0xff,0x0f,0x4d,0xe0,0x00,0x05,0x82,0x03]
+0xff,0x0f,0x4d,0xe0,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_load_dword a5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x50,0xe0,0x00,0x05,0x82,0x03]
+0xff,0x0f,0x50,0xe0,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_load_dword a255, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x50,0xe0,0x00,0xff,0x82,0x03]
+0xff,0x0f,0x50,0xe0,0x00,0xff,0x82,0x03
+
+# GFX90A: buffer_load_dword a5, off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x50,0xe0,0x00,0x05,0x83,0x03]
+0xff,0x0f,0x50,0xe0,0x00,0x05,0x83,0x03
+
+# GFX90A: buffer_load_dword a5, off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x50,0xe0,0x00,0x05,0x98,0x03]
+0xff,0x0f,0x50,0xe0,0x00,0x05,0x98,0x03
+
+# GFX90A: buffer_load_dword a5, off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x50,0xe0,0x00,0x05,0x82,0x65]
+0xff,0x0f,0x50,0xe0,0x00,0x05,0x82,0x65
+
+# GFX90A: buffer_load_dword a5, off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x50,0xe0,0x00,0x05,0x82,0x7c]
+0xff,0x0f,0x50,0xe0,0x00,0x05,0x82,0x7c
+
+# GFX90A: buffer_load_dword a5, off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x50,0xe0,0x00,0x05,0x82,0x80]
+0xff,0x0f,0x50,0xe0,0x00,0x05,0x82,0x80
+
+# GFX90A: buffer_load_dword a5, off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x50,0xe0,0x00,0x05,0x82,0xc1]
+0xff,0x0f,0x50,0xe0,0x00,0x05,0x82,0xc1
+
+# GFX90A: buffer_load_dword a5, off, s[8:11], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x50,0xe0,0x00,0x05,0x82,0xf0]
+0xff,0x0f,0x50,0xe0,0x00,0x05,0x82,0xf0
+
+# GFX90A: buffer_load_dword a5, off, s[8:11], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x50,0xe0,0x00,0x05,0x82,0xf7]
+0xff,0x0f,0x50,0xe0,0x00,0x05,0x82,0xf7
+
+# GFX90A: buffer_load_dword a5, v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x50,0xe0,0x00,0x05,0x82,0x03]
+0xff,0x2f,0x50,0xe0,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_load_dword a5, v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x50,0xe0,0x00,0x05,0x82,0x03]
+0xff,0x1f,0x50,0xe0,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_load_dword a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x50,0xe0,0x00,0x05,0x82,0x03]
+0x00,0x00,0x50,0xe0,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_load_dword a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x50,0xe0,0x00,0x05,0x82,0x03]
+0x00,0x00,0x50,0xe0,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_load_dword a5, off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x50,0xe0,0x00,0x05,0x82,0x03]
+0x07,0x00,0x50,0xe0,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_load_dword a5, off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x50,0xe0,0x00,0x05,0x82,0x03]
+0xff,0x4f,0x50,0xe0,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_load_dword a5, off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x52,0xe0,0x00,0x05,0x82,0x03]
+0xff,0x0f,0x52,0xe0,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_load_dword a5, off, s[8:11], s3 offset:4095 lds ; encoding: [0xff,0x0f,0x51,0xe0,0x00,0x05,0x82,0x03]
+0xff,0x0f,0x51,0xe0,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_load_dwordx2 a[6:7], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x54,0xe0,0x00,0x06,0x82,0x03]
+0xff,0x0f,0x54,0xe0,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_load_dwordx2 a[254:255], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x54,0xe0,0x00,0xfe,0x82,0x03]
+0xff,0x0f,0x54,0xe0,0x00,0xfe,0x82,0x03
+
+# GFX90A: buffer_load_dwordx2 a[6:7], off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x54,0xe0,0x00,0x06,0x83,0x03]
+0xff,0x0f,0x54,0xe0,0x00,0x06,0x83,0x03
+
+# GFX90A: buffer_load_dwordx2 a[6:7], off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x54,0xe0,0x00,0x06,0x98,0x03]
+0xff,0x0f,0x54,0xe0,0x00,0x06,0x98,0x03
+
+# GFX90A: buffer_load_dwordx2 a[6:7], off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x54,0xe0,0x00,0x06,0x82,0x65]
+0xff,0x0f,0x54,0xe0,0x00,0x06,0x82,0x65
+
+# GFX90A: buffer_load_dwordx2 a[6:7], off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x54,0xe0,0x00,0x06,0x82,0x7c]
+0xff,0x0f,0x54,0xe0,0x00,0x06,0x82,0x7c
+
+# GFX90A: buffer_load_dwordx2 a[6:7], off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x54,0xe0,0x00,0x06,0x82,0x80]
+0xff,0x0f,0x54,0xe0,0x00,0x06,0x82,0x80
+
+# GFX90A: buffer_load_dwordx2 a[6:7], off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x54,0xe0,0x00,0x06,0x82,0xc1]
+0xff,0x0f,0x54,0xe0,0x00,0x06,0x82,0xc1
+
+# GFX90A: buffer_load_dwordx2 a[6:7], off, s[8:11], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x54,0xe0,0x00,0x06,0x82,0xf0]
+0xff,0x0f,0x54,0xe0,0x00,0x06,0x82,0xf0
+
+# GFX90A: buffer_load_dwordx2 a[6:7], off, s[8:11], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x54,0xe0,0x00,0x06,0x82,0xf7]
+0xff,0x0f,0x54,0xe0,0x00,0x06,0x82,0xf7
+
+# GFX90A: buffer_load_dwordx2 a[6:7], v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x54,0xe0,0x00,0x06,0x82,0x03]
+0xff,0x2f,0x54,0xe0,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_load_dwordx2 a[6:7], v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x54,0xe0,0x00,0x06,0x82,0x03]
+0xff,0x1f,0x54,0xe0,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_load_dwordx2 a[6:7], off, s[8:11], s3 ; encoding: [0x00,0x00,0x54,0xe0,0x00,0x06,0x82,0x03]
+0x00,0x00,0x54,0xe0,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_load_dwordx2 a[6:7], off, s[8:11], s3 ; encoding: [0x00,0x00,0x54,0xe0,0x00,0x06,0x82,0x03]
+0x00,0x00,0x54,0xe0,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_load_dwordx2 a[6:7], off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x54,0xe0,0x00,0x06,0x82,0x03]
+0x07,0x00,0x54,0xe0,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_load_dwordx2 a[6:7], off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x54,0xe0,0x00,0x06,0x82,0x03]
+0xff,0x4f,0x54,0xe0,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_load_dwordx2 a[6:7], off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x56,0xe0,0x00,0x06,0x82,0x03]
+0xff,0x0f,0x56,0xe0,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_load_dwordx3 a[6:8], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x58,0xe0,0x00,0x06,0x82,0x03]
+0xff,0x0f,0x58,0xe0,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_load_dwordx3 a[252:254], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x58,0xe0,0x00,0xfc,0x82,0x03]
+0xff,0x0f,0x58,0xe0,0x00,0xfc,0x82,0x03
+
+# GFX90A: buffer_load_dwordx3 a[6:8], off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x58,0xe0,0x00,0x06,0x83,0x03]
+0xff,0x0f,0x58,0xe0,0x00,0x06,0x83,0x03
+
+# GFX90A: buffer_load_dwordx3 a[6:8], off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x58,0xe0,0x00,0x06,0x98,0x03]
+0xff,0x0f,0x58,0xe0,0x00,0x06,0x98,0x03
+
+# GFX90A: buffer_load_dwordx3 a[6:8], off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x58,0xe0,0x00,0x06,0x82,0x65]
+0xff,0x0f,0x58,0xe0,0x00,0x06,0x82,0x65
+
+# GFX90A: buffer_load_dwordx3 a[6:8], off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x58,0xe0,0x00,0x06,0x82,0x7c]
+0xff,0x0f,0x58,0xe0,0x00,0x06,0x82,0x7c
+
+# GFX90A: buffer_load_dwordx3 a[6:8], off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x58,0xe0,0x00,0x06,0x82,0x80]
+0xff,0x0f,0x58,0xe0,0x00,0x06,0x82,0x80
+
+# GFX90A: buffer_load_dwordx3 a[6:8], off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x58,0xe0,0x00,0x06,0x82,0xc1]
+0xff,0x0f,0x58,0xe0,0x00,0x06,0x82,0xc1
+
+# GFX90A: buffer_load_dwordx3 a[6:8], off, s[8:11], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x58,0xe0,0x00,0x06,0x82,0xf0]
+0xff,0x0f,0x58,0xe0,0x00,0x06,0x82,0xf0
+
+# GFX90A: buffer_load_dwordx3 a[6:8], off, s[8:11], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x58,0xe0,0x00,0x06,0x82,0xf7]
+0xff,0x0f,0x58,0xe0,0x00,0x06,0x82,0xf7
+
+# GFX90A: buffer_load_dwordx3 a[6:8], v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x58,0xe0,0x00,0x06,0x82,0x03]
+0xff,0x2f,0x58,0xe0,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_load_dwordx3 a[6:8], v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x58,0xe0,0x00,0x06,0x82,0x03]
+0xff,0x1f,0x58,0xe0,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_load_dwordx3 a[6:8], off, s[8:11], s3 ; encoding: [0x00,0x00,0x58,0xe0,0x00,0x06,0x82,0x03]
+0x00,0x00,0x58,0xe0,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_load_dwordx3 a[6:8], off, s[8:11], s3 ; encoding: [0x00,0x00,0x58,0xe0,0x00,0x06,0x82,0x03]
+0x00,0x00,0x58,0xe0,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_load_dwordx3 a[6:8], off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x58,0xe0,0x00,0x06,0x82,0x03]
+0x07,0x00,0x58,0xe0,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_load_dwordx3 a[6:8], off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x58,0xe0,0x00,0x06,0x82,0x03]
+0xff,0x4f,0x58,0xe0,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_load_dwordx3 a[6:8], off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x5a,0xe0,0x00,0x06,0x82,0x03]
+0xff,0x0f,0x5a,0xe0,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_load_dwordx4 a[6:9], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x5c,0xe0,0x00,0x06,0x82,0x03]
+0xff,0x0f,0x5c,0xe0,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_load_dwordx4 a[252:255], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x5c,0xe0,0x00,0xfc,0x82,0x03]
+0xff,0x0f,0x5c,0xe0,0x00,0xfc,0x82,0x03
+
+# GFX90A: buffer_load_dwordx4 a[6:9], off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x5c,0xe0,0x00,0x06,0x83,0x03]
+0xff,0x0f,0x5c,0xe0,0x00,0x06,0x83,0x03
+
+# GFX90A: buffer_load_dwordx4 a[6:9], off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x5c,0xe0,0x00,0x06,0x98,0x03]
+0xff,0x0f,0x5c,0xe0,0x00,0x06,0x98,0x03
+
+# GFX90A: buffer_load_dwordx4 a[6:9], off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x5c,0xe0,0x00,0x06,0x82,0x65]
+0xff,0x0f,0x5c,0xe0,0x00,0x06,0x82,0x65
+
+# GFX90A: buffer_load_dwordx4 a[6:9], off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x5c,0xe0,0x00,0x06,0x82,0x7c]
+0xff,0x0f,0x5c,0xe0,0x00,0x06,0x82,0x7c
+
+# GFX90A: buffer_load_dwordx4 a[6:9], off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x5c,0xe0,0x00,0x06,0x82,0x80]
+0xff,0x0f,0x5c,0xe0,0x00,0x06,0x82,0x80
+
+# GFX90A: buffer_load_dwordx4 a[6:9], off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x5c,0xe0,0x00,0x06,0x82,0xc1]
+0xff,0x0f,0x5c,0xe0,0x00,0x06,0x82,0xc1
+
+# GFX90A: buffer_load_dwordx4 a[6:9], off, s[8:11], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x5c,0xe0,0x00,0x06,0x82,0xf0]
+0xff,0x0f,0x5c,0xe0,0x00,0x06,0x82,0xf0
+
+# GFX90A: buffer_load_dwordx4 a[6:9], off, s[8:11], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x5c,0xe0,0x00,0x06,0x82,0xf7]
+0xff,0x0f,0x5c,0xe0,0x00,0x06,0x82,0xf7
+
+# GFX90A: buffer_load_dwordx4 a[6:9], v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x5c,0xe0,0x00,0x06,0x82,0x03]
+0xff,0x2f,0x5c,0xe0,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_load_dwordx4 a[6:9], v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x5c,0xe0,0x00,0x06,0x82,0x03]
+0xff,0x1f,0x5c,0xe0,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_load_dwordx4 a[6:9], off, s[8:11], s3 ; encoding: [0x00,0x00,0x5c,0xe0,0x00,0x06,0x82,0x03]
+0x00,0x00,0x5c,0xe0,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_load_dwordx4 a[6:9], off, s[8:11], s3 ; encoding: [0x00,0x00,0x5c,0xe0,0x00,0x06,0x82,0x03]
+0x00,0x00,0x5c,0xe0,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_load_dwordx4 a[6:9], off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x5c,0xe0,0x00,0x06,0x82,0x03]
+0x07,0x00,0x5c,0xe0,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_load_dwordx4 a[6:9], off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x5c,0xe0,0x00,0x06,0x82,0x03]
+0xff,0x4f,0x5c,0xe0,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_load_dwordx4 a[6:9], off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x5e,0xe0,0x00,0x06,0x82,0x03]
+0xff,0x0f,0x5e,0xe0,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_store_byte a1, off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x60,0xe0,0x00,0x01,0x83,0x04]
+0xff,0x0f,0x60,0xe0,0x00,0x01,0x83,0x04
+
+# GFX90A: buffer_store_byte a255, off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x60,0xe0,0x00,0xff,0x83,0x04]
+0xff,0x0f,0x60,0xe0,0x00,0xff,0x83,0x04
+
+# GFX90A: buffer_store_byte a1, off, s[16:19], s4 offset:4095 ; encoding: [0xff,0x0f,0x60,0xe0,0x00,0x01,0x84,0x04]
+0xff,0x0f,0x60,0xe0,0x00,0x01,0x84,0x04
+
+# GFX90A: buffer_store_byte a1, off, s[96:99], s4 offset:4095 ; encoding: [0xff,0x0f,0x60,0xe0,0x00,0x01,0x98,0x04]
+0xff,0x0f,0x60,0xe0,0x00,0x01,0x98,0x04
+
+# GFX90A: buffer_store_byte a1, off, s[12:15], s101 offset:4095 ; encoding: [0xff,0x0f,0x60,0xe0,0x00,0x01,0x83,0x65]
+0xff,0x0f,0x60,0xe0,0x00,0x01,0x83,0x65
+
+# GFX90A: buffer_store_byte a1, off, s[12:15], m0 offset:4095 ; encoding: [0xff,0x0f,0x60,0xe0,0x00,0x01,0x83,0x7c]
+0xff,0x0f,0x60,0xe0,0x00,0x01,0x83,0x7c
+
+# GFX90A: buffer_store_byte a1, off, s[12:15], 0 offset:4095 ; encoding: [0xff,0x0f,0x60,0xe0,0x00,0x01,0x83,0x80]
+0xff,0x0f,0x60,0xe0,0x00,0x01,0x83,0x80
+
+# GFX90A: buffer_store_byte a1, off, s[12:15], -1 offset:4095 ; encoding: [0xff,0x0f,0x60,0xe0,0x00,0x01,0x83,0xc1]
+0xff,0x0f,0x60,0xe0,0x00,0x01,0x83,0xc1
+
+# GFX90A: buffer_store_byte a1, off, s[12:15], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x60,0xe0,0x00,0x01,0x83,0xf0]
+0xff,0x0f,0x60,0xe0,0x00,0x01,0x83,0xf0
+
+# GFX90A: buffer_store_byte a1, off, s[12:15], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x60,0xe0,0x00,0x01,0x83,0xf7]
+0xff,0x0f,0x60,0xe0,0x00,0x01,0x83,0xf7
+
+# GFX90A: buffer_store_byte a1, v0, s[12:15], s4 idxen offset:4095 ; encoding: [0xff,0x2f,0x60,0xe0,0x00,0x01,0x83,0x04]
+0xff,0x2f,0x60,0xe0,0x00,0x01,0x83,0x04
+
+# GFX90A: buffer_store_byte a1, v0, s[12:15], s4 offen offset:4095 ; encoding: [0xff,0x1f,0x60,0xe0,0x00,0x01,0x83,0x04]
+0xff,0x1f,0x60,0xe0,0x00,0x01,0x83,0x04
+
+# GFX90A: buffer_store_byte a1, off, s[12:15], s4 ; encoding: [0x00,0x00,0x60,0xe0,0x00,0x01,0x83,0x04]
+0x00,0x00,0x60,0xe0,0x00,0x01,0x83,0x04
+
+# GFX90A: buffer_store_byte a1, off, s[12:15], s4 ; encoding: [0x00,0x00,0x60,0xe0,0x00,0x01,0x83,0x04]
+0x00,0x00,0x60,0xe0,0x00,0x01,0x83,0x04
+
+# GFX90A: buffer_store_byte a1, off, s[12:15], s4 offset:7 ; encoding: [0x07,0x00,0x60,0xe0,0x00,0x01,0x83,0x04]
+0x07,0x00,0x60,0xe0,0x00,0x01,0x83,0x04
+
+# GFX90A: buffer_store_byte a1, off, s[12:15], s4 offset:4095 glc ; encoding: [0xff,0x4f,0x60,0xe0,0x00,0x01,0x83,0x04]
+0xff,0x4f,0x60,0xe0,0x00,0x01,0x83,0x04
+
+# GFX90A: buffer_store_byte a1, off, s[12:15], s4 offset:4095 slc ; encoding: [0xff,0x0f,0x62,0xe0,0x00,0x01,0x83,0x04]
+0xff,0x0f,0x62,0xe0,0x00,0x01,0x83,0x04
+
+# GFX90A: buffer_store_byte_d16_hi a1, off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x64,0xe0,0x00,0x01,0x83,0x04]
+0xff,0x0f,0x64,0xe0,0x00,0x01,0x83,0x04
+
+# GFX90A: buffer_store_byte_d16_hi a255, off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x64,0xe0,0x00,0xff,0x83,0x04]
+0xff,0x0f,0x64,0xe0,0x00,0xff,0x83,0x04
+
+# GFX90A: buffer_store_byte_d16_hi a1, off, s[16:19], s4 offset:4095 ; encoding: [0xff,0x0f,0x64,0xe0,0x00,0x01,0x84,0x04]
+0xff,0x0f,0x64,0xe0,0x00,0x01,0x84,0x04
+
+# GFX90A: buffer_store_byte_d16_hi a1, off, s[96:99], s4 offset:4095 ; encoding: [0xff,0x0f,0x64,0xe0,0x00,0x01,0x98,0x04]
+0xff,0x0f,0x64,0xe0,0x00,0x01,0x98,0x04
+
+# GFX90A: buffer_store_byte_d16_hi a1, off, s[12:15], s101 offset:4095 ; encoding: [0xff,0x0f,0x64,0xe0,0x00,0x01,0x83,0x65]
+0xff,0x0f,0x64,0xe0,0x00,0x01,0x83,0x65
+
+# GFX90A: buffer_store_byte_d16_hi a1, off, s[12:15], m0 offset:4095 ; encoding: [0xff,0x0f,0x64,0xe0,0x00,0x01,0x83,0x7c]
+0xff,0x0f,0x64,0xe0,0x00,0x01,0x83,0x7c
+
+# GFX90A: buffer_store_byte_d16_hi a1, off, s[12:15], 0 offset:4095 ; encoding: [0xff,0x0f,0x64,0xe0,0x00,0x01,0x83,0x80]
+0xff,0x0f,0x64,0xe0,0x00,0x01,0x83,0x80
+
+# GFX90A: buffer_store_byte_d16_hi a1, off, s[12:15], -1 offset:4095 ; encoding: [0xff,0x0f,0x64,0xe0,0x00,0x01,0x83,0xc1]
+0xff,0x0f,0x64,0xe0,0x00,0x01,0x83,0xc1
+
+# GFX90A: buffer_store_byte_d16_hi a1, off, s[12:15], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x64,0xe0,0x00,0x01,0x83,0xf0]
+0xff,0x0f,0x64,0xe0,0x00,0x01,0x83,0xf0
+
+# GFX90A: buffer_store_byte_d16_hi a1, off, s[12:15], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x64,0xe0,0x00,0x01,0x83,0xf7]
+0xff,0x0f,0x64,0xe0,0x00,0x01,0x83,0xf7
+
+# GFX90A: buffer_store_byte_d16_hi a1, v0, s[12:15], s4 idxen offset:4095 ; encoding: [0xff,0x2f,0x64,0xe0,0x00,0x01,0x83,0x04]
+0xff,0x2f,0x64,0xe0,0x00,0x01,0x83,0x04
+
+# GFX90A: buffer_store_byte_d16_hi a1, v0, s[12:15], s4 offen offset:4095 ; encoding: [0xff,0x1f,0x64,0xe0,0x00,0x01,0x83,0x04]
+0xff,0x1f,0x64,0xe0,0x00,0x01,0x83,0x04
+
+# GFX90A: buffer_store_byte_d16_hi a1, off, s[12:15], s4 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x01,0x83,0x04]
+0x00,0x00,0x64,0xe0,0x00,0x01,0x83,0x04
+
+# GFX90A: buffer_store_byte_d16_hi a1, off, s[12:15], s4 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x01,0x83,0x04]
+0x00,0x00,0x64,0xe0,0x00,0x01,0x83,0x04
+
+# GFX90A: buffer_store_byte_d16_hi a1, off, s[12:15], s4 offset:7 ; encoding: [0x07,0x00,0x64,0xe0,0x00,0x01,0x83,0x04]
+0x07,0x00,0x64,0xe0,0x00,0x01,0x83,0x04
+
+# GFX90A: buffer_store_byte_d16_hi a1, off, s[12:15], s4 offset:4095 glc ; encoding: [0xff,0x4f,0x64,0xe0,0x00,0x01,0x83,0x04]
+0xff,0x4f,0x64,0xe0,0x00,0x01,0x83,0x04
+
+# GFX90A: buffer_store_byte_d16_hi a1, off, s[12:15], s4 offset:4095 slc ; encoding: [0xff,0x0f,0x66,0xe0,0x00,0x01,0x83,0x04]
+0xff,0x0f,0x66,0xe0,0x00,0x01,0x83,0x04
+
+# GFX90A: buffer_store_short a1, off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x68,0xe0,0x00,0x01,0x83,0x04]
+0xff,0x0f,0x68,0xe0,0x00,0x01,0x83,0x04
+
+# GFX90A: buffer_store_short a255, off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x68,0xe0,0x00,0xff,0x83,0x04]
+0xff,0x0f,0x68,0xe0,0x00,0xff,0x83,0x04
+
+# GFX90A: buffer_store_short a1, off, s[16:19], s4 offset:4095 ; encoding: [0xff,0x0f,0x68,0xe0,0x00,0x01,0x84,0x04]
+0xff,0x0f,0x68,0xe0,0x00,0x01,0x84,0x04
+
+# GFX90A: buffer_store_short a1, off, s[96:99], s4 offset:4095 ; encoding: [0xff,0x0f,0x68,0xe0,0x00,0x01,0x98,0x04]
+0xff,0x0f,0x68,0xe0,0x00,0x01,0x98,0x04
+
+# GFX90A: buffer_store_short a1, off, s[12:15], s101 offset:4095 ; encoding: [0xff,0x0f,0x68,0xe0,0x00,0x01,0x83,0x65]
+0xff,0x0f,0x68,0xe0,0x00,0x01,0x83,0x65
+
+# GFX90A: buffer_store_short a1, off, s[12:15], m0 offset:4095 ; encoding: [0xff,0x0f,0x68,0xe0,0x00,0x01,0x83,0x7c]
+0xff,0x0f,0x68,0xe0,0x00,0x01,0x83,0x7c
+
+# GFX90A: buffer_store_short a1, off, s[12:15], 0 offset:4095 ; encoding: [0xff,0x0f,0x68,0xe0,0x00,0x01,0x83,0x80]
+0xff,0x0f,0x68,0xe0,0x00,0x01,0x83,0x80
+
+# GFX90A: buffer_store_short a1, off, s[12:15], -1 offset:4095 ; encoding: [0xff,0x0f,0x68,0xe0,0x00,0x01,0x83,0xc1]
+0xff,0x0f,0x68,0xe0,0x00,0x01,0x83,0xc1
+
+# GFX90A: buffer_store_short a1, off, s[12:15], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x68,0xe0,0x00,0x01,0x83,0xf0]
+0xff,0x0f,0x68,0xe0,0x00,0x01,0x83,0xf0
+
+# GFX90A: buffer_store_short a1, off, s[12:15], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x68,0xe0,0x00,0x01,0x83,0xf7]
+0xff,0x0f,0x68,0xe0,0x00,0x01,0x83,0xf7
+
+# GFX90A: buffer_store_short a1, v0, s[12:15], s4 idxen offset:4095 ; encoding: [0xff,0x2f,0x68,0xe0,0x00,0x01,0x83,0x04]
+0xff,0x2f,0x68,0xe0,0x00,0x01,0x83,0x04
+
+# GFX90A: buffer_store_short a1, v0, s[12:15], s4 offen offset:4095 ; encoding: [0xff,0x1f,0x68,0xe0,0x00,0x01,0x83,0x04]
+0xff,0x1f,0x68,0xe0,0x00,0x01,0x83,0x04
+
+# GFX90A: buffer_store_short a1, off, s[12:15], s4 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x01,0x83,0x04]
+0x00,0x00,0x68,0xe0,0x00,0x01,0x83,0x04
+
+# GFX90A: buffer_store_short a1, off, s[12:15], s4 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x01,0x83,0x04]
+0x00,0x00,0x68,0xe0,0x00,0x01,0x83,0x04
+
+# GFX90A: buffer_store_short a1, off, s[12:15], s4 offset:7 ; encoding: [0x07,0x00,0x68,0xe0,0x00,0x01,0x83,0x04]
+0x07,0x00,0x68,0xe0,0x00,0x01,0x83,0x04
+
+# GFX90A: buffer_store_short a1, off, s[12:15], s4 offset:4095 glc ; encoding: [0xff,0x4f,0x68,0xe0,0x00,0x01,0x83,0x04]
+0xff,0x4f,0x68,0xe0,0x00,0x01,0x83,0x04
+
+# GFX90A: buffer_store_short a1, off, s[12:15], s4 offset:4095 slc ; encoding: [0xff,0x0f,0x6a,0xe0,0x00,0x01,0x83,0x04]
+0xff,0x0f,0x6a,0xe0,0x00,0x01,0x83,0x04
+
+# GFX90A: buffer_store_short_d16_hi a1, off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x6c,0xe0,0x00,0x01,0x83,0x04]
+0xff,0x0f,0x6c,0xe0,0x00,0x01,0x83,0x04
+
+# GFX90A: buffer_store_short_d16_hi a255, off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x6c,0xe0,0x00,0xff,0x83,0x04]
+0xff,0x0f,0x6c,0xe0,0x00,0xff,0x83,0x04
+
+# GFX90A: buffer_store_short_d16_hi a1, off, s[16:19], s4 offset:4095 ; encoding: [0xff,0x0f,0x6c,0xe0,0x00,0x01,0x84,0x04]
+0xff,0x0f,0x6c,0xe0,0x00,0x01,0x84,0x04
+
+# GFX90A: buffer_store_short_d16_hi a1, off, s[96:99], s4 offset:4095 ; encoding: [0xff,0x0f,0x6c,0xe0,0x00,0x01,0x98,0x04]
+0xff,0x0f,0x6c,0xe0,0x00,0x01,0x98,0x04
+
+# GFX90A: buffer_store_short_d16_hi a1, off, s[12:15], s101 offset:4095 ; encoding: [0xff,0x0f,0x6c,0xe0,0x00,0x01,0x83,0x65]
+0xff,0x0f,0x6c,0xe0,0x00,0x01,0x83,0x65
+
+# GFX90A: buffer_store_short_d16_hi a1, off, s[12:15], m0 offset:4095 ; encoding: [0xff,0x0f,0x6c,0xe0,0x00,0x01,0x83,0x7c]
+0xff,0x0f,0x6c,0xe0,0x00,0x01,0x83,0x7c
+
+# GFX90A: buffer_store_short_d16_hi a1, off, s[12:15], 0 offset:4095 ; encoding: [0xff,0x0f,0x6c,0xe0,0x00,0x01,0x83,0x80]
+0xff,0x0f,0x6c,0xe0,0x00,0x01,0x83,0x80
+
+# GFX90A: buffer_store_short_d16_hi a1, off, s[12:15], -1 offset:4095 ; encoding: [0xff,0x0f,0x6c,0xe0,0x00,0x01,0x83,0xc1]
+0xff,0x0f,0x6c,0xe0,0x00,0x01,0x83,0xc1
+
+# GFX90A: buffer_store_short_d16_hi a1, off, s[12:15], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x6c,0xe0,0x00,0x01,0x83,0xf0]
+0xff,0x0f,0x6c,0xe0,0x00,0x01,0x83,0xf0
+
+# GFX90A: buffer_store_short_d16_hi a1, off, s[12:15], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x6c,0xe0,0x00,0x01,0x83,0xf7]
+0xff,0x0f,0x6c,0xe0,0x00,0x01,0x83,0xf7
+
+# GFX90A: buffer_store_short_d16_hi a1, v0, s[12:15], s4 idxen offset:4095 ; encoding: [0xff,0x2f,0x6c,0xe0,0x00,0x01,0x83,0x04]
+0xff,0x2f,0x6c,0xe0,0x00,0x01,0x83,0x04
+
+# GFX90A: buffer_store_short_d16_hi a1, v0, s[12:15], s4 offen offset:4095 ; encoding: [0xff,0x1f,0x6c,0xe0,0x00,0x01,0x83,0x04]
+0xff,0x1f,0x6c,0xe0,0x00,0x01,0x83,0x04
+
+# GFX90A: buffer_store_short_d16_hi a1, off, s[12:15], s4 ; encoding: [0x00,0x00,0x6c,0xe0,0x00,0x01,0x83,0x04]
+0x00,0x00,0x6c,0xe0,0x00,0x01,0x83,0x04
+
+# GFX90A: buffer_store_short_d16_hi a1, off, s[12:15], s4 ; encoding: [0x00,0x00,0x6c,0xe0,0x00,0x01,0x83,0x04]
+0x00,0x00,0x6c,0xe0,0x00,0x01,0x83,0x04
+
+# GFX90A: buffer_store_short_d16_hi a1, off, s[12:15], s4 offset:7 ; encoding: [0x07,0x00,0x6c,0xe0,0x00,0x01,0x83,0x04]
+0x07,0x00,0x6c,0xe0,0x00,0x01,0x83,0x04
+
+# GFX90A: buffer_store_short_d16_hi a1, off, s[12:15], s4 offset:4095 glc ; encoding: [0xff,0x4f,0x6c,0xe0,0x00,0x01,0x83,0x04]
+0xff,0x4f,0x6c,0xe0,0x00,0x01,0x83,0x04
+
+# GFX90A: buffer_store_short_d16_hi a1, off, s[12:15], s4 offset:4095 slc ; encoding: [0xff,0x0f,0x6e,0xe0,0x00,0x01,0x83,0x04]
+0xff,0x0f,0x6e,0xe0,0x00,0x01,0x83,0x04
+
+# GFX90A: buffer_store_dword a1, off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x70,0xe0,0x00,0x01,0x83,0x04]
+0xff,0x0f,0x70,0xe0,0x00,0x01,0x83,0x04
+
+# GFX90A: buffer_store_dword a255, off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x70,0xe0,0x00,0xff,0x83,0x04]
+0xff,0x0f,0x70,0xe0,0x00,0xff,0x83,0x04
+
+# GFX90A: buffer_store_dword a1, off, s[16:19], s4 offset:4095 ; encoding: [0xff,0x0f,0x70,0xe0,0x00,0x01,0x84,0x04]
+0xff,0x0f,0x70,0xe0,0x00,0x01,0x84,0x04
+
+# GFX90A: buffer_store_dword a1, off, s[96:99], s4 offset:4095 ; encoding: [0xff,0x0f,0x70,0xe0,0x00,0x01,0x98,0x04]
+0xff,0x0f,0x70,0xe0,0x00,0x01,0x98,0x04
+
+# GFX90A: buffer_store_dword a1, off, s[12:15], s101 offset:4095 ; encoding: [0xff,0x0f,0x70,0xe0,0x00,0x01,0x83,0x65]
+0xff,0x0f,0x70,0xe0,0x00,0x01,0x83,0x65
+
+# GFX90A: buffer_store_dword a1, off, s[12:15], m0 offset:4095 ; encoding: [0xff,0x0f,0x70,0xe0,0x00,0x01,0x83,0x7c]
+0xff,0x0f,0x70,0xe0,0x00,0x01,0x83,0x7c
+
+# GFX90A: buffer_store_dword a1, off, s[12:15], 0 offset:4095 ; encoding: [0xff,0x0f,0x70,0xe0,0x00,0x01,0x83,0x80]
+0xff,0x0f,0x70,0xe0,0x00,0x01,0x83,0x80
+
+# GFX90A: buffer_store_dword a1, off, s[12:15], -1 offset:4095 ; encoding: [0xff,0x0f,0x70,0xe0,0x00,0x01,0x83,0xc1]
+0xff,0x0f,0x70,0xe0,0x00,0x01,0x83,0xc1
+
+# GFX90A: buffer_store_dword a1, off, s[12:15], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x70,0xe0,0x00,0x01,0x83,0xf0]
+0xff,0x0f,0x70,0xe0,0x00,0x01,0x83,0xf0
+
+# GFX90A: buffer_store_dword a1, off, s[12:15], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x70,0xe0,0x00,0x01,0x83,0xf7]
+0xff,0x0f,0x70,0xe0,0x00,0x01,0x83,0xf7
+
+# GFX90A: buffer_store_dword a1, v0, s[12:15], s4 idxen offset:4095 ; encoding: [0xff,0x2f,0x70,0xe0,0x00,0x01,0x83,0x04]
+0xff,0x2f,0x70,0xe0,0x00,0x01,0x83,0x04
+
+# GFX90A: buffer_store_dword a1, v0, s[12:15], s4 offen offset:4095 ; encoding: [0xff,0x1f,0x70,0xe0,0x00,0x01,0x83,0x04]
+0xff,0x1f,0x70,0xe0,0x00,0x01,0x83,0x04
+
+# GFX90A: buffer_store_dword a1, off, s[12:15], s4 ; encoding: [0x00,0x00,0x70,0xe0,0x00,0x01,0x83,0x04]
+0x00,0x00,0x70,0xe0,0x00,0x01,0x83,0x04
+
+# GFX90A: buffer_store_dword a1, off, s[12:15], s4 ; encoding: [0x00,0x00,0x70,0xe0,0x00,0x01,0x83,0x04]
+0x00,0x00,0x70,0xe0,0x00,0x01,0x83,0x04
+
+# GFX90A: buffer_store_dword a1, off, s[12:15], s4 offset:7 ; encoding: [0x07,0x00,0x70,0xe0,0x00,0x01,0x83,0x04]
+0x07,0x00,0x70,0xe0,0x00,0x01,0x83,0x04
+
+# GFX90A: buffer_store_dword a1, off, s[12:15], s4 offset:4095 glc ; encoding: [0xff,0x4f,0x70,0xe0,0x00,0x01,0x83,0x04]
+0xff,0x4f,0x70,0xe0,0x00,0x01,0x83,0x04
+
+# GFX90A: buffer_store_dword a1, off, s[12:15], s4 offset:4095 slc ; encoding: [0xff,0x0f,0x72,0xe0,0x00,0x01,0x83,0x04]
+0xff,0x0f,0x72,0xe0,0x00,0x01,0x83,0x04
+
+# GFX90A: buffer_store_dwordx2 a[2:3], off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x74,0xe0,0x00,0x02,0x83,0x04]
+0xff,0x0f,0x74,0xe0,0x00,0x02,0x83,0x04
+
+# GFX90A: buffer_store_dwordx2 a[254:255], off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x74,0xe0,0x00,0xfe,0x83,0x04]
+0xff,0x0f,0x74,0xe0,0x00,0xfe,0x83,0x04
+
+# GFX90A: buffer_store_dwordx2 a[2:3], off, s[16:19], s4 offset:4095 ; encoding: [0xff,0x0f,0x74,0xe0,0x00,0x02,0x84,0x04]
+0xff,0x0f,0x74,0xe0,0x00,0x02,0x84,0x04
+
+# GFX90A: buffer_store_dwordx2 a[2:3], off, s[96:99], s4 offset:4095 ; encoding: [0xff,0x0f,0x74,0xe0,0x00,0x02,0x98,0x04]
+0xff,0x0f,0x74,0xe0,0x00,0x02,0x98,0x04
+
+# GFX90A: buffer_store_dwordx2 a[2:3], off, s[12:15], s101 offset:4095 ; encoding: [0xff,0x0f,0x74,0xe0,0x00,0x02,0x83,0x65]
+0xff,0x0f,0x74,0xe0,0x00,0x02,0x83,0x65
+
+# GFX90A: buffer_store_dwordx2 a[2:3], off, s[12:15], m0 offset:4095 ; encoding: [0xff,0x0f,0x74,0xe0,0x00,0x02,0x83,0x7c]
+0xff,0x0f,0x74,0xe0,0x00,0x02,0x83,0x7c
+
+# GFX90A: buffer_store_dwordx2 a[2:3], off, s[12:15], 0 offset:4095 ; encoding: [0xff,0x0f,0x74,0xe0,0x00,0x02,0x83,0x80]
+0xff,0x0f,0x74,0xe0,0x00,0x02,0x83,0x80
+
+# GFX90A: buffer_store_dwordx2 a[2:3], off, s[12:15], -1 offset:4095 ; encoding: [0xff,0x0f,0x74,0xe0,0x00,0x02,0x83,0xc1]
+0xff,0x0f,0x74,0xe0,0x00,0x02,0x83,0xc1
+
+# GFX90A: buffer_store_dwordx2 a[2:3], off, s[12:15], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x74,0xe0,0x00,0x02,0x83,0xf0]
+0xff,0x0f,0x74,0xe0,0x00,0x02,0x83,0xf0
+
+# GFX90A: buffer_store_dwordx2 a[2:3], off, s[12:15], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x74,0xe0,0x00,0x02,0x83,0xf7]
+0xff,0x0f,0x74,0xe0,0x00,0x02,0x83,0xf7
+
+# GFX90A: buffer_store_dwordx2 a[2:3], v0, s[12:15], s4 idxen offset:4095 ; encoding: [0xff,0x2f,0x74,0xe0,0x00,0x02,0x83,0x04]
+0xff,0x2f,0x74,0xe0,0x00,0x02,0x83,0x04
+
+# GFX90A: buffer_store_dwordx2 a[2:3], v0, s[12:15], s4 offen offset:4095 ; encoding: [0xff,0x1f,0x74,0xe0,0x00,0x02,0x83,0x04]
+0xff,0x1f,0x74,0xe0,0x00,0x02,0x83,0x04
+
+# GFX90A: buffer_store_dwordx2 a[2:3], off, s[12:15], s4 ; encoding: [0x00,0x00,0x74,0xe0,0x00,0x02,0x83,0x04]
+0x00,0x00,0x74,0xe0,0x00,0x02,0x83,0x04
+
+# GFX90A: buffer_store_dwordx2 a[2:3], off, s[12:15], s4 ; encoding: [0x00,0x00,0x74,0xe0,0x00,0x02,0x83,0x04]
+0x00,0x00,0x74,0xe0,0x00,0x02,0x83,0x04
+
+# GFX90A: buffer_store_dwordx2 a[2:3], off, s[12:15], s4 offset:7 ; encoding: [0x07,0x00,0x74,0xe0,0x00,0x02,0x83,0x04]
+0x07,0x00,0x74,0xe0,0x00,0x02,0x83,0x04
+
+# GFX90A: buffer_store_dwordx2 a[2:3], off, s[12:15], s4 offset:4095 glc ; encoding: [0xff,0x4f,0x74,0xe0,0x00,0x02,0x83,0x04]
+0xff,0x4f,0x74,0xe0,0x00,0x02,0x83,0x04
+
+# GFX90A: buffer_store_dwordx2 a[2:3], off, s[12:15], s4 offset:4095 slc ; encoding: [0xff,0x0f,0x76,0xe0,0x00,0x02,0x83,0x04]
+0xff,0x0f,0x76,0xe0,0x00,0x02,0x83,0x04
+
+# GFX90A: buffer_store_dwordx3 a[2:4], off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x78,0xe0,0x00,0x02,0x83,0x04]
+0xff,0x0f,0x78,0xe0,0x00,0x02,0x83,0x04
+
+# GFX90A: buffer_store_dwordx3 a[252:254], off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x78,0xe0,0x00,0xfc,0x83,0x04]
+0xff,0x0f,0x78,0xe0,0x00,0xfc,0x83,0x04
+
+# GFX90A: buffer_store_dwordx3 a[2:4], off, s[16:19], s4 offset:4095 ; encoding: [0xff,0x0f,0x78,0xe0,0x00,0x02,0x84,0x04]
+0xff,0x0f,0x78,0xe0,0x00,0x02,0x84,0x04
+
+# GFX90A: buffer_store_dwordx3 a[2:4], off, s[96:99], s4 offset:4095 ; encoding: [0xff,0x0f,0x78,0xe0,0x00,0x02,0x98,0x04]
+0xff,0x0f,0x78,0xe0,0x00,0x02,0x98,0x04
+
+# GFX90A: buffer_store_dwordx3 a[2:4], off, s[12:15], s101 offset:4095 ; encoding: [0xff,0x0f,0x78,0xe0,0x00,0x02,0x83,0x65]
+0xff,0x0f,0x78,0xe0,0x00,0x02,0x83,0x65
+
+# GFX90A: buffer_store_dwordx3 a[2:4], off, s[12:15], m0 offset:4095 ; encoding: [0xff,0x0f,0x78,0xe0,0x00,0x02,0x83,0x7c]
+0xff,0x0f,0x78,0xe0,0x00,0x02,0x83,0x7c
+
+# GFX90A: buffer_store_dwordx3 a[2:4], off, s[12:15], 0 offset:4095 ; encoding: [0xff,0x0f,0x78,0xe0,0x00,0x02,0x83,0x80]
+0xff,0x0f,0x78,0xe0,0x00,0x02,0x83,0x80
+
+# GFX90A: buffer_store_dwordx3 a[2:4], off, s[12:15], -1 offset:4095 ; encoding: [0xff,0x0f,0x78,0xe0,0x00,0x02,0x83,0xc1]
+0xff,0x0f,0x78,0xe0,0x00,0x02,0x83,0xc1
+
+# GFX90A: buffer_store_dwordx3 a[2:4], off, s[12:15], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x78,0xe0,0x00,0x02,0x83,0xf0]
+0xff,0x0f,0x78,0xe0,0x00,0x02,0x83,0xf0
+
+# GFX90A: buffer_store_dwordx3 a[2:4], off, s[12:15], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x78,0xe0,0x00,0x02,0x83,0xf7]
+0xff,0x0f,0x78,0xe0,0x00,0x02,0x83,0xf7
+
+# GFX90A: buffer_store_dwordx3 a[2:4], v0, s[12:15], s4 idxen offset:4095 ; encoding: [0xff,0x2f,0x78,0xe0,0x00,0x02,0x83,0x04]
+0xff,0x2f,0x78,0xe0,0x00,0x02,0x83,0x04
+
+# GFX90A: buffer_store_dwordx3 a[2:4], v0, s[12:15], s4 offen offset:4095 ; encoding: [0xff,0x1f,0x78,0xe0,0x00,0x02,0x83,0x04]
+0xff,0x1f,0x78,0xe0,0x00,0x02,0x83,0x04
+
+# GFX90A: buffer_store_dwordx3 a[2:4], off, s[12:15], s4 ; encoding: [0x00,0x00,0x78,0xe0,0x00,0x02,0x83,0x04]
+0x00,0x00,0x78,0xe0,0x00,0x02,0x83,0x04
+
+# GFX90A: buffer_store_dwordx3 a[2:4], off, s[12:15], s4 ; encoding: [0x00,0x00,0x78,0xe0,0x00,0x02,0x83,0x04]
+0x00,0x00,0x78,0xe0,0x00,0x02,0x83,0x04
+
+# GFX90A: buffer_store_dwordx3 a[2:4], off, s[12:15], s4 offset:7 ; encoding: [0x07,0x00,0x78,0xe0,0x00,0x02,0x83,0x04]
+0x07,0x00,0x78,0xe0,0x00,0x02,0x83,0x04
+
+# GFX90A: buffer_store_dwordx3 a[2:4], off, s[12:15], s4 offset:4095 glc ; encoding: [0xff,0x4f,0x78,0xe0,0x00,0x02,0x83,0x04]
+0xff,0x4f,0x78,0xe0,0x00,0x02,0x83,0x04
+
+# GFX90A: buffer_store_dwordx3 a[2:4], off, s[12:15], s4 offset:4095 slc ; encoding: [0xff,0x0f,0x7a,0xe0,0x00,0x02,0x83,0x04]
+0xff,0x0f,0x7a,0xe0,0x00,0x02,0x83,0x04
+
+# GFX90A: buffer_store_dwordx4 a[2:5], off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x7c,0xe0,0x00,0x02,0x83,0x04]
+0xff,0x0f,0x7c,0xe0,0x00,0x02,0x83,0x04
+
+# GFX90A: buffer_store_dwordx4 a[252:255], off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x7c,0xe0,0x00,0xfc,0x83,0x04]
+0xff,0x0f,0x7c,0xe0,0x00,0xfc,0x83,0x04
+
+# GFX90A: buffer_store_dwordx4 a[2:5], off, s[16:19], s4 offset:4095 ; encoding: [0xff,0x0f,0x7c,0xe0,0x00,0x02,0x84,0x04]
+0xff,0x0f,0x7c,0xe0,0x00,0x02,0x84,0x04
+
+# GFX90A: buffer_store_dwordx4 a[2:5], off, s[96:99], s4 offset:4095 ; encoding: [0xff,0x0f,0x7c,0xe0,0x00,0x02,0x98,0x04]
+0xff,0x0f,0x7c,0xe0,0x00,0x02,0x98,0x04
+
+# GFX90A: buffer_store_dwordx4 a[2:5], off, s[12:15], s101 offset:4095 ; encoding: [0xff,0x0f,0x7c,0xe0,0x00,0x02,0x83,0x65]
+0xff,0x0f,0x7c,0xe0,0x00,0x02,0x83,0x65
+
+# GFX90A: buffer_store_dwordx4 a[2:5], off, s[12:15], m0 offset:4095 ; encoding: [0xff,0x0f,0x7c,0xe0,0x00,0x02,0x83,0x7c]
+0xff,0x0f,0x7c,0xe0,0x00,0x02,0x83,0x7c
+
+# GFX90A: buffer_store_dwordx4 a[2:5], off, s[12:15], 0 offset:4095 ; encoding: [0xff,0x0f,0x7c,0xe0,0x00,0x02,0x83,0x80]
+0xff,0x0f,0x7c,0xe0,0x00,0x02,0x83,0x80
+
+# GFX90A: buffer_store_dwordx4 a[2:5], off, s[12:15], -1 offset:4095 ; encoding: [0xff,0x0f,0x7c,0xe0,0x00,0x02,0x83,0xc1]
+0xff,0x0f,0x7c,0xe0,0x00,0x02,0x83,0xc1
+
+# GFX90A: buffer_store_dwordx4 a[2:5], off, s[12:15], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x7c,0xe0,0x00,0x02,0x83,0xf0]
+0xff,0x0f,0x7c,0xe0,0x00,0x02,0x83,0xf0
+
+# GFX90A: buffer_store_dwordx4 a[2:5], off, s[12:15], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x7c,0xe0,0x00,0x02,0x83,0xf7]
+0xff,0x0f,0x7c,0xe0,0x00,0x02,0x83,0xf7
+
+# GFX90A: buffer_store_dwordx4 a[2:5], v0, s[12:15], s4 idxen offset:4095 ; encoding: [0xff,0x2f,0x7c,0xe0,0x00,0x02,0x83,0x04]
+0xff,0x2f,0x7c,0xe0,0x00,0x02,0x83,0x04
+
+# GFX90A: buffer_store_dwordx4 a[2:5], v0, s[12:15], s4 offen offset:4095 ; encoding: [0xff,0x1f,0x7c,0xe0,0x00,0x02,0x83,0x04]
+0xff,0x1f,0x7c,0xe0,0x00,0x02,0x83,0x04
+
+# GFX90A: buffer_store_dwordx4 a[2:5], off, s[12:15], s4 ; encoding: [0x00,0x00,0x7c,0xe0,0x00,0x02,0x83,0x04]
+0x00,0x00,0x7c,0xe0,0x00,0x02,0x83,0x04
+
+# GFX90A: buffer_store_dwordx4 a[2:5], off, s[12:15], s4 ; encoding: [0x00,0x00,0x7c,0xe0,0x00,0x02,0x83,0x04]
+0x00,0x00,0x7c,0xe0,0x00,0x02,0x83,0x04
+
+# GFX90A: buffer_store_dwordx4 a[2:5], off, s[12:15], s4 offset:7 ; encoding: [0x07,0x00,0x7c,0xe0,0x00,0x02,0x83,0x04]
+0x07,0x00,0x7c,0xe0,0x00,0x02,0x83,0x04
+
+# GFX90A: buffer_store_dwordx4 a[2:5], off, s[12:15], s4 offset:4095 glc ; encoding: [0xff,0x4f,0x7c,0xe0,0x00,0x02,0x83,0x04]
+0xff,0x4f,0x7c,0xe0,0x00,0x02,0x83,0x04
+
+# GFX90A: buffer_store_dwordx4 a[2:5], off, s[12:15], s4 offset:4095 slc ; encoding: [0xff,0x0f,0x7e,0xe0,0x00,0x02,0x83,0x04]
+0xff,0x0f,0x7e,0xe0,0x00,0x02,0x83,0x04
+
+# GFX90A: buffer_load_ubyte_d16 a5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x80,0xe0,0x00,0x05,0x82,0x03]
+0xff,0x0f,0x80,0xe0,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_load_ubyte_d16 a255, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x80,0xe0,0x00,0xff,0x82,0x03]
+0xff,0x0f,0x80,0xe0,0x00,0xff,0x82,0x03
+
+# GFX90A: buffer_load_ubyte_d16 a5, off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x80,0xe0,0x00,0x05,0x83,0x03]
+0xff,0x0f,0x80,0xe0,0x00,0x05,0x83,0x03
+
+# GFX90A: buffer_load_ubyte_d16 a5, off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x80,0xe0,0x00,0x05,0x98,0x03]
+0xff,0x0f,0x80,0xe0,0x00,0x05,0x98,0x03
+
+# GFX90A: buffer_load_ubyte_d16 a5, off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x80,0xe0,0x00,0x05,0x82,0x65]
+0xff,0x0f,0x80,0xe0,0x00,0x05,0x82,0x65
+
+# GFX90A: buffer_load_ubyte_d16 a5, off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x80,0xe0,0x00,0x05,0x82,0x7c]
+0xff,0x0f,0x80,0xe0,0x00,0x05,0x82,0x7c
+
+# GFX90A: buffer_load_ubyte_d16 a5, off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x80,0xe0,0x00,0x05,0x82,0x80]
+0xff,0x0f,0x80,0xe0,0x00,0x05,0x82,0x80
+
+# GFX90A: buffer_load_ubyte_d16 a5, off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x80,0xe0,0x00,0x05,0x82,0xc1]
+0xff,0x0f,0x80,0xe0,0x00,0x05,0x82,0xc1
+
+# GFX90A: buffer_load_ubyte_d16 a5, off, s[8:11], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x80,0xe0,0x00,0x05,0x82,0xf0]
+0xff,0x0f,0x80,0xe0,0x00,0x05,0x82,0xf0
+
+# GFX90A: buffer_load_ubyte_d16 a5, off, s[8:11], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x80,0xe0,0x00,0x05,0x82,0xf7]
+0xff,0x0f,0x80,0xe0,0x00,0x05,0x82,0xf7
+
+# GFX90A: buffer_load_ubyte_d16 a5, v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x80,0xe0,0x00,0x05,0x82,0x03]
+0xff,0x2f,0x80,0xe0,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_load_ubyte_d16 a5, v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x80,0xe0,0x00,0x05,0x82,0x03]
+0xff,0x1f,0x80,0xe0,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_load_ubyte_d16 a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x80,0xe0,0x00,0x05,0x82,0x03]
+0x00,0x00,0x80,0xe0,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_load_ubyte_d16 a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x80,0xe0,0x00,0x05,0x82,0x03]
+0x00,0x00,0x80,0xe0,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_load_ubyte_d16 a5, off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x80,0xe0,0x00,0x05,0x82,0x03]
+0x07,0x00,0x80,0xe0,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_load_ubyte_d16 a5, off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x80,0xe0,0x00,0x05,0x82,0x03]
+0xff,0x4f,0x80,0xe0,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_load_ubyte_d16 a5, off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x82,0xe0,0x00,0x05,0x82,0x03]
+0xff,0x0f,0x82,0xe0,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_load_ubyte_d16_hi a5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x84,0xe0,0x00,0x05,0x82,0x03]
+0xff,0x0f,0x84,0xe0,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_load_ubyte_d16_hi a255, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x84,0xe0,0x00,0xff,0x82,0x03]
+0xff,0x0f,0x84,0xe0,0x00,0xff,0x82,0x03
+
+# GFX90A: buffer_load_ubyte_d16_hi a5, off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x84,0xe0,0x00,0x05,0x83,0x03]
+0xff,0x0f,0x84,0xe0,0x00,0x05,0x83,0x03
+
+# GFX90A: buffer_load_ubyte_d16_hi a5, off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x84,0xe0,0x00,0x05,0x98,0x03]
+0xff,0x0f,0x84,0xe0,0x00,0x05,0x98,0x03
+
+# GFX90A: buffer_load_ubyte_d16_hi a5, off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x84,0xe0,0x00,0x05,0x82,0x65]
+0xff,0x0f,0x84,0xe0,0x00,0x05,0x82,0x65
+
+# GFX90A: buffer_load_ubyte_d16_hi a5, off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x84,0xe0,0x00,0x05,0x82,0x7c]
+0xff,0x0f,0x84,0xe0,0x00,0x05,0x82,0x7c
+
+# GFX90A: buffer_load_ubyte_d16_hi a5, off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x84,0xe0,0x00,0x05,0x82,0x80]
+0xff,0x0f,0x84,0xe0,0x00,0x05,0x82,0x80
+
+# GFX90A: buffer_load_ubyte_d16_hi a5, off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x84,0xe0,0x00,0x05,0x82,0xc1]
+0xff,0x0f,0x84,0xe0,0x00,0x05,0x82,0xc1
+
+# GFX90A: buffer_load_ubyte_d16_hi a5, off, s[8:11], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x84,0xe0,0x00,0x05,0x82,0xf0]
+0xff,0x0f,0x84,0xe0,0x00,0x05,0x82,0xf0
+
+# GFX90A: buffer_load_ubyte_d16_hi a5, off, s[8:11], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x84,0xe0,0x00,0x05,0x82,0xf7]
+0xff,0x0f,0x84,0xe0,0x00,0x05,0x82,0xf7
+
+# GFX90A: buffer_load_ubyte_d16_hi a5, v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x84,0xe0,0x00,0x05,0x82,0x03]
+0xff,0x2f,0x84,0xe0,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_load_ubyte_d16_hi a5, v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x84,0xe0,0x00,0x05,0x82,0x03]
+0xff,0x1f,0x84,0xe0,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_load_ubyte_d16_hi a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x84,0xe0,0x00,0x05,0x82,0x03]
+0x00,0x00,0x84,0xe0,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_load_ubyte_d16_hi a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x84,0xe0,0x00,0x05,0x82,0x03]
+0x00,0x00,0x84,0xe0,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_load_ubyte_d16_hi a5, off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x84,0xe0,0x00,0x05,0x82,0x03]
+0x07,0x00,0x84,0xe0,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_load_ubyte_d16_hi a5, off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x84,0xe0,0x00,0x05,0x82,0x03]
+0xff,0x4f,0x84,0xe0,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_load_ubyte_d16_hi a5, off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x86,0xe0,0x00,0x05,0x82,0x03]
+0xff,0x0f,0x86,0xe0,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_load_sbyte_d16 a5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x88,0xe0,0x00,0x05,0x82,0x03]
+0xff,0x0f,0x88,0xe0,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_load_sbyte_d16 a255, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x88,0xe0,0x00,0xff,0x82,0x03]
+0xff,0x0f,0x88,0xe0,0x00,0xff,0x82,0x03
+
+# GFX90A: buffer_load_sbyte_d16 a5, off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x88,0xe0,0x00,0x05,0x83,0x03]
+0xff,0x0f,0x88,0xe0,0x00,0x05,0x83,0x03
+
+# GFX90A: buffer_load_sbyte_d16 a5, off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x88,0xe0,0x00,0x05,0x98,0x03]
+0xff,0x0f,0x88,0xe0,0x00,0x05,0x98,0x03
+
+# GFX90A: buffer_load_sbyte_d16 a5, off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x88,0xe0,0x00,0x05,0x82,0x65]
+0xff,0x0f,0x88,0xe0,0x00,0x05,0x82,0x65
+
+# GFX90A: buffer_load_sbyte_d16 a5, off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x88,0xe0,0x00,0x05,0x82,0x7c]
+0xff,0x0f,0x88,0xe0,0x00,0x05,0x82,0x7c
+
+# GFX90A: buffer_load_sbyte_d16 a5, off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x88,0xe0,0x00,0x05,0x82,0x80]
+0xff,0x0f,0x88,0xe0,0x00,0x05,0x82,0x80
+
+# GFX90A: buffer_load_sbyte_d16 a5, off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x88,0xe0,0x00,0x05,0x82,0xc1]
+0xff,0x0f,0x88,0xe0,0x00,0x05,0x82,0xc1
+
+# GFX90A: buffer_load_sbyte_d16 a5, off, s[8:11], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x88,0xe0,0x00,0x05,0x82,0xf0]
+0xff,0x0f,0x88,0xe0,0x00,0x05,0x82,0xf0
+
+# GFX90A: buffer_load_sbyte_d16 a5, off, s[8:11], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x88,0xe0,0x00,0x05,0x82,0xf7]
+0xff,0x0f,0x88,0xe0,0x00,0x05,0x82,0xf7
+
+# GFX90A: buffer_load_sbyte_d16 a5, v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x88,0xe0,0x00,0x05,0x82,0x03]
+0xff,0x2f,0x88,0xe0,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_load_sbyte_d16 a5, v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x88,0xe0,0x00,0x05,0x82,0x03]
+0xff,0x1f,0x88,0xe0,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_load_sbyte_d16 a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x88,0xe0,0x00,0x05,0x82,0x03]
+0x00,0x00,0x88,0xe0,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_load_sbyte_d16 a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x88,0xe0,0x00,0x05,0x82,0x03]
+0x00,0x00,0x88,0xe0,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_load_sbyte_d16 a5, off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x88,0xe0,0x00,0x05,0x82,0x03]
+0x07,0x00,0x88,0xe0,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_load_sbyte_d16 a5, off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x88,0xe0,0x00,0x05,0x82,0x03]
+0xff,0x4f,0x88,0xe0,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_load_sbyte_d16 a5, off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x8a,0xe0,0x00,0x05,0x82,0x03]
+0xff,0x0f,0x8a,0xe0,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_load_sbyte_d16_hi a5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x8c,0xe0,0x00,0x05,0x82,0x03]
+0xff,0x0f,0x8c,0xe0,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_load_sbyte_d16_hi a255, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x8c,0xe0,0x00,0xff,0x82,0x03]
+0xff,0x0f,0x8c,0xe0,0x00,0xff,0x82,0x03
+
+# GFX90A: buffer_load_sbyte_d16_hi a5, off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x8c,0xe0,0x00,0x05,0x83,0x03]
+0xff,0x0f,0x8c,0xe0,0x00,0x05,0x83,0x03
+
+# GFX90A: buffer_load_sbyte_d16_hi a5, off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x8c,0xe0,0x00,0x05,0x98,0x03]
+0xff,0x0f,0x8c,0xe0,0x00,0x05,0x98,0x03
+
+# GFX90A: buffer_load_sbyte_d16_hi a5, off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x8c,0xe0,0x00,0x05,0x82,0x65]
+0xff,0x0f,0x8c,0xe0,0x00,0x05,0x82,0x65
+
+# GFX90A: buffer_load_sbyte_d16_hi a5, off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x8c,0xe0,0x00,0x05,0x82,0x7c]
+0xff,0x0f,0x8c,0xe0,0x00,0x05,0x82,0x7c
+
+# GFX90A: buffer_load_sbyte_d16_hi a5, off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x8c,0xe0,0x00,0x05,0x82,0x80]
+0xff,0x0f,0x8c,0xe0,0x00,0x05,0x82,0x80
+
+# GFX90A: buffer_load_sbyte_d16_hi a5, off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x8c,0xe0,0x00,0x05,0x82,0xc1]
+0xff,0x0f,0x8c,0xe0,0x00,0x05,0x82,0xc1
+
+# GFX90A: buffer_load_sbyte_d16_hi a5, off, s[8:11], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x8c,0xe0,0x00,0x05,0x82,0xf0]
+0xff,0x0f,0x8c,0xe0,0x00,0x05,0x82,0xf0
+
+# GFX90A: buffer_load_sbyte_d16_hi a5, off, s[8:11], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x8c,0xe0,0x00,0x05,0x82,0xf7]
+0xff,0x0f,0x8c,0xe0,0x00,0x05,0x82,0xf7
+
+# GFX90A: buffer_load_sbyte_d16_hi a5, v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x8c,0xe0,0x00,0x05,0x82,0x03]
+0xff,0x2f,0x8c,0xe0,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_load_sbyte_d16_hi a5, v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x8c,0xe0,0x00,0x05,0x82,0x03]
+0xff,0x1f,0x8c,0xe0,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_load_sbyte_d16_hi a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x8c,0xe0,0x00,0x05,0x82,0x03]
+0x00,0x00,0x8c,0xe0,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_load_sbyte_d16_hi a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x8c,0xe0,0x00,0x05,0x82,0x03]
+0x00,0x00,0x8c,0xe0,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_load_sbyte_d16_hi a5, off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x8c,0xe0,0x00,0x05,0x82,0x03]
+0x07,0x00,0x8c,0xe0,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_load_sbyte_d16_hi a5, off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x8c,0xe0,0x00,0x05,0x82,0x03]
+0xff,0x4f,0x8c,0xe0,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_load_sbyte_d16_hi a5, off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x8e,0xe0,0x00,0x05,0x82,0x03]
+0xff,0x0f,0x8e,0xe0,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_load_short_d16 a5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x90,0xe0,0x00,0x05,0x82,0x03]
+0xff,0x0f,0x90,0xe0,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_load_short_d16 a255, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x90,0xe0,0x00,0xff,0x82,0x03]
+0xff,0x0f,0x90,0xe0,0x00,0xff,0x82,0x03
+
+# GFX90A: buffer_load_short_d16 a5, off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x90,0xe0,0x00,0x05,0x83,0x03]
+0xff,0x0f,0x90,0xe0,0x00,0x05,0x83,0x03
+
+# GFX90A: buffer_load_short_d16 a5, off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x90,0xe0,0x00,0x05,0x98,0x03]
+0xff,0x0f,0x90,0xe0,0x00,0x05,0x98,0x03
+
+# GFX90A: buffer_load_short_d16 a5, off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x90,0xe0,0x00,0x05,0x82,0x65]
+0xff,0x0f,0x90,0xe0,0x00,0x05,0x82,0x65
+
+# GFX90A: buffer_load_short_d16 a5, off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x90,0xe0,0x00,0x05,0x82,0x7c]
+0xff,0x0f,0x90,0xe0,0x00,0x05,0x82,0x7c
+
+# GFX90A: buffer_load_short_d16 a5, off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x90,0xe0,0x00,0x05,0x82,0x80]
+0xff,0x0f,0x90,0xe0,0x00,0x05,0x82,0x80
+
+# GFX90A: buffer_load_short_d16 a5, off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x90,0xe0,0x00,0x05,0x82,0xc1]
+0xff,0x0f,0x90,0xe0,0x00,0x05,0x82,0xc1
+
+# GFX90A: buffer_load_short_d16 a5, off, s[8:11], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x90,0xe0,0x00,0x05,0x82,0xf0]
+0xff,0x0f,0x90,0xe0,0x00,0x05,0x82,0xf0
+
+# GFX90A: buffer_load_short_d16 a5, off, s[8:11], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x90,0xe0,0x00,0x05,0x82,0xf7]
+0xff,0x0f,0x90,0xe0,0x00,0x05,0x82,0xf7
+
+# GFX90A: buffer_load_short_d16 a5, v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x90,0xe0,0x00,0x05,0x82,0x03]
+0xff,0x2f,0x90,0xe0,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_load_short_d16 a5, v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x90,0xe0,0x00,0x05,0x82,0x03]
+0xff,0x1f,0x90,0xe0,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_load_short_d16 a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x90,0xe0,0x00,0x05,0x82,0x03]
+0x00,0x00,0x90,0xe0,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_load_short_d16 a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x90,0xe0,0x00,0x05,0x82,0x03]
+0x00,0x00,0x90,0xe0,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_load_short_d16 a5, off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x90,0xe0,0x00,0x05,0x82,0x03]
+0x07,0x00,0x90,0xe0,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_load_short_d16 a5, off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x90,0xe0,0x00,0x05,0x82,0x03]
+0xff,0x4f,0x90,0xe0,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_load_short_d16 a5, off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x92,0xe0,0x00,0x05,0x82,0x03]
+0xff,0x0f,0x92,0xe0,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_load_short_d16_hi a5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x94,0xe0,0x00,0x05,0x82,0x03]
+0xff,0x0f,0x94,0xe0,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_load_short_d16_hi a255, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x94,0xe0,0x00,0xff,0x82,0x03]
+0xff,0x0f,0x94,0xe0,0x00,0xff,0x82,0x03
+
+# GFX90A: buffer_load_short_d16_hi a5, off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x94,0xe0,0x00,0x05,0x83,0x03]
+0xff,0x0f,0x94,0xe0,0x00,0x05,0x83,0x03
+
+# GFX90A: buffer_load_short_d16_hi a5, off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x94,0xe0,0x00,0x05,0x98,0x03]
+0xff,0x0f,0x94,0xe0,0x00,0x05,0x98,0x03
+
+# GFX90A: buffer_load_short_d16_hi a5, off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x94,0xe0,0x00,0x05,0x82,0x65]
+0xff,0x0f,0x94,0xe0,0x00,0x05,0x82,0x65
+
+# GFX90A: buffer_load_short_d16_hi a5, off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x94,0xe0,0x00,0x05,0x82,0x7c]
+0xff,0x0f,0x94,0xe0,0x00,0x05,0x82,0x7c
+
+# GFX90A: buffer_load_short_d16_hi a5, off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x94,0xe0,0x00,0x05,0x82,0x80]
+0xff,0x0f,0x94,0xe0,0x00,0x05,0x82,0x80
+
+# GFX90A: buffer_load_short_d16_hi a5, off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x94,0xe0,0x00,0x05,0x82,0xc1]
+0xff,0x0f,0x94,0xe0,0x00,0x05,0x82,0xc1
+
+# GFX90A: buffer_load_short_d16_hi a5, off, s[8:11], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x94,0xe0,0x00,0x05,0x82,0xf0]
+0xff,0x0f,0x94,0xe0,0x00,0x05,0x82,0xf0
+
+# GFX90A: buffer_load_short_d16_hi a5, off, s[8:11], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x94,0xe0,0x00,0x05,0x82,0xf7]
+0xff,0x0f,0x94,0xe0,0x00,0x05,0x82,0xf7
+
+# GFX90A: buffer_load_short_d16_hi a5, v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x94,0xe0,0x00,0x05,0x82,0x03]
+0xff,0x2f,0x94,0xe0,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_load_short_d16_hi a5, v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x94,0xe0,0x00,0x05,0x82,0x03]
+0xff,0x1f,0x94,0xe0,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_load_short_d16_hi a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x94,0xe0,0x00,0x05,0x82,0x03]
+0x00,0x00,0x94,0xe0,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_load_short_d16_hi a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x94,0xe0,0x00,0x05,0x82,0x03]
+0x00,0x00,0x94,0xe0,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_load_short_d16_hi a5, off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x94,0xe0,0x00,0x05,0x82,0x03]
+0x07,0x00,0x94,0xe0,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_load_short_d16_hi a5, off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x94,0xe0,0x00,0x05,0x82,0x03]
+0xff,0x4f,0x94,0xe0,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_load_short_d16_hi a5, off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x96,0xe0,0x00,0x05,0x82,0x03]
+0xff,0x0f,0x96,0xe0,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_atomic_swap a5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x00,0xe1,0x00,0x05,0x82,0x03]
+0xff,0x0f,0x00,0xe1,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_atomic_swap a255, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x00,0xe1,0x00,0xff,0x82,0x03]
+0xff,0x0f,0x00,0xe1,0x00,0xff,0x82,0x03
+
+# GFX90A: buffer_atomic_swap a5, off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x00,0xe1,0x00,0x05,0x83,0x03]
+0xff,0x0f,0x00,0xe1,0x00,0x05,0x83,0x03
+
+# GFX90A: buffer_atomic_swap a5, off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x00,0xe1,0x00,0x05,0x98,0x03]
+0xff,0x0f,0x00,0xe1,0x00,0x05,0x98,0x03
+
+# GFX90A: buffer_atomic_swap a5, off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x00,0xe1,0x00,0x05,0x82,0x65]
+0xff,0x0f,0x00,0xe1,0x00,0x05,0x82,0x65
+
+# GFX90A: buffer_atomic_swap a5, off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x00,0xe1,0x00,0x05,0x82,0x7c]
+0xff,0x0f,0x00,0xe1,0x00,0x05,0x82,0x7c
+
+# GFX90A: buffer_atomic_swap a5, off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x00,0xe1,0x00,0x05,0x82,0x80]
+0xff,0x0f,0x00,0xe1,0x00,0x05,0x82,0x80
+
+# GFX90A: buffer_atomic_swap a5, off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x00,0xe1,0x00,0x05,0x82,0xc1]
+0xff,0x0f,0x00,0xe1,0x00,0x05,0x82,0xc1
+
+# GFX90A: buffer_atomic_swap a5, v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x00,0xe1,0x00,0x05,0x82,0x03]
+0xff,0x2f,0x00,0xe1,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_atomic_swap a5, v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x00,0xe1,0x00,0x05,0x82,0x03]
+0xff,0x1f,0x00,0xe1,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_atomic_swap a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x00,0xe1,0x00,0x05,0x82,0x03]
+0x00,0x00,0x00,0xe1,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_atomic_swap a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x00,0xe1,0x00,0x05,0x82,0x03]
+0x00,0x00,0x00,0xe1,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_atomic_swap a5, off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x00,0xe1,0x00,0x05,0x82,0x03]
+0x07,0x00,0x00,0xe1,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_atomic_swap a5, off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x00,0xe1,0x00,0x05,0x82,0x03]
+0xff,0x4f,0x00,0xe1,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_atomic_swap a5, off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x02,0xe1,0x00,0x05,0x82,0x03]
+0xff,0x0f,0x02,0xe1,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_atomic_cmpswap a[6:7], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x04,0xe1,0x00,0x06,0x82,0x03]
+0xff,0x0f,0x04,0xe1,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_atomic_cmpswap a[254:255], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x04,0xe1,0x00,0xfe,0x82,0x03]
+0xff,0x0f,0x04,0xe1,0x00,0xfe,0x82,0x03
+
+# GFX90A: buffer_atomic_cmpswap a[6:7], off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x04,0xe1,0x00,0x06,0x83,0x03]
+0xff,0x0f,0x04,0xe1,0x00,0x06,0x83,0x03
+
+# GFX90A: buffer_atomic_cmpswap a[6:7], off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x04,0xe1,0x00,0x06,0x98,0x03]
+0xff,0x0f,0x04,0xe1,0x00,0x06,0x98,0x03
+
+# GFX90A: buffer_atomic_cmpswap a[6:7], off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x04,0xe1,0x00,0x06,0x82,0x65]
+0xff,0x0f,0x04,0xe1,0x00,0x06,0x82,0x65
+
+# GFX90A: buffer_atomic_cmpswap a[6:7], off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x04,0xe1,0x00,0x06,0x82,0x7c]
+0xff,0x0f,0x04,0xe1,0x00,0x06,0x82,0x7c
+
+# GFX90A: buffer_atomic_cmpswap a[6:7], off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x04,0xe1,0x00,0x06,0x82,0x80]
+0xff,0x0f,0x04,0xe1,0x00,0x06,0x82,0x80
+
+# GFX90A: buffer_atomic_cmpswap a[6:7], off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x04,0xe1,0x00,0x06,0x82,0xc1]
+0xff,0x0f,0x04,0xe1,0x00,0x06,0x82,0xc1
+
+# GFX90A: buffer_atomic_cmpswap a[6:7], v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x04,0xe1,0x00,0x06,0x82,0x03]
+0xff,0x2f,0x04,0xe1,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_atomic_cmpswap a[6:7], v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x04,0xe1,0x00,0x06,0x82,0x03]
+0xff,0x1f,0x04,0xe1,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_atomic_cmpswap a[6:7], off, s[8:11], s3 ; encoding: [0x00,0x00,0x04,0xe1,0x00,0x06,0x82,0x03]
+0x00,0x00,0x04,0xe1,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_atomic_cmpswap a[6:7], off, s[8:11], s3 ; encoding: [0x00,0x00,0x04,0xe1,0x00,0x06,0x82,0x03]
+0x00,0x00,0x04,0xe1,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_atomic_cmpswap a[6:7], off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x04,0xe1,0x00,0x06,0x82,0x03]
+0x07,0x00,0x04,0xe1,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_atomic_cmpswap a[6:7], off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x04,0xe1,0x00,0x06,0x82,0x03]
+0xff,0x4f,0x04,0xe1,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_atomic_cmpswap a[6:7], off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x06,0xe1,0x00,0x06,0x82,0x03]
+0xff,0x0f,0x06,0xe1,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_atomic_add a5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x08,0xe1,0x00,0x05,0x82,0x03]
+0xff,0x0f,0x08,0xe1,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_atomic_add a255, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x08,0xe1,0x00,0xff,0x82,0x03]
+0xff,0x0f,0x08,0xe1,0x00,0xff,0x82,0x03
+
+# GFX90A: buffer_atomic_add a5, off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x08,0xe1,0x00,0x05,0x83,0x03]
+0xff,0x0f,0x08,0xe1,0x00,0x05,0x83,0x03
+
+# GFX90A: buffer_atomic_add a5, off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x08,0xe1,0x00,0x05,0x98,0x03]
+0xff,0x0f,0x08,0xe1,0x00,0x05,0x98,0x03
+
+# GFX90A: buffer_atomic_add a5, off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x08,0xe1,0x00,0x05,0x82,0x65]
+0xff,0x0f,0x08,0xe1,0x00,0x05,0x82,0x65
+
+# GFX90A: buffer_atomic_add a5, off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x08,0xe1,0x00,0x05,0x82,0x7c]
+0xff,0x0f,0x08,0xe1,0x00,0x05,0x82,0x7c
+
+# GFX90A: buffer_atomic_add a5, off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x08,0xe1,0x00,0x05,0x82,0x80]
+0xff,0x0f,0x08,0xe1,0x00,0x05,0x82,0x80
+
+# GFX90A: buffer_atomic_add a5, off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x08,0xe1,0x00,0x05,0x82,0xc1]
+0xff,0x0f,0x08,0xe1,0x00,0x05,0x82,0xc1
+
+# GFX90A: buffer_atomic_add a5, v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x08,0xe1,0x00,0x05,0x82,0x03]
+0xff,0x2f,0x08,0xe1,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_atomic_add a5, v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x08,0xe1,0x00,0x05,0x82,0x03]
+0xff,0x1f,0x08,0xe1,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_atomic_add a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x08,0xe1,0x00,0x05,0x82,0x03]
+0x00,0x00,0x08,0xe1,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_atomic_add a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x08,0xe1,0x00,0x05,0x82,0x03]
+0x00,0x00,0x08,0xe1,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_atomic_add a5, off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x08,0xe1,0x00,0x05,0x82,0x03]
+0x07,0x00,0x08,0xe1,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_atomic_add a5, off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x08,0xe1,0x00,0x05,0x82,0x03]
+0xff,0x4f,0x08,0xe1,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_atomic_add a5, off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x0a,0xe1,0x00,0x05,0x82,0x03]
+0xff,0x0f,0x0a,0xe1,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_atomic_sub a5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x0c,0xe1,0x00,0x05,0x82,0x03]
+0xff,0x0f,0x0c,0xe1,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_atomic_sub a255, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x0c,0xe1,0x00,0xff,0x82,0x03]
+0xff,0x0f,0x0c,0xe1,0x00,0xff,0x82,0x03
+
+# GFX90A: buffer_atomic_sub a5, off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x0c,0xe1,0x00,0x05,0x83,0x03]
+0xff,0x0f,0x0c,0xe1,0x00,0x05,0x83,0x03
+
+# GFX90A: buffer_atomic_sub a5, off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x0c,0xe1,0x00,0x05,0x98,0x03]
+0xff,0x0f,0x0c,0xe1,0x00,0x05,0x98,0x03
+
+# GFX90A: buffer_atomic_sub a5, off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x0c,0xe1,0x00,0x05,0x82,0x65]
+0xff,0x0f,0x0c,0xe1,0x00,0x05,0x82,0x65
+
+# GFX90A: buffer_atomic_sub a5, off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x0c,0xe1,0x00,0x05,0x82,0x7c]
+0xff,0x0f,0x0c,0xe1,0x00,0x05,0x82,0x7c
+
+# GFX90A: buffer_atomic_sub a5, off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x0c,0xe1,0x00,0x05,0x82,0x80]
+0xff,0x0f,0x0c,0xe1,0x00,0x05,0x82,0x80
+
+# GFX90A: buffer_atomic_sub a5, off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x0c,0xe1,0x00,0x05,0x82,0xc1]
+0xff,0x0f,0x0c,0xe1,0x00,0x05,0x82,0xc1
+
+# GFX90A: buffer_atomic_sub a5, v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x0c,0xe1,0x00,0x05,0x82,0x03]
+0xff,0x2f,0x0c,0xe1,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_atomic_sub a5, v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x0c,0xe1,0x00,0x05,0x82,0x03]
+0xff,0x1f,0x0c,0xe1,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_atomic_sub a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x0c,0xe1,0x00,0x05,0x82,0x03]
+0x00,0x00,0x0c,0xe1,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_atomic_sub a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x0c,0xe1,0x00,0x05,0x82,0x03]
+0x00,0x00,0x0c,0xe1,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_atomic_sub a5, off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x0c,0xe1,0x00,0x05,0x82,0x03]
+0x07,0x00,0x0c,0xe1,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_atomic_sub a5, off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x0c,0xe1,0x00,0x05,0x82,0x03]
+0xff,0x4f,0x0c,0xe1,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_atomic_sub a5, off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x0e,0xe1,0x00,0x05,0x82,0x03]
+0xff,0x0f,0x0e,0xe1,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_atomic_smin a5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x10,0xe1,0x00,0x05,0x82,0x03]
+0xff,0x0f,0x10,0xe1,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_atomic_smin a255, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x10,0xe1,0x00,0xff,0x82,0x03]
+0xff,0x0f,0x10,0xe1,0x00,0xff,0x82,0x03
+
+# GFX90A: buffer_atomic_smin a5, off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x10,0xe1,0x00,0x05,0x83,0x03]
+0xff,0x0f,0x10,0xe1,0x00,0x05,0x83,0x03
+
+# GFX90A: buffer_atomic_smin a5, off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x10,0xe1,0x00,0x05,0x98,0x03]
+0xff,0x0f,0x10,0xe1,0x00,0x05,0x98,0x03
+
+# GFX90A: buffer_atomic_smin a5, off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x10,0xe1,0x00,0x05,0x82,0x65]
+0xff,0x0f,0x10,0xe1,0x00,0x05,0x82,0x65
+
+# GFX90A: buffer_atomic_smin a5, off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x10,0xe1,0x00,0x05,0x82,0x7c]
+0xff,0x0f,0x10,0xe1,0x00,0x05,0x82,0x7c
+
+# GFX90A: buffer_atomic_smin a5, off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x10,0xe1,0x00,0x05,0x82,0x80]
+0xff,0x0f,0x10,0xe1,0x00,0x05,0x82,0x80
+
+# GFX90A: buffer_atomic_smin a5, off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x10,0xe1,0x00,0x05,0x82,0xc1]
+0xff,0x0f,0x10,0xe1,0x00,0x05,0x82,0xc1
+
+# GFX90A: buffer_atomic_smin a5, v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x10,0xe1,0x00,0x05,0x82,0x03]
+0xff,0x2f,0x10,0xe1,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_atomic_smin a5, v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x10,0xe1,0x00,0x05,0x82,0x03]
+0xff,0x1f,0x10,0xe1,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_atomic_smin a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x10,0xe1,0x00,0x05,0x82,0x03]
+0x00,0x00,0x10,0xe1,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_atomic_smin a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x10,0xe1,0x00,0x05,0x82,0x03]
+0x00,0x00,0x10,0xe1,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_atomic_smin a5, off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x10,0xe1,0x00,0x05,0x82,0x03]
+0x07,0x00,0x10,0xe1,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_atomic_smin a5, off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x10,0xe1,0x00,0x05,0x82,0x03]
+0xff,0x4f,0x10,0xe1,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_atomic_smin a5, off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x12,0xe1,0x00,0x05,0x82,0x03]
+0xff,0x0f,0x12,0xe1,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_atomic_umin a5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x14,0xe1,0x00,0x05,0x82,0x03]
+0xff,0x0f,0x14,0xe1,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_atomic_umin a255, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x14,0xe1,0x00,0xff,0x82,0x03]
+0xff,0x0f,0x14,0xe1,0x00,0xff,0x82,0x03
+
+# GFX90A: buffer_atomic_umin a5, off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x14,0xe1,0x00,0x05,0x83,0x03]
+0xff,0x0f,0x14,0xe1,0x00,0x05,0x83,0x03
+
+# GFX90A: buffer_atomic_umin a5, off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x14,0xe1,0x00,0x05,0x98,0x03]
+0xff,0x0f,0x14,0xe1,0x00,0x05,0x98,0x03
+
+# GFX90A: buffer_atomic_umin a5, off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x14,0xe1,0x00,0x05,0x82,0x65]
+0xff,0x0f,0x14,0xe1,0x00,0x05,0x82,0x65
+
+# GFX90A: buffer_atomic_umin a5, off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x14,0xe1,0x00,0x05,0x82,0x7c]
+0xff,0x0f,0x14,0xe1,0x00,0x05,0x82,0x7c
+
+# GFX90A: buffer_atomic_umin a5, off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x14,0xe1,0x00,0x05,0x82,0x80]
+0xff,0x0f,0x14,0xe1,0x00,0x05,0x82,0x80
+
+# GFX90A: buffer_atomic_umin a5, off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x14,0xe1,0x00,0x05,0x82,0xc1]
+0xff,0x0f,0x14,0xe1,0x00,0x05,0x82,0xc1
+
+# GFX90A: buffer_atomic_umin a5, v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x14,0xe1,0x00,0x05,0x82,0x03]
+0xff,0x2f,0x14,0xe1,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_atomic_umin a5, v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x14,0xe1,0x00,0x05,0x82,0x03]
+0xff,0x1f,0x14,0xe1,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_atomic_umin a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x14,0xe1,0x00,0x05,0x82,0x03]
+0x00,0x00,0x14,0xe1,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_atomic_umin a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x14,0xe1,0x00,0x05,0x82,0x03]
+0x00,0x00,0x14,0xe1,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_atomic_umin a5, off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x14,0xe1,0x00,0x05,0x82,0x03]
+0x07,0x00,0x14,0xe1,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_atomic_umin a5, off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x14,0xe1,0x00,0x05,0x82,0x03]
+0xff,0x4f,0x14,0xe1,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_atomic_umin a5, off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x16,0xe1,0x00,0x05,0x82,0x03]
+0xff,0x0f,0x16,0xe1,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_atomic_smax a5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x18,0xe1,0x00,0x05,0x82,0x03]
+0xff,0x0f,0x18,0xe1,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_atomic_smax a255, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x18,0xe1,0x00,0xff,0x82,0x03]
+0xff,0x0f,0x18,0xe1,0x00,0xff,0x82,0x03
+
+# GFX90A: buffer_atomic_smax a5, off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x18,0xe1,0x00,0x05,0x83,0x03]
+0xff,0x0f,0x18,0xe1,0x00,0x05,0x83,0x03
+
+# GFX90A: buffer_atomic_smax a5, off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x18,0xe1,0x00,0x05,0x98,0x03]
+0xff,0x0f,0x18,0xe1,0x00,0x05,0x98,0x03
+
+# GFX90A: buffer_atomic_smax a5, off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x18,0xe1,0x00,0x05,0x82,0x65]
+0xff,0x0f,0x18,0xe1,0x00,0x05,0x82,0x65
+
+# GFX90A: buffer_atomic_smax a5, off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x18,0xe1,0x00,0x05,0x82,0x7c]
+0xff,0x0f,0x18,0xe1,0x00,0x05,0x82,0x7c
+
+# GFX90A: buffer_atomic_smax a5, off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x18,0xe1,0x00,0x05,0x82,0x80]
+0xff,0x0f,0x18,0xe1,0x00,0x05,0x82,0x80
+
+# GFX90A: buffer_atomic_smax a5, off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x18,0xe1,0x00,0x05,0x82,0xc1]
+0xff,0x0f,0x18,0xe1,0x00,0x05,0x82,0xc1
+
+# GFX90A: buffer_atomic_smax a5, v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x18,0xe1,0x00,0x05,0x82,0x03]
+0xff,0x2f,0x18,0xe1,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_atomic_smax a5, v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x18,0xe1,0x00,0x05,0x82,0x03]
+0xff,0x1f,0x18,0xe1,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_atomic_smax a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x18,0xe1,0x00,0x05,0x82,0x03]
+0x00,0x00,0x18,0xe1,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_atomic_smax a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x18,0xe1,0x00,0x05,0x82,0x03]
+0x00,0x00,0x18,0xe1,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_atomic_smax a5, off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x18,0xe1,0x00,0x05,0x82,0x03]
+0x07,0x00,0x18,0xe1,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_atomic_smax a5, off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x18,0xe1,0x00,0x05,0x82,0x03]
+0xff,0x4f,0x18,0xe1,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_atomic_smax a5, off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x1a,0xe1,0x00,0x05,0x82,0x03]
+0xff,0x0f,0x1a,0xe1,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_atomic_umax a5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x1c,0xe1,0x00,0x05,0x82,0x03]
+0xff,0x0f,0x1c,0xe1,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_atomic_umax a255, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x1c,0xe1,0x00,0xff,0x82,0x03]
+0xff,0x0f,0x1c,0xe1,0x00,0xff,0x82,0x03
+
+# GFX90A: buffer_atomic_umax a5, off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x1c,0xe1,0x00,0x05,0x83,0x03]
+0xff,0x0f,0x1c,0xe1,0x00,0x05,0x83,0x03
+
+# GFX90A: buffer_atomic_umax a5, off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x1c,0xe1,0x00,0x05,0x98,0x03]
+0xff,0x0f,0x1c,0xe1,0x00,0x05,0x98,0x03
+
+# GFX90A: buffer_atomic_umax a5, off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x1c,0xe1,0x00,0x05,0x82,0x65]
+0xff,0x0f,0x1c,0xe1,0x00,0x05,0x82,0x65
+
+# GFX90A: buffer_atomic_umax a5, off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x1c,0xe1,0x00,0x05,0x82,0x7c]
+0xff,0x0f,0x1c,0xe1,0x00,0x05,0x82,0x7c
+
+# GFX90A: buffer_atomic_umax a5, off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x1c,0xe1,0x00,0x05,0x82,0x80]
+0xff,0x0f,0x1c,0xe1,0x00,0x05,0x82,0x80
+
+# GFX90A: buffer_atomic_umax a5, off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x1c,0xe1,0x00,0x05,0x82,0xc1]
+0xff,0x0f,0x1c,0xe1,0x00,0x05,0x82,0xc1
+
+# GFX90A: buffer_atomic_umax a5, v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x1c,0xe1,0x00,0x05,0x82,0x03]
+0xff,0x2f,0x1c,0xe1,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_atomic_umax a5, v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x1c,0xe1,0x00,0x05,0x82,0x03]
+0xff,0x1f,0x1c,0xe1,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_atomic_umax a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x1c,0xe1,0x00,0x05,0x82,0x03]
+0x00,0x00,0x1c,0xe1,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_atomic_umax a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x1c,0xe1,0x00,0x05,0x82,0x03]
+0x00,0x00,0x1c,0xe1,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_atomic_umax a5, off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x1c,0xe1,0x00,0x05,0x82,0x03]
+0x07,0x00,0x1c,0xe1,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_atomic_umax a5, off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x1c,0xe1,0x00,0x05,0x82,0x03]
+0xff,0x4f,0x1c,0xe1,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_atomic_umax a5, off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x1e,0xe1,0x00,0x05,0x82,0x03]
+0xff,0x0f,0x1e,0xe1,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_atomic_and a5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x20,0xe1,0x00,0x05,0x82,0x03]
+0xff,0x0f,0x20,0xe1,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_atomic_and a255, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x20,0xe1,0x00,0xff,0x82,0x03]
+0xff,0x0f,0x20,0xe1,0x00,0xff,0x82,0x03
+
+# GFX90A: buffer_atomic_and a5, off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x20,0xe1,0x00,0x05,0x83,0x03]
+0xff,0x0f,0x20,0xe1,0x00,0x05,0x83,0x03
+
+# GFX90A: buffer_atomic_and a5, off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x20,0xe1,0x00,0x05,0x98,0x03]
+0xff,0x0f,0x20,0xe1,0x00,0x05,0x98,0x03
+
+# GFX90A: buffer_atomic_and a5, off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x20,0xe1,0x00,0x05,0x82,0x65]
+0xff,0x0f,0x20,0xe1,0x00,0x05,0x82,0x65
+
+# GFX90A: buffer_atomic_and a5, off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x20,0xe1,0x00,0x05,0x82,0x7c]
+0xff,0x0f,0x20,0xe1,0x00,0x05,0x82,0x7c
+
+# GFX90A: buffer_atomic_and a5, off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x20,0xe1,0x00,0x05,0x82,0x80]
+0xff,0x0f,0x20,0xe1,0x00,0x05,0x82,0x80
+
+# GFX90A: buffer_atomic_and a5, off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x20,0xe1,0x00,0x05,0x82,0xc1]
+0xff,0x0f,0x20,0xe1,0x00,0x05,0x82,0xc1
+
+# GFX90A: buffer_atomic_and a5, v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x20,0xe1,0x00,0x05,0x82,0x03]
+0xff,0x2f,0x20,0xe1,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_atomic_and a5, v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x20,0xe1,0x00,0x05,0x82,0x03]
+0xff,0x1f,0x20,0xe1,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_atomic_and a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x20,0xe1,0x00,0x05,0x82,0x03]
+0x00,0x00,0x20,0xe1,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_atomic_and a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x20,0xe1,0x00,0x05,0x82,0x03]
+0x00,0x00,0x20,0xe1,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_atomic_and a5, off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x20,0xe1,0x00,0x05,0x82,0x03]
+0x07,0x00,0x20,0xe1,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_atomic_and a5, off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x20,0xe1,0x00,0x05,0x82,0x03]
+0xff,0x4f,0x20,0xe1,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_atomic_and a5, off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x22,0xe1,0x00,0x05,0x82,0x03]
+0xff,0x0f,0x22,0xe1,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_atomic_or a5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x24,0xe1,0x00,0x05,0x82,0x03]
+0xff,0x0f,0x24,0xe1,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_atomic_or a255, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x24,0xe1,0x00,0xff,0x82,0x03]
+0xff,0x0f,0x24,0xe1,0x00,0xff,0x82,0x03
+
+# GFX90A: buffer_atomic_or a5, off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x24,0xe1,0x00,0x05,0x83,0x03]
+0xff,0x0f,0x24,0xe1,0x00,0x05,0x83,0x03
+
+# GFX90A: buffer_atomic_or a5, off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x24,0xe1,0x00,0x05,0x98,0x03]
+0xff,0x0f,0x24,0xe1,0x00,0x05,0x98,0x03
+
+# GFX90A: buffer_atomic_or a5, off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x24,0xe1,0x00,0x05,0x82,0x65]
+0xff,0x0f,0x24,0xe1,0x00,0x05,0x82,0x65
+
+# GFX90A: buffer_atomic_or a5, off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x24,0xe1,0x00,0x05,0x82,0x7c]
+0xff,0x0f,0x24,0xe1,0x00,0x05,0x82,0x7c
+
+# GFX90A: buffer_atomic_or a5, off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x24,0xe1,0x00,0x05,0x82,0x80]
+0xff,0x0f,0x24,0xe1,0x00,0x05,0x82,0x80
+
+# GFX90A: buffer_atomic_or a5, off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x24,0xe1,0x00,0x05,0x82,0xc1]
+0xff,0x0f,0x24,0xe1,0x00,0x05,0x82,0xc1
+
+# GFX90A: buffer_atomic_or a5, v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x24,0xe1,0x00,0x05,0x82,0x03]
+0xff,0x2f,0x24,0xe1,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_atomic_or a5, v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x24,0xe1,0x00,0x05,0x82,0x03]
+0xff,0x1f,0x24,0xe1,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_atomic_or a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x24,0xe1,0x00,0x05,0x82,0x03]
+0x00,0x00,0x24,0xe1,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_atomic_or a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x24,0xe1,0x00,0x05,0x82,0x03]
+0x00,0x00,0x24,0xe1,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_atomic_or a5, off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x24,0xe1,0x00,0x05,0x82,0x03]
+0x07,0x00,0x24,0xe1,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_atomic_or a5, off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x24,0xe1,0x00,0x05,0x82,0x03]
+0xff,0x4f,0x24,0xe1,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_atomic_or a5, off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x26,0xe1,0x00,0x05,0x82,0x03]
+0xff,0x0f,0x26,0xe1,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_atomic_xor a5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x28,0xe1,0x00,0x05,0x82,0x03]
+0xff,0x0f,0x28,0xe1,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_atomic_xor a255, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x28,0xe1,0x00,0xff,0x82,0x03]
+0xff,0x0f,0x28,0xe1,0x00,0xff,0x82,0x03
+
+# GFX90A: buffer_atomic_xor a5, off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x28,0xe1,0x00,0x05,0x83,0x03]
+0xff,0x0f,0x28,0xe1,0x00,0x05,0x83,0x03
+
+# GFX90A: buffer_atomic_xor a5, off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x28,0xe1,0x00,0x05,0x98,0x03]
+0xff,0x0f,0x28,0xe1,0x00,0x05,0x98,0x03
+
+# GFX90A: buffer_atomic_xor a5, off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x28,0xe1,0x00,0x05,0x82,0x65]
+0xff,0x0f,0x28,0xe1,0x00,0x05,0x82,0x65
+
+# GFX90A: buffer_atomic_xor a5, off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x28,0xe1,0x00,0x05,0x82,0x7c]
+0xff,0x0f,0x28,0xe1,0x00,0x05,0x82,0x7c
+
+# GFX90A: buffer_atomic_xor a5, off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x28,0xe1,0x00,0x05,0x82,0x80]
+0xff,0x0f,0x28,0xe1,0x00,0x05,0x82,0x80
+
+# GFX90A: buffer_atomic_xor a5, off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x28,0xe1,0x00,0x05,0x82,0xc1]
+0xff,0x0f,0x28,0xe1,0x00,0x05,0x82,0xc1
+
+# GFX90A: buffer_atomic_xor a5, v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x28,0xe1,0x00,0x05,0x82,0x03]
+0xff,0x2f,0x28,0xe1,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_atomic_xor a5, v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x28,0xe1,0x00,0x05,0x82,0x03]
+0xff,0x1f,0x28,0xe1,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_atomic_xor a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x28,0xe1,0x00,0x05,0x82,0x03]
+0x00,0x00,0x28,0xe1,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_atomic_xor a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x28,0xe1,0x00,0x05,0x82,0x03]
+0x00,0x00,0x28,0xe1,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_atomic_xor a5, off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x28,0xe1,0x00,0x05,0x82,0x03]
+0x07,0x00,0x28,0xe1,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_atomic_xor a5, off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x28,0xe1,0x00,0x05,0x82,0x03]
+0xff,0x4f,0x28,0xe1,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_atomic_xor a5, off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x2a,0xe1,0x00,0x05,0x82,0x03]
+0xff,0x0f,0x2a,0xe1,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_atomic_inc a5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x2c,0xe1,0x00,0x05,0x82,0x03]
+0xff,0x0f,0x2c,0xe1,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_atomic_inc a255, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x2c,0xe1,0x00,0xff,0x82,0x03]
+0xff,0x0f,0x2c,0xe1,0x00,0xff,0x82,0x03
+
+# GFX90A: buffer_atomic_inc a5, off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x2c,0xe1,0x00,0x05,0x83,0x03]
+0xff,0x0f,0x2c,0xe1,0x00,0x05,0x83,0x03
+
+# GFX90A: buffer_atomic_inc a5, off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x2c,0xe1,0x00,0x05,0x98,0x03]
+0xff,0x0f,0x2c,0xe1,0x00,0x05,0x98,0x03
+
+# GFX90A: buffer_atomic_inc a5, off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x2c,0xe1,0x00,0x05,0x82,0x65]
+0xff,0x0f,0x2c,0xe1,0x00,0x05,0x82,0x65
+
+# GFX90A: buffer_atomic_inc a5, off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x2c,0xe1,0x00,0x05,0x82,0x7c]
+0xff,0x0f,0x2c,0xe1,0x00,0x05,0x82,0x7c
+
+# GFX90A: buffer_atomic_inc a5, off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x2c,0xe1,0x00,0x05,0x82,0x80]
+0xff,0x0f,0x2c,0xe1,0x00,0x05,0x82,0x80
+
+# GFX90A: buffer_atomic_inc a5, off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x2c,0xe1,0x00,0x05,0x82,0xc1]
+0xff,0x0f,0x2c,0xe1,0x00,0x05,0x82,0xc1
+
+# GFX90A: buffer_atomic_inc a5, v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x2c,0xe1,0x00,0x05,0x82,0x03]
+0xff,0x2f,0x2c,0xe1,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_atomic_inc a5, v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x2c,0xe1,0x00,0x05,0x82,0x03]
+0xff,0x1f,0x2c,0xe1,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_atomic_inc a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x2c,0xe1,0x00,0x05,0x82,0x03]
+0x00,0x00,0x2c,0xe1,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_atomic_inc a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x2c,0xe1,0x00,0x05,0x82,0x03]
+0x00,0x00,0x2c,0xe1,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_atomic_inc a5, off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x2c,0xe1,0x00,0x05,0x82,0x03]
+0x07,0x00,0x2c,0xe1,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_atomic_inc a5, off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x2c,0xe1,0x00,0x05,0x82,0x03]
+0xff,0x4f,0x2c,0xe1,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_atomic_inc a5, off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x2e,0xe1,0x00,0x05,0x82,0x03]
+0xff,0x0f,0x2e,0xe1,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_atomic_dec a5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x30,0xe1,0x00,0x05,0x82,0x03]
+0xff,0x0f,0x30,0xe1,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_atomic_dec a255, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x30,0xe1,0x00,0xff,0x82,0x03]
+0xff,0x0f,0x30,0xe1,0x00,0xff,0x82,0x03
+
+# GFX90A: buffer_atomic_dec a5, off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x30,0xe1,0x00,0x05,0x83,0x03]
+0xff,0x0f,0x30,0xe1,0x00,0x05,0x83,0x03
+
+# GFX90A: buffer_atomic_dec a5, off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x30,0xe1,0x00,0x05,0x98,0x03]
+0xff,0x0f,0x30,0xe1,0x00,0x05,0x98,0x03
+
+# GFX90A: buffer_atomic_dec a5, off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x30,0xe1,0x00,0x05,0x82,0x65]
+0xff,0x0f,0x30,0xe1,0x00,0x05,0x82,0x65
+
+# GFX90A: buffer_atomic_dec a5, off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x30,0xe1,0x00,0x05,0x82,0x7c]
+0xff,0x0f,0x30,0xe1,0x00,0x05,0x82,0x7c
+
+# GFX90A: buffer_atomic_dec a5, off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x30,0xe1,0x00,0x05,0x82,0x80]
+0xff,0x0f,0x30,0xe1,0x00,0x05,0x82,0x80
+
+# GFX90A: buffer_atomic_dec a5, off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x30,0xe1,0x00,0x05,0x82,0xc1]
+0xff,0x0f,0x30,0xe1,0x00,0x05,0x82,0xc1
+
+# GFX90A: buffer_atomic_dec a5, v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x30,0xe1,0x00,0x05,0x82,0x03]
+0xff,0x2f,0x30,0xe1,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_atomic_dec a5, v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x30,0xe1,0x00,0x05,0x82,0x03]
+0xff,0x1f,0x30,0xe1,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_atomic_dec a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x30,0xe1,0x00,0x05,0x82,0x03]
+0x00,0x00,0x30,0xe1,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_atomic_dec a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x30,0xe1,0x00,0x05,0x82,0x03]
+0x00,0x00,0x30,0xe1,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_atomic_dec a5, off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x30,0xe1,0x00,0x05,0x82,0x03]
+0x07,0x00,0x30,0xe1,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_atomic_dec a5, off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x30,0xe1,0x00,0x05,0x82,0x03]
+0xff,0x4f,0x30,0xe1,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_atomic_dec a5, off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x32,0xe1,0x00,0x05,0x82,0x03]
+0xff,0x0f,0x32,0xe1,0x00,0x05,0x82,0x03
+
+# GFX90A: buffer_atomic_swap_x2 a[6:7], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x80,0xe1,0x00,0x06,0x82,0x03]
+0xff,0x0f,0x80,0xe1,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_atomic_swap_x2 a[254:255], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x80,0xe1,0x00,0xfe,0x82,0x03]
+0xff,0x0f,0x80,0xe1,0x00,0xfe,0x82,0x03
+
+# GFX90A: buffer_atomic_swap_x2 a[6:7], off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x80,0xe1,0x00,0x06,0x83,0x03]
+0xff,0x0f,0x80,0xe1,0x00,0x06,0x83,0x03
+
+# GFX90A: buffer_atomic_swap_x2 a[6:7], off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x80,0xe1,0x00,0x06,0x98,0x03]
+0xff,0x0f,0x80,0xe1,0x00,0x06,0x98,0x03
+
+# GFX90A: buffer_atomic_swap_x2 a[6:7], off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x80,0xe1,0x00,0x06,0x82,0x65]
+0xff,0x0f,0x80,0xe1,0x00,0x06,0x82,0x65
+
+# GFX90A: buffer_atomic_swap_x2 a[6:7], off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x80,0xe1,0x00,0x06,0x82,0x7c]
+0xff,0x0f,0x80,0xe1,0x00,0x06,0x82,0x7c
+
+# GFX90A: buffer_atomic_swap_x2 a[6:7], off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x80,0xe1,0x00,0x06,0x82,0x80]
+0xff,0x0f,0x80,0xe1,0x00,0x06,0x82,0x80
+
+# GFX90A: buffer_atomic_swap_x2 a[6:7], off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x80,0xe1,0x00,0x06,0x82,0xc1]
+0xff,0x0f,0x80,0xe1,0x00,0x06,0x82,0xc1
+
+# GFX90A: buffer_atomic_swap_x2 a[6:7], v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x80,0xe1,0x00,0x06,0x82,0x03]
+0xff,0x2f,0x80,0xe1,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_atomic_swap_x2 a[6:7], v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x80,0xe1,0x00,0x06,0x82,0x03]
+0xff,0x1f,0x80,0xe1,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_atomic_swap_x2 a[6:7], off, s[8:11], s3 ; encoding: [0x00,0x00,0x80,0xe1,0x00,0x06,0x82,0x03]
+0x00,0x00,0x80,0xe1,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_atomic_swap_x2 a[6:7], off, s[8:11], s3 ; encoding: [0x00,0x00,0x80,0xe1,0x00,0x06,0x82,0x03]
+0x00,0x00,0x80,0xe1,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_atomic_swap_x2 a[6:7], off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x80,0xe1,0x00,0x06,0x82,0x03]
+0x07,0x00,0x80,0xe1,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_atomic_swap_x2 a[6:7], off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x80,0xe1,0x00,0x06,0x82,0x03]
+0xff,0x4f,0x80,0xe1,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_atomic_swap_x2 a[6:7], off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x82,0xe1,0x00,0x06,0x82,0x03]
+0xff,0x0f,0x82,0xe1,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_atomic_cmpswap_x2 a[6:9], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x84,0xe1,0x00,0x06,0x82,0x03]
+0xff,0x0f,0x84,0xe1,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_atomic_cmpswap_x2 a[252:255], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x84,0xe1,0x00,0xfc,0x82,0x03]
+0xff,0x0f,0x84,0xe1,0x00,0xfc,0x82,0x03
+
+# GFX90A: buffer_atomic_cmpswap_x2 a[6:9], off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x84,0xe1,0x00,0x06,0x83,0x03]
+0xff,0x0f,0x84,0xe1,0x00,0x06,0x83,0x03
+
+# GFX90A: buffer_atomic_cmpswap_x2 a[6:9], off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x84,0xe1,0x00,0x06,0x98,0x03]
+0xff,0x0f,0x84,0xe1,0x00,0x06,0x98,0x03
+
+# GFX90A: buffer_atomic_cmpswap_x2 a[6:9], off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x84,0xe1,0x00,0x06,0x82,0x65]
+0xff,0x0f,0x84,0xe1,0x00,0x06,0x82,0x65
+
+# GFX90A: buffer_atomic_cmpswap_x2 a[6:9], off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x84,0xe1,0x00,0x06,0x82,0x7c]
+0xff,0x0f,0x84,0xe1,0x00,0x06,0x82,0x7c
+
+# GFX90A: buffer_atomic_cmpswap_x2 a[6:9], off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x84,0xe1,0x00,0x06,0x82,0x80]
+0xff,0x0f,0x84,0xe1,0x00,0x06,0x82,0x80
+
+# GFX90A: buffer_atomic_cmpswap_x2 a[6:9], off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x84,0xe1,0x00,0x06,0x82,0xc1]
+0xff,0x0f,0x84,0xe1,0x00,0x06,0x82,0xc1
+
+# GFX90A: buffer_atomic_cmpswap_x2 a[6:9], v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x84,0xe1,0x00,0x06,0x82,0x03]
+0xff,0x2f,0x84,0xe1,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_atomic_cmpswap_x2 a[6:9], v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x84,0xe1,0x00,0x06,0x82,0x03]
+0xff,0x1f,0x84,0xe1,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_atomic_cmpswap_x2 a[6:9], off, s[8:11], s3 ; encoding: [0x00,0x00,0x84,0xe1,0x00,0x06,0x82,0x03]
+0x00,0x00,0x84,0xe1,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_atomic_cmpswap_x2 a[6:9], off, s[8:11], s3 ; encoding: [0x00,0x00,0x84,0xe1,0x00,0x06,0x82,0x03]
+0x00,0x00,0x84,0xe1,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_atomic_cmpswap_x2 a[6:9], off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x84,0xe1,0x00,0x06,0x82,0x03]
+0x07,0x00,0x84,0xe1,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_atomic_cmpswap_x2 a[6:9], off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x84,0xe1,0x00,0x06,0x82,0x03]
+0xff,0x4f,0x84,0xe1,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_atomic_cmpswap_x2 a[6:9], off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x86,0xe1,0x00,0x06,0x82,0x03]
+0xff,0x0f,0x86,0xe1,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_atomic_add_x2 a[6:7], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x88,0xe1,0x00,0x06,0x82,0x03]
+0xff,0x0f,0x88,0xe1,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_atomic_add_x2 a[254:255], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x88,0xe1,0x00,0xfe,0x82,0x03]
+0xff,0x0f,0x88,0xe1,0x00,0xfe,0x82,0x03
+
+# GFX90A: buffer_atomic_add_x2 a[6:7], off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x88,0xe1,0x00,0x06,0x83,0x03]
+0xff,0x0f,0x88,0xe1,0x00,0x06,0x83,0x03
+
+# GFX90A: buffer_atomic_add_x2 a[6:7], off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x88,0xe1,0x00,0x06,0x98,0x03]
+0xff,0x0f,0x88,0xe1,0x00,0x06,0x98,0x03
+
+# GFX90A: buffer_atomic_add_x2 a[6:7], off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x88,0xe1,0x00,0x06,0x82,0x65]
+0xff,0x0f,0x88,0xe1,0x00,0x06,0x82,0x65
+
+# GFX90A: buffer_atomic_add_x2 a[6:7], off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x88,0xe1,0x00,0x06,0x82,0x7c]
+0xff,0x0f,0x88,0xe1,0x00,0x06,0x82,0x7c
+
+# GFX90A: buffer_atomic_add_x2 a[6:7], off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x88,0xe1,0x00,0x06,0x82,0x80]
+0xff,0x0f,0x88,0xe1,0x00,0x06,0x82,0x80
+
+# GFX90A: buffer_atomic_add_x2 a[6:7], off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x88,0xe1,0x00,0x06,0x82,0xc1]
+0xff,0x0f,0x88,0xe1,0x00,0x06,0x82,0xc1
+
+# GFX90A: buffer_atomic_add_x2 a[6:7], v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x88,0xe1,0x00,0x06,0x82,0x03]
+0xff,0x2f,0x88,0xe1,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_atomic_add_x2 a[6:7], v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x88,0xe1,0x00,0x06,0x82,0x03]
+0xff,0x1f,0x88,0xe1,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_atomic_add_x2 a[6:7], off, s[8:11], s3 ; encoding: [0x00,0x00,0x88,0xe1,0x00,0x06,0x82,0x03]
+0x00,0x00,0x88,0xe1,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_atomic_add_x2 a[6:7], off, s[8:11], s3 ; encoding: [0x00,0x00,0x88,0xe1,0x00,0x06,0x82,0x03]
+0x00,0x00,0x88,0xe1,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_atomic_add_x2 a[6:7], off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x88,0xe1,0x00,0x06,0x82,0x03]
+0x07,0x00,0x88,0xe1,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_atomic_add_x2 a[6:7], off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x88,0xe1,0x00,0x06,0x82,0x03]
+0xff,0x4f,0x88,0xe1,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_atomic_add_x2 a[6:7], off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x8a,0xe1,0x00,0x06,0x82,0x03]
+0xff,0x0f,0x8a,0xe1,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_atomic_sub_x2 a[6:7], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x8c,0xe1,0x00,0x06,0x82,0x03]
+0xff,0x0f,0x8c,0xe1,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_atomic_sub_x2 a[254:255], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x8c,0xe1,0x00,0xfe,0x82,0x03]
+0xff,0x0f,0x8c,0xe1,0x00,0xfe,0x82,0x03
+
+# GFX90A: buffer_atomic_sub_x2 a[6:7], off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x8c,0xe1,0x00,0x06,0x83,0x03]
+0xff,0x0f,0x8c,0xe1,0x00,0x06,0x83,0x03
+
+# GFX90A: buffer_atomic_sub_x2 a[6:7], off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x8c,0xe1,0x00,0x06,0x98,0x03]
+0xff,0x0f,0x8c,0xe1,0x00,0x06,0x98,0x03
+
+# GFX90A: buffer_atomic_sub_x2 a[6:7], off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x8c,0xe1,0x00,0x06,0x82,0x65]
+0xff,0x0f,0x8c,0xe1,0x00,0x06,0x82,0x65
+
+# GFX90A: buffer_atomic_sub_x2 a[6:7], off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x8c,0xe1,0x00,0x06,0x82,0x7c]
+0xff,0x0f,0x8c,0xe1,0x00,0x06,0x82,0x7c
+
+# GFX90A: buffer_atomic_sub_x2 a[6:7], off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x8c,0xe1,0x00,0x06,0x82,0x80]
+0xff,0x0f,0x8c,0xe1,0x00,0x06,0x82,0x80
+
+# GFX90A: buffer_atomic_sub_x2 a[6:7], off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x8c,0xe1,0x00,0x06,0x82,0xc1]
+0xff,0x0f,0x8c,0xe1,0x00,0x06,0x82,0xc1
+
+# GFX90A: buffer_atomic_sub_x2 a[6:7], v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x8c,0xe1,0x00,0x06,0x82,0x03]
+0xff,0x2f,0x8c,0xe1,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_atomic_sub_x2 a[6:7], v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x8c,0xe1,0x00,0x06,0x82,0x03]
+0xff,0x1f,0x8c,0xe1,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_atomic_sub_x2 a[6:7], off, s[8:11], s3 ; encoding: [0x00,0x00,0x8c,0xe1,0x00,0x06,0x82,0x03]
+0x00,0x00,0x8c,0xe1,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_atomic_sub_x2 a[6:7], off, s[8:11], s3 ; encoding: [0x00,0x00,0x8c,0xe1,0x00,0x06,0x82,0x03]
+0x00,0x00,0x8c,0xe1,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_atomic_sub_x2 a[6:7], off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x8c,0xe1,0x00,0x06,0x82,0x03]
+0x07,0x00,0x8c,0xe1,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_atomic_sub_x2 a[6:7], off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x8c,0xe1,0x00,0x06,0x82,0x03]
+0xff,0x4f,0x8c,0xe1,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_atomic_sub_x2 a[6:7], off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x8e,0xe1,0x00,0x06,0x82,0x03]
+0xff,0x0f,0x8e,0xe1,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_atomic_smin_x2 a[6:7], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x90,0xe1,0x00,0x06,0x82,0x03]
+0xff,0x0f,0x90,0xe1,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_atomic_smin_x2 a[254:255], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x90,0xe1,0x00,0xfe,0x82,0x03]
+0xff,0x0f,0x90,0xe1,0x00,0xfe,0x82,0x03
+
+# GFX90A: buffer_atomic_smin_x2 a[6:7], off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x90,0xe1,0x00,0x06,0x83,0x03]
+0xff,0x0f,0x90,0xe1,0x00,0x06,0x83,0x03
+
+# GFX90A: buffer_atomic_smin_x2 a[6:7], off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x90,0xe1,0x00,0x06,0x98,0x03]
+0xff,0x0f,0x90,0xe1,0x00,0x06,0x98,0x03
+
+# GFX90A: buffer_atomic_smin_x2 a[6:7], off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x90,0xe1,0x00,0x06,0x82,0x65]
+0xff,0x0f,0x90,0xe1,0x00,0x06,0x82,0x65
+
+# GFX90A: buffer_atomic_smin_x2 a[6:7], off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x90,0xe1,0x00,0x06,0x82,0x7c]
+0xff,0x0f,0x90,0xe1,0x00,0x06,0x82,0x7c
+
+# GFX90A: buffer_atomic_smin_x2 a[6:7], off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x90,0xe1,0x00,0x06,0x82,0x80]
+0xff,0x0f,0x90,0xe1,0x00,0x06,0x82,0x80
+
+# GFX90A: buffer_atomic_smin_x2 a[6:7], off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x90,0xe1,0x00,0x06,0x82,0xc1]
+0xff,0x0f,0x90,0xe1,0x00,0x06,0x82,0xc1
+
+# GFX90A: buffer_atomic_smin_x2 a[6:7], v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x90,0xe1,0x00,0x06,0x82,0x03]
+0xff,0x2f,0x90,0xe1,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_atomic_smin_x2 a[6:7], v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x90,0xe1,0x00,0x06,0x82,0x03]
+0xff,0x1f,0x90,0xe1,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_atomic_smin_x2 a[6:7], off, s[8:11], s3 ; encoding: [0x00,0x00,0x90,0xe1,0x00,0x06,0x82,0x03]
+0x00,0x00,0x90,0xe1,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_atomic_smin_x2 a[6:7], off, s[8:11], s3 ; encoding: [0x00,0x00,0x90,0xe1,0x00,0x06,0x82,0x03]
+0x00,0x00,0x90,0xe1,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_atomic_smin_x2 a[6:7], off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x90,0xe1,0x00,0x06,0x82,0x03]
+0x07,0x00,0x90,0xe1,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_atomic_smin_x2 a[6:7], off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x90,0xe1,0x00,0x06,0x82,0x03]
+0xff,0x4f,0x90,0xe1,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_atomic_smin_x2 a[6:7], off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x92,0xe1,0x00,0x06,0x82,0x03]
+0xff,0x0f,0x92,0xe1,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_atomic_umin_x2 a[6:7], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x94,0xe1,0x00,0x06,0x82,0x03]
+0xff,0x0f,0x94,0xe1,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_atomic_umin_x2 a[254:255], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x94,0xe1,0x00,0xfe,0x82,0x03]
+0xff,0x0f,0x94,0xe1,0x00,0xfe,0x82,0x03
+
+# GFX90A: buffer_atomic_umin_x2 a[6:7], off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x94,0xe1,0x00,0x06,0x83,0x03]
+0xff,0x0f,0x94,0xe1,0x00,0x06,0x83,0x03
+
+# GFX90A: buffer_atomic_umin_x2 a[6:7], off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x94,0xe1,0x00,0x06,0x98,0x03]
+0xff,0x0f,0x94,0xe1,0x00,0x06,0x98,0x03
+
+# GFX90A: buffer_atomic_umin_x2 a[6:7], off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x94,0xe1,0x00,0x06,0x82,0x65]
+0xff,0x0f,0x94,0xe1,0x00,0x06,0x82,0x65
+
+# GFX90A: buffer_atomic_umin_x2 a[6:7], off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x94,0xe1,0x00,0x06,0x82,0x7c]
+0xff,0x0f,0x94,0xe1,0x00,0x06,0x82,0x7c
+
+# GFX90A: buffer_atomic_umin_x2 a[6:7], off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x94,0xe1,0x00,0x06,0x82,0x80]
+0xff,0x0f,0x94,0xe1,0x00,0x06,0x82,0x80
+
+# GFX90A: buffer_atomic_umin_x2 a[6:7], off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x94,0xe1,0x00,0x06,0x82,0xc1]
+0xff,0x0f,0x94,0xe1,0x00,0x06,0x82,0xc1
+
+# GFX90A: buffer_atomic_umin_x2 a[6:7], v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x94,0xe1,0x00,0x06,0x82,0x03]
+0xff,0x2f,0x94,0xe1,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_atomic_umin_x2 a[6:7], v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x94,0xe1,0x00,0x06,0x82,0x03]
+0xff,0x1f,0x94,0xe1,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_atomic_umin_x2 a[6:7], off, s[8:11], s3 ; encoding: [0x00,0x00,0x94,0xe1,0x00,0x06,0x82,0x03]
+0x00,0x00,0x94,0xe1,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_atomic_umin_x2 a[6:7], off, s[8:11], s3 ; encoding: [0x00,0x00,0x94,0xe1,0x00,0x06,0x82,0x03]
+0x00,0x00,0x94,0xe1,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_atomic_umin_x2 a[6:7], off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x94,0xe1,0x00,0x06,0x82,0x03]
+0x07,0x00,0x94,0xe1,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_atomic_umin_x2 a[6:7], off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x94,0xe1,0x00,0x06,0x82,0x03]
+0xff,0x4f,0x94,0xe1,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_atomic_umin_x2 a[6:7], off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x96,0xe1,0x00,0x06,0x82,0x03]
+0xff,0x0f,0x96,0xe1,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_atomic_smax_x2 a[6:7], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x98,0xe1,0x00,0x06,0x82,0x03]
+0xff,0x0f,0x98,0xe1,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_atomic_smax_x2 a[254:255], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x98,0xe1,0x00,0xfe,0x82,0x03]
+0xff,0x0f,0x98,0xe1,0x00,0xfe,0x82,0x03
+
+# GFX90A: buffer_atomic_smax_x2 a[6:7], off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x98,0xe1,0x00,0x06,0x83,0x03]
+0xff,0x0f,0x98,0xe1,0x00,0x06,0x83,0x03
+
+# GFX90A: buffer_atomic_smax_x2 a[6:7], off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x98,0xe1,0x00,0x06,0x98,0x03]
+0xff,0x0f,0x98,0xe1,0x00,0x06,0x98,0x03
+
+# GFX90A: buffer_atomic_smax_x2 a[6:7], off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x98,0xe1,0x00,0x06,0x82,0x65]
+0xff,0x0f,0x98,0xe1,0x00,0x06,0x82,0x65
+
+# GFX90A: buffer_atomic_smax_x2 a[6:7], off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x98,0xe1,0x00,0x06,0x82,0x7c]
+0xff,0x0f,0x98,0xe1,0x00,0x06,0x82,0x7c
+
+# GFX90A: buffer_atomic_smax_x2 a[6:7], off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x98,0xe1,0x00,0x06,0x82,0x80]
+0xff,0x0f,0x98,0xe1,0x00,0x06,0x82,0x80
+
+# GFX90A: buffer_atomic_smax_x2 a[6:7], off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x98,0xe1,0x00,0x06,0x82,0xc1]
+0xff,0x0f,0x98,0xe1,0x00,0x06,0x82,0xc1
+
+# GFX90A: buffer_atomic_smax_x2 a[6:7], v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x98,0xe1,0x00,0x06,0x82,0x03]
+0xff,0x2f,0x98,0xe1,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_atomic_smax_x2 a[6:7], v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x98,0xe1,0x00,0x06,0x82,0x03]
+0xff,0x1f,0x98,0xe1,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_atomic_smax_x2 a[6:7], off, s[8:11], s3 ; encoding: [0x00,0x00,0x98,0xe1,0x00,0x06,0x82,0x03]
+0x00,0x00,0x98,0xe1,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_atomic_smax_x2 a[6:7], off, s[8:11], s3 ; encoding: [0x00,0x00,0x98,0xe1,0x00,0x06,0x82,0x03]
+0x00,0x00,0x98,0xe1,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_atomic_smax_x2 a[6:7], off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x98,0xe1,0x00,0x06,0x82,0x03]
+0x07,0x00,0x98,0xe1,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_atomic_smax_x2 a[6:7], off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x98,0xe1,0x00,0x06,0x82,0x03]
+0xff,0x4f,0x98,0xe1,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_atomic_smax_x2 a[6:7], off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x9a,0xe1,0x00,0x06,0x82,0x03]
+0xff,0x0f,0x9a,0xe1,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_atomic_umax_x2 a[6:7], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x9c,0xe1,0x00,0x06,0x82,0x03]
+0xff,0x0f,0x9c,0xe1,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_atomic_umax_x2 a[254:255], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x9c,0xe1,0x00,0xfe,0x82,0x03]
+0xff,0x0f,0x9c,0xe1,0x00,0xfe,0x82,0x03
+
+# GFX90A: buffer_atomic_umax_x2 a[6:7], off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x9c,0xe1,0x00,0x06,0x83,0x03]
+0xff,0x0f,0x9c,0xe1,0x00,0x06,0x83,0x03
+
+# GFX90A: buffer_atomic_umax_x2 a[6:7], off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x9c,0xe1,0x00,0x06,0x98,0x03]
+0xff,0x0f,0x9c,0xe1,0x00,0x06,0x98,0x03
+
+# GFX90A: buffer_atomic_umax_x2 a[6:7], off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x9c,0xe1,0x00,0x06,0x82,0x65]
+0xff,0x0f,0x9c,0xe1,0x00,0x06,0x82,0x65
+
+# GFX90A: buffer_atomic_umax_x2 a[6:7], off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x9c,0xe1,0x00,0x06,0x82,0x7c]
+0xff,0x0f,0x9c,0xe1,0x00,0x06,0x82,0x7c
+
+# GFX90A: buffer_atomic_umax_x2 a[6:7], off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x9c,0xe1,0x00,0x06,0x82,0x80]
+0xff,0x0f,0x9c,0xe1,0x00,0x06,0x82,0x80
+
+# GFX90A: buffer_atomic_umax_x2 a[6:7], off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x9c,0xe1,0x00,0x06,0x82,0xc1]
+0xff,0x0f,0x9c,0xe1,0x00,0x06,0x82,0xc1
+
+# GFX90A: buffer_atomic_umax_x2 a[6:7], v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x9c,0xe1,0x00,0x06,0x82,0x03]
+0xff,0x2f,0x9c,0xe1,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_atomic_umax_x2 a[6:7], v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x9c,0xe1,0x00,0x06,0x82,0x03]
+0xff,0x1f,0x9c,0xe1,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_atomic_umax_x2 a[6:7], off, s[8:11], s3 ; encoding: [0x00,0x00,0x9c,0xe1,0x00,0x06,0x82,0x03]
+0x00,0x00,0x9c,0xe1,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_atomic_umax_x2 a[6:7], off, s[8:11], s3 ; encoding: [0x00,0x00,0x9c,0xe1,0x00,0x06,0x82,0x03]
+0x00,0x00,0x9c,0xe1,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_atomic_umax_x2 a[6:7], off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x9c,0xe1,0x00,0x06,0x82,0x03]
+0x07,0x00,0x9c,0xe1,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_atomic_umax_x2 a[6:7], off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x9c,0xe1,0x00,0x06,0x82,0x03]
+0xff,0x4f,0x9c,0xe1,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_atomic_umax_x2 a[6:7], off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x9e,0xe1,0x00,0x06,0x82,0x03]
+0xff,0x0f,0x9e,0xe1,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_atomic_and_x2 a[6:7], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0xa0,0xe1,0x00,0x06,0x82,0x03]
+0xff,0x0f,0xa0,0xe1,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_atomic_and_x2 a[254:255], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0xa0,0xe1,0x00,0xfe,0x82,0x03]
+0xff,0x0f,0xa0,0xe1,0x00,0xfe,0x82,0x03
+
+# GFX90A: buffer_atomic_and_x2 a[6:7], off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0xa0,0xe1,0x00,0x06,0x83,0x03]
+0xff,0x0f,0xa0,0xe1,0x00,0x06,0x83,0x03
+
+# GFX90A: buffer_atomic_and_x2 a[6:7], off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0xa0,0xe1,0x00,0x06,0x98,0x03]
+0xff,0x0f,0xa0,0xe1,0x00,0x06,0x98,0x03
+
+# GFX90A: buffer_atomic_and_x2 a[6:7], off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0xa0,0xe1,0x00,0x06,0x82,0x65]
+0xff,0x0f,0xa0,0xe1,0x00,0x06,0x82,0x65
+
+# GFX90A: buffer_atomic_and_x2 a[6:7], off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0xa0,0xe1,0x00,0x06,0x82,0x7c]
+0xff,0x0f,0xa0,0xe1,0x00,0x06,0x82,0x7c
+
+# GFX90A: buffer_atomic_and_x2 a[6:7], off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0xa0,0xe1,0x00,0x06,0x82,0x80]
+0xff,0x0f,0xa0,0xe1,0x00,0x06,0x82,0x80
+
+# GFX90A: buffer_atomic_and_x2 a[6:7], off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0xa0,0xe1,0x00,0x06,0x82,0xc1]
+0xff,0x0f,0xa0,0xe1,0x00,0x06,0x82,0xc1
+
+# GFX90A: buffer_atomic_and_x2 a[6:7], v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0xa0,0xe1,0x00,0x06,0x82,0x03]
+0xff,0x2f,0xa0,0xe1,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_atomic_and_x2 a[6:7], v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0xa0,0xe1,0x00,0x06,0x82,0x03]
+0xff,0x1f,0xa0,0xe1,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_atomic_and_x2 a[6:7], off, s[8:11], s3 ; encoding: [0x00,0x00,0xa0,0xe1,0x00,0x06,0x82,0x03]
+0x00,0x00,0xa0,0xe1,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_atomic_and_x2 a[6:7], off, s[8:11], s3 ; encoding: [0x00,0x00,0xa0,0xe1,0x00,0x06,0x82,0x03]
+0x00,0x00,0xa0,0xe1,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_atomic_and_x2 a[6:7], off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0xa0,0xe1,0x00,0x06,0x82,0x03]
+0x07,0x00,0xa0,0xe1,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_atomic_and_x2 a[6:7], off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0xa0,0xe1,0x00,0x06,0x82,0x03]
+0xff,0x4f,0xa0,0xe1,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_atomic_and_x2 a[6:7], off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0xa2,0xe1,0x00,0x06,0x82,0x03]
+0xff,0x0f,0xa2,0xe1,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_atomic_or_x2 a[6:7], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0xa4,0xe1,0x00,0x06,0x82,0x03]
+0xff,0x0f,0xa4,0xe1,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_atomic_or_x2 a[254:255], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0xa4,0xe1,0x00,0xfe,0x82,0x03]
+0xff,0x0f,0xa4,0xe1,0x00,0xfe,0x82,0x03
+
+# GFX90A: buffer_atomic_or_x2 a[6:7], off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0xa4,0xe1,0x00,0x06,0x83,0x03]
+0xff,0x0f,0xa4,0xe1,0x00,0x06,0x83,0x03
+
+# GFX90A: buffer_atomic_or_x2 a[6:7], off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0xa4,0xe1,0x00,0x06,0x98,0x03]
+0xff,0x0f,0xa4,0xe1,0x00,0x06,0x98,0x03
+
+# GFX90A: buffer_atomic_or_x2 a[6:7], off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0xa4,0xe1,0x00,0x06,0x82,0x65]
+0xff,0x0f,0xa4,0xe1,0x00,0x06,0x82,0x65
+
+# GFX90A: buffer_atomic_or_x2 a[6:7], off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0xa4,0xe1,0x00,0x06,0x82,0x7c]
+0xff,0x0f,0xa4,0xe1,0x00,0x06,0x82,0x7c
+
+# GFX90A: buffer_atomic_or_x2 a[6:7], off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0xa4,0xe1,0x00,0x06,0x82,0x80]
+0xff,0x0f,0xa4,0xe1,0x00,0x06,0x82,0x80
+
+# GFX90A: buffer_atomic_or_x2 a[6:7], off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0xa4,0xe1,0x00,0x06,0x82,0xc1]
+0xff,0x0f,0xa4,0xe1,0x00,0x06,0x82,0xc1
+
+# GFX90A: buffer_atomic_or_x2 a[6:7], v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0xa4,0xe1,0x00,0x06,0x82,0x03]
+0xff,0x2f,0xa4,0xe1,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_atomic_or_x2 a[6:7], v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0xa4,0xe1,0x00,0x06,0x82,0x03]
+0xff,0x1f,0xa4,0xe1,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_atomic_or_x2 a[6:7], off, s[8:11], s3 ; encoding: [0x00,0x00,0xa4,0xe1,0x00,0x06,0x82,0x03]
+0x00,0x00,0xa4,0xe1,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_atomic_or_x2 a[6:7], off, s[8:11], s3 ; encoding: [0x00,0x00,0xa4,0xe1,0x00,0x06,0x82,0x03]
+0x00,0x00,0xa4,0xe1,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_atomic_or_x2 a[6:7], off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0xa4,0xe1,0x00,0x06,0x82,0x03]
+0x07,0x00,0xa4,0xe1,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_atomic_or_x2 a[6:7], off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0xa4,0xe1,0x00,0x06,0x82,0x03]
+0xff,0x4f,0xa4,0xe1,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_atomic_or_x2 a[6:7], off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0xa6,0xe1,0x00,0x06,0x82,0x03]
+0xff,0x0f,0xa6,0xe1,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_atomic_xor_x2 a[6:7], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0xa8,0xe1,0x00,0x06,0x82,0x03]
+0xff,0x0f,0xa8,0xe1,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_atomic_xor_x2 a[254:255], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0xa8,0xe1,0x00,0xfe,0x82,0x03]
+0xff,0x0f,0xa8,0xe1,0x00,0xfe,0x82,0x03
+
+# GFX90A: buffer_atomic_xor_x2 a[6:7], off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0xa8,0xe1,0x00,0x06,0x83,0x03]
+0xff,0x0f,0xa8,0xe1,0x00,0x06,0x83,0x03
+
+# GFX90A: buffer_atomic_xor_x2 a[6:7], off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0xa8,0xe1,0x00,0x06,0x98,0x03]
+0xff,0x0f,0xa8,0xe1,0x00,0x06,0x98,0x03
+
+# GFX90A: buffer_atomic_xor_x2 a[6:7], off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0xa8,0xe1,0x00,0x06,0x82,0x65]
+0xff,0x0f,0xa8,0xe1,0x00,0x06,0x82,0x65
+
+# GFX90A: buffer_atomic_xor_x2 a[6:7], off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0xa8,0xe1,0x00,0x06,0x82,0x7c]
+0xff,0x0f,0xa8,0xe1,0x00,0x06,0x82,0x7c
+
+# GFX90A: buffer_atomic_xor_x2 a[6:7], off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0xa8,0xe1,0x00,0x06,0x82,0x80]
+0xff,0x0f,0xa8,0xe1,0x00,0x06,0x82,0x80
+
+# GFX90A: buffer_atomic_xor_x2 a[6:7], off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0xa8,0xe1,0x00,0x06,0x82,0xc1]
+0xff,0x0f,0xa8,0xe1,0x00,0x06,0x82,0xc1
+
+# GFX90A: buffer_atomic_xor_x2 a[6:7], v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0xa8,0xe1,0x00,0x06,0x82,0x03]
+0xff,0x2f,0xa8,0xe1,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_atomic_xor_x2 a[6:7], v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0xa8,0xe1,0x00,0x06,0x82,0x03]
+0xff,0x1f,0xa8,0xe1,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_atomic_xor_x2 a[6:7], off, s[8:11], s3 ; encoding: [0x00,0x00,0xa8,0xe1,0x00,0x06,0x82,0x03]
+0x00,0x00,0xa8,0xe1,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_atomic_xor_x2 a[6:7], off, s[8:11], s3 ; encoding: [0x00,0x00,0xa8,0xe1,0x00,0x06,0x82,0x03]
+0x00,0x00,0xa8,0xe1,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_atomic_xor_x2 a[6:7], off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0xa8,0xe1,0x00,0x06,0x82,0x03]
+0x07,0x00,0xa8,0xe1,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_atomic_xor_x2 a[6:7], off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0xa8,0xe1,0x00,0x06,0x82,0x03]
+0xff,0x4f,0xa8,0xe1,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_atomic_xor_x2 a[6:7], off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0xaa,0xe1,0x00,0x06,0x82,0x03]
+0xff,0x0f,0xaa,0xe1,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_atomic_inc_x2 a[6:7], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0xac,0xe1,0x00,0x06,0x82,0x03]
+0xff,0x0f,0xac,0xe1,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_atomic_inc_x2 a[254:255], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0xac,0xe1,0x00,0xfe,0x82,0x03]
+0xff,0x0f,0xac,0xe1,0x00,0xfe,0x82,0x03
+
+# GFX90A: buffer_atomic_inc_x2 a[6:7], off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0xac,0xe1,0x00,0x06,0x83,0x03]
+0xff,0x0f,0xac,0xe1,0x00,0x06,0x83,0x03
+
+# GFX90A: buffer_atomic_inc_x2 a[6:7], off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0xac,0xe1,0x00,0x06,0x98,0x03]
+0xff,0x0f,0xac,0xe1,0x00,0x06,0x98,0x03
+
+# GFX90A: buffer_atomic_inc_x2 a[6:7], off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0xac,0xe1,0x00,0x06,0x82,0x65]
+0xff,0x0f,0xac,0xe1,0x00,0x06,0x82,0x65
+
+# GFX90A: buffer_atomic_inc_x2 a[6:7], off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0xac,0xe1,0x00,0x06,0x82,0x7c]
+0xff,0x0f,0xac,0xe1,0x00,0x06,0x82,0x7c
+
+# GFX90A: buffer_atomic_inc_x2 a[6:7], off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0xac,0xe1,0x00,0x06,0x82,0x80]
+0xff,0x0f,0xac,0xe1,0x00,0x06,0x82,0x80
+
+# GFX90A: buffer_atomic_inc_x2 a[6:7], off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0xac,0xe1,0x00,0x06,0x82,0xc1]
+0xff,0x0f,0xac,0xe1,0x00,0x06,0x82,0xc1
+
+# GFX90A: buffer_atomic_inc_x2 a[6:7], v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0xac,0xe1,0x00,0x06,0x82,0x03]
+0xff,0x2f,0xac,0xe1,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_atomic_inc_x2 a[6:7], v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0xac,0xe1,0x00,0x06,0x82,0x03]
+0xff,0x1f,0xac,0xe1,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_atomic_inc_x2 a[6:7], off, s[8:11], s3 ; encoding: [0x00,0x00,0xac,0xe1,0x00,0x06,0x82,0x03]
+0x00,0x00,0xac,0xe1,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_atomic_inc_x2 a[6:7], off, s[8:11], s3 ; encoding: [0x00,0x00,0xac,0xe1,0x00,0x06,0x82,0x03]
+0x00,0x00,0xac,0xe1,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_atomic_inc_x2 a[6:7], off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0xac,0xe1,0x00,0x06,0x82,0x03]
+0x07,0x00,0xac,0xe1,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_atomic_inc_x2 a[6:7], off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0xac,0xe1,0x00,0x06,0x82,0x03]
+0xff,0x4f,0xac,0xe1,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_atomic_inc_x2 a[6:7], off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0xae,0xe1,0x00,0x06,0x82,0x03]
+0xff,0x0f,0xae,0xe1,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_atomic_dec_x2 a[6:7], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0xb0,0xe1,0x00,0x06,0x82,0x03]
+0xff,0x0f,0xb0,0xe1,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_atomic_dec_x2 a[254:255], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0xb0,0xe1,0x00,0xfe,0x82,0x03]
+0xff,0x0f,0xb0,0xe1,0x00,0xfe,0x82,0x03
+
+# GFX90A: buffer_atomic_dec_x2 a[6:7], off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0xb0,0xe1,0x00,0x06,0x83,0x03]
+0xff,0x0f,0xb0,0xe1,0x00,0x06,0x83,0x03
+
+# GFX90A: buffer_atomic_dec_x2 a[6:7], off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0xb0,0xe1,0x00,0x06,0x98,0x03]
+0xff,0x0f,0xb0,0xe1,0x00,0x06,0x98,0x03
+
+# GFX90A: buffer_atomic_dec_x2 a[6:7], off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0xb0,0xe1,0x00,0x06,0x82,0x65]
+0xff,0x0f,0xb0,0xe1,0x00,0x06,0x82,0x65
+
+# GFX90A: buffer_atomic_dec_x2 a[6:7], off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0xb0,0xe1,0x00,0x06,0x82,0x7c]
+0xff,0x0f,0xb0,0xe1,0x00,0x06,0x82,0x7c
+
+# GFX90A: buffer_atomic_dec_x2 a[6:7], off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0xb0,0xe1,0x00,0x06,0x82,0x80]
+0xff,0x0f,0xb0,0xe1,0x00,0x06,0x82,0x80
+
+# GFX90A: buffer_atomic_dec_x2 a[6:7], off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0xb0,0xe1,0x00,0x06,0x82,0xc1]
+0xff,0x0f,0xb0,0xe1,0x00,0x06,0x82,0xc1
+
+# GFX90A: buffer_atomic_dec_x2 a[6:7], v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0xb0,0xe1,0x00,0x06,0x82,0x03]
+0xff,0x2f,0xb0,0xe1,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_atomic_dec_x2 a[6:7], v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0xb0,0xe1,0x00,0x06,0x82,0x03]
+0xff,0x1f,0xb0,0xe1,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_atomic_dec_x2 a[6:7], off, s[8:11], s3 ; encoding: [0x00,0x00,0xb0,0xe1,0x00,0x06,0x82,0x03]
+0x00,0x00,0xb0,0xe1,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_atomic_dec_x2 a[6:7], off, s[8:11], s3 ; encoding: [0x00,0x00,0xb0,0xe1,0x00,0x06,0x82,0x03]
+0x00,0x00,0xb0,0xe1,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_atomic_dec_x2 a[6:7], off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0xb0,0xe1,0x00,0x06,0x82,0x03]
+0x07,0x00,0xb0,0xe1,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_atomic_dec_x2 a[6:7], off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0xb0,0xe1,0x00,0x06,0x82,0x03]
+0xff,0x4f,0xb0,0xe1,0x00,0x06,0x82,0x03
+
+# GFX90A: buffer_atomic_dec_x2 a[6:7], off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0xb2,0xe1,0x00,0x06,0x82,0x03]
+0xff,0x0f,0xb2,0xe1,0x00,0x06,0x82,0x03
+
+# GFX90A: tbuffer_load_format_x a1, off, s[4:7], s1 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_USCALED] ; encoding: [0x00,0x00,0x78,0xe9,0x00,0x01,0x81,0x01]
+0x00,0x00,0x78,0xe9,0x00,0x01,0x81,0x01
+
+# GFX90A: tbuffer_load_format_xy a[2:3], off, s[4:7], s1 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_USCALED] ; encoding: [0x00,0x80,0x78,0xe9,0x00,0x02,0x81,0x01]
+0x00,0x80,0x78,0xe9,0x00,0x02,0x81,0x01
+
+# GFX90A: tbuffer_load_format_xyz a[2:4], off, s[4:7], s1 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_USCALED] ; encoding: [0x00,0x00,0x79,0xe9,0x00,0x02,0x81,0x01]
+0x00,0x00,0x79,0xe9,0x00,0x02,0x81,0x01
+
+# GFX90A: tbuffer_load_format_xyzw a[2:5], off, s[4:7], s1 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_USCALED] ; encoding: [0x00,0x80,0x79,0xe9,0x00,0x02,0x81,0x01]
+0x00,0x80,0x79,0xe9,0x00,0x02,0x81,0x01
+
+# GFX90A: tbuffer_store_format_x a1, off, s[4:7], s1 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_USCALED] ; encoding: [0x00,0x00,0x7a,0xe9,0x00,0x01,0x81,0x01]
+0x00,0x00,0x7a,0xe9,0x00,0x01,0x81,0x01
+
+# GFX90A: tbuffer_store_format_xy a[2:3], off, s[4:7], s1 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_USCALED] ; encoding: [0x00,0x80,0x7a,0xe9,0x00,0x02,0x81,0x01]
+0x00,0x80,0x7a,0xe9,0x00,0x02,0x81,0x01
+
+# GFX90A: tbuffer_store_format_xyzw a[2:5], off, s[4:7], s1 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_USCALED] ; encoding: [0x00,0x80,0x7b,0xe9,0x00,0x02,0x81,0x01]
+0x00,0x80,0x7b,0xe9,0x00,0x02,0x81,0x01
+
+# GFX90A: tbuffer_store_format_xyzw a[2:5], off, ttmp[4:7], ttmp1 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_USCALED] ; encoding: [0x00,0x80,0x7b,0xe9,0x00,0x02,0x9c,0x6d]
+0x00,0x80,0x7b,0xe9,0x00,0x02,0x9c,0x6d
+
+# GFX90A: tbuffer_store_format_xyzw a[2:5], off, ttmp[4:7], ttmp1 format:[BUF_DATA_FORMAT_RESERVED_15] ; encoding: [0x00,0x80,0x7b,0xe8,0x00,0x02,0x9c,0x6d]
+0x00,0x80,0x7b,0xe8,0x00,0x02,0x9c,0x6d
+
+# GFX90A: tbuffer_store_format_xyzw a[2:5], off, ttmp[4:7], ttmp1 format:[BUF_DATA_FORMAT_INVALID,BUF_NUM_FORMAT_USCALED] ; encoding: [0x00,0x80,0x03,0xe9,0x00,0x02,0x9c,0x6d]
+0x00,0x80,0x03,0xe9,0x00,0x02,0x9c,0x6d
+
+# GFX90A: tbuffer_store_format_xyzw a[2:5], off, ttmp[4:7], ttmp1 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_USCALED] ; encoding: [0x00,0x80,0x7b,0xe9,0x00,0x02,0x9c,0x6d]
+0x00,0x80,0x7b,0xe9,0x00,0x02,0x9c,0x6d
+
+# GFX90A: ds_add_u32 v1, a2 offset:65535  ; encoding: [0xff,0xff,0x00,0xda,0x01,0x02,0x00,0x00]
+0xff,0xff,0x00,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_add_u32 v255, a2 offset:65535 ; encoding: [0xff,0xff,0x00,0xda,0xff,0x02,0x00,0x00]
+0xff,0xff,0x00,0xda,0xff,0x02,0x00,0x00
+
+# GFX90A: ds_add_u32 v1, a255 offset:65535 ; encoding: [0xff,0xff,0x00,0xda,0x01,0xff,0x00,0x00]
+0xff,0xff,0x00,0xda,0x01,0xff,0x00,0x00
+
+# GFX90A: ds_add_u32 v1, a2               ; encoding: [0x00,0x00,0x00,0xda,0x01,0x02,0x00,0x00]
+0x00,0x00,0x00,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_add_u32 v1, a2               ; encoding: [0x00,0x00,0x00,0xda,0x01,0x02,0x00,0x00]
+0x00,0x00,0x00,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_add_u32 v1, a2 offset:4      ; encoding: [0x04,0x00,0x00,0xda,0x01,0x02,0x00,0x00]
+0x04,0x00,0x00,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_add_u32 v1, a2 offset:65535 gds ; encoding: [0xff,0xff,0x01,0xda,0x01,0x02,0x00,0x00]
+0xff,0xff,0x01,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_sub_u32 v1, a2 offset:65535  ; encoding: [0xff,0xff,0x02,0xda,0x01,0x02,0x00,0x00]
+0xff,0xff,0x02,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_sub_u32 v255, a2 offset:65535 ; encoding: [0xff,0xff,0x02,0xda,0xff,0x02,0x00,0x00]
+0xff,0xff,0x02,0xda,0xff,0x02,0x00,0x00
+
+# GFX90A: ds_sub_u32 v1, a255 offset:65535 ; encoding: [0xff,0xff,0x02,0xda,0x01,0xff,0x00,0x00]
+0xff,0xff,0x02,0xda,0x01,0xff,0x00,0x00
+
+# GFX90A: ds_sub_u32 v1, a2               ; encoding: [0x00,0x00,0x02,0xda,0x01,0x02,0x00,0x00]
+0x00,0x00,0x02,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_sub_u32 v1, a2               ; encoding: [0x00,0x00,0x02,0xda,0x01,0x02,0x00,0x00]
+0x00,0x00,0x02,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_sub_u32 v1, a2 offset:4      ; encoding: [0x04,0x00,0x02,0xda,0x01,0x02,0x00,0x00]
+0x04,0x00,0x02,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_sub_u32 v1, a2 offset:65535 gds ; encoding: [0xff,0xff,0x03,0xda,0x01,0x02,0x00,0x00]
+0xff,0xff,0x03,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_rsub_u32 v1, a2 offset:65535 ; encoding: [0xff,0xff,0x04,0xda,0x01,0x02,0x00,0x00]
+0xff,0xff,0x04,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_rsub_u32 v255, a2 offset:65535 ; encoding: [0xff,0xff,0x04,0xda,0xff,0x02,0x00,0x00]
+0xff,0xff,0x04,0xda,0xff,0x02,0x00,0x00
+
+# GFX90A: ds_rsub_u32 v1, a255 offset:65535 ; encoding: [0xff,0xff,0x04,0xda,0x01,0xff,0x00,0x00]
+0xff,0xff,0x04,0xda,0x01,0xff,0x00,0x00
+
+# GFX90A: ds_rsub_u32 v1, a2              ; encoding: [0x00,0x00,0x04,0xda,0x01,0x02,0x00,0x00]
+0x00,0x00,0x04,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_rsub_u32 v1, a2              ; encoding: [0x00,0x00,0x04,0xda,0x01,0x02,0x00,0x00]
+0x00,0x00,0x04,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_rsub_u32 v1, a2 offset:4     ; encoding: [0x04,0x00,0x04,0xda,0x01,0x02,0x00,0x00]
+0x04,0x00,0x04,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_rsub_u32 v1, a2 offset:65535 gds ; encoding: [0xff,0xff,0x05,0xda,0x01,0x02,0x00,0x00]
+0xff,0xff,0x05,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_inc_u32 v1, a2 offset:65535  ; encoding: [0xff,0xff,0x06,0xda,0x01,0x02,0x00,0x00]
+0xff,0xff,0x06,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_inc_u32 v255, a2 offset:65535 ; encoding: [0xff,0xff,0x06,0xda,0xff,0x02,0x00,0x00]
+0xff,0xff,0x06,0xda,0xff,0x02,0x00,0x00
+
+# GFX90A: ds_inc_u32 v1, a255 offset:65535 ; encoding: [0xff,0xff,0x06,0xda,0x01,0xff,0x00,0x00]
+0xff,0xff,0x06,0xda,0x01,0xff,0x00,0x00
+
+# GFX90A: ds_inc_u32 v1, a2               ; encoding: [0x00,0x00,0x06,0xda,0x01,0x02,0x00,0x00]
+0x00,0x00,0x06,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_inc_u32 v1, a2               ; encoding: [0x00,0x00,0x06,0xda,0x01,0x02,0x00,0x00]
+0x00,0x00,0x06,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_inc_u32 v1, a2 offset:4      ; encoding: [0x04,0x00,0x06,0xda,0x01,0x02,0x00,0x00]
+0x04,0x00,0x06,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_inc_u32 v1, a2 offset:65535 gds ; encoding: [0xff,0xff,0x07,0xda,0x01,0x02,0x00,0x00]
+0xff,0xff,0x07,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_dec_u32 v1, a2 offset:65535  ; encoding: [0xff,0xff,0x08,0xda,0x01,0x02,0x00,0x00]
+0xff,0xff,0x08,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_dec_u32 v255, a2 offset:65535 ; encoding: [0xff,0xff,0x08,0xda,0xff,0x02,0x00,0x00]
+0xff,0xff,0x08,0xda,0xff,0x02,0x00,0x00
+
+# GFX90A: ds_dec_u32 v1, a255 offset:65535 ; encoding: [0xff,0xff,0x08,0xda,0x01,0xff,0x00,0x00]
+0xff,0xff,0x08,0xda,0x01,0xff,0x00,0x00
+
+# GFX90A: ds_dec_u32 v1, a2               ; encoding: [0x00,0x00,0x08,0xda,0x01,0x02,0x00,0x00]
+0x00,0x00,0x08,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_dec_u32 v1, a2               ; encoding: [0x00,0x00,0x08,0xda,0x01,0x02,0x00,0x00]
+0x00,0x00,0x08,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_dec_u32 v1, a2 offset:4      ; encoding: [0x04,0x00,0x08,0xda,0x01,0x02,0x00,0x00]
+0x04,0x00,0x08,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_dec_u32 v1, a2 offset:65535 gds ; encoding: [0xff,0xff,0x09,0xda,0x01,0x02,0x00,0x00]
+0xff,0xff,0x09,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_min_i32 v1, a2 offset:65535  ; encoding: [0xff,0xff,0x0a,0xda,0x01,0x02,0x00,0x00]
+0xff,0xff,0x0a,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_min_i32 v255, a2 offset:65535 ; encoding: [0xff,0xff,0x0a,0xda,0xff,0x02,0x00,0x00]
+0xff,0xff,0x0a,0xda,0xff,0x02,0x00,0x00
+
+# GFX90A: ds_min_i32 v1, a255 offset:65535 ; encoding: [0xff,0xff,0x0a,0xda,0x01,0xff,0x00,0x00]
+0xff,0xff,0x0a,0xda,0x01,0xff,0x00,0x00
+
+# GFX90A: ds_min_i32 v1, a2               ; encoding: [0x00,0x00,0x0a,0xda,0x01,0x02,0x00,0x00]
+0x00,0x00,0x0a,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_min_i32 v1, a2               ; encoding: [0x00,0x00,0x0a,0xda,0x01,0x02,0x00,0x00]
+0x00,0x00,0x0a,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_min_i32 v1, a2 offset:4      ; encoding: [0x04,0x00,0x0a,0xda,0x01,0x02,0x00,0x00]
+0x04,0x00,0x0a,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_min_i32 v1, a2 offset:65535 gds ; encoding: [0xff,0xff,0x0b,0xda,0x01,0x02,0x00,0x00]
+0xff,0xff,0x0b,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_max_i32 v1, a2 offset:65535  ; encoding: [0xff,0xff,0x0c,0xda,0x01,0x02,0x00,0x00]
+0xff,0xff,0x0c,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_max_i32 v255, a2 offset:65535 ; encoding: [0xff,0xff,0x0c,0xda,0xff,0x02,0x00,0x00]
+0xff,0xff,0x0c,0xda,0xff,0x02,0x00,0x00
+
+# GFX90A: ds_max_i32 v1, a255 offset:65535 ; encoding: [0xff,0xff,0x0c,0xda,0x01,0xff,0x00,0x00]
+0xff,0xff,0x0c,0xda,0x01,0xff,0x00,0x00
+
+# GFX90A: ds_max_i32 v1, a2               ; encoding: [0x00,0x00,0x0c,0xda,0x01,0x02,0x00,0x00]
+0x00,0x00,0x0c,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_max_i32 v1, a2               ; encoding: [0x00,0x00,0x0c,0xda,0x01,0x02,0x00,0x00]
+0x00,0x00,0x0c,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_max_i32 v1, a2 offset:4      ; encoding: [0x04,0x00,0x0c,0xda,0x01,0x02,0x00,0x00]
+0x04,0x00,0x0c,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_max_i32 v1, a2 offset:65535 gds ; encoding: [0xff,0xff,0x0d,0xda,0x01,0x02,0x00,0x00]
+0xff,0xff,0x0d,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_min_u32 v1, a2 offset:65535  ; encoding: [0xff,0xff,0x0e,0xda,0x01,0x02,0x00,0x00]
+0xff,0xff,0x0e,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_min_u32 v255, a2 offset:65535 ; encoding: [0xff,0xff,0x0e,0xda,0xff,0x02,0x00,0x00]
+0xff,0xff,0x0e,0xda,0xff,0x02,0x00,0x00
+
+# GFX90A: ds_min_u32 v1, a255 offset:65535 ; encoding: [0xff,0xff,0x0e,0xda,0x01,0xff,0x00,0x00]
+0xff,0xff,0x0e,0xda,0x01,0xff,0x00,0x00
+
+# GFX90A: ds_min_u32 v1, a2               ; encoding: [0x00,0x00,0x0e,0xda,0x01,0x02,0x00,0x00]
+0x00,0x00,0x0e,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_min_u32 v1, a2               ; encoding: [0x00,0x00,0x0e,0xda,0x01,0x02,0x00,0x00]
+0x00,0x00,0x0e,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_min_u32 v1, a2 offset:4      ; encoding: [0x04,0x00,0x0e,0xda,0x01,0x02,0x00,0x00]
+0x04,0x00,0x0e,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_min_u32 v1, a2 offset:65535 gds ; encoding: [0xff,0xff,0x0f,0xda,0x01,0x02,0x00,0x00]
+0xff,0xff,0x0f,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_max_u32 v1, a2 offset:65535  ; encoding: [0xff,0xff,0x10,0xda,0x01,0x02,0x00,0x00]
+0xff,0xff,0x10,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_max_u32 v255, a2 offset:65535 ; encoding: [0xff,0xff,0x10,0xda,0xff,0x02,0x00,0x00]
+0xff,0xff,0x10,0xda,0xff,0x02,0x00,0x00
+
+# GFX90A: ds_max_u32 v1, a255 offset:65535 ; encoding: [0xff,0xff,0x10,0xda,0x01,0xff,0x00,0x00]
+0xff,0xff,0x10,0xda,0x01,0xff,0x00,0x00
+
+# GFX90A: ds_max_u32 v1, a2               ; encoding: [0x00,0x00,0x10,0xda,0x01,0x02,0x00,0x00]
+0x00,0x00,0x10,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_max_u32 v1, a2               ; encoding: [0x00,0x00,0x10,0xda,0x01,0x02,0x00,0x00]
+0x00,0x00,0x10,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_max_u32 v1, a2 offset:4      ; encoding: [0x04,0x00,0x10,0xda,0x01,0x02,0x00,0x00]
+0x04,0x00,0x10,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_max_u32 v1, a2 offset:65535 gds ; encoding: [0xff,0xff,0x11,0xda,0x01,0x02,0x00,0x00]
+0xff,0xff,0x11,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_and_b32 v1, a2 offset:65535  ; encoding: [0xff,0xff,0x12,0xda,0x01,0x02,0x00,0x00]
+0xff,0xff,0x12,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_and_b32 v255, a2 offset:65535 ; encoding: [0xff,0xff,0x12,0xda,0xff,0x02,0x00,0x00]
+0xff,0xff,0x12,0xda,0xff,0x02,0x00,0x00
+
+# GFX90A: ds_and_b32 v1, a255 offset:65535 ; encoding: [0xff,0xff,0x12,0xda,0x01,0xff,0x00,0x00]
+0xff,0xff,0x12,0xda,0x01,0xff,0x00,0x00
+
+# GFX90A: ds_and_b32 v1, a2               ; encoding: [0x00,0x00,0x12,0xda,0x01,0x02,0x00,0x00]
+0x00,0x00,0x12,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_and_b32 v1, a2               ; encoding: [0x00,0x00,0x12,0xda,0x01,0x02,0x00,0x00]
+0x00,0x00,0x12,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_and_b32 v1, a2 offset:4      ; encoding: [0x04,0x00,0x12,0xda,0x01,0x02,0x00,0x00]
+0x04,0x00,0x12,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_and_b32 v1, a2 offset:65535 gds ; encoding: [0xff,0xff,0x13,0xda,0x01,0x02,0x00,0x00]
+0xff,0xff,0x13,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_or_b32 v1, a2 offset:65535   ; encoding: [0xff,0xff,0x14,0xda,0x01,0x02,0x00,0x00]
+0xff,0xff,0x14,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_or_b32 v255, a2 offset:65535 ; encoding: [0xff,0xff,0x14,0xda,0xff,0x02,0x00,0x00]
+0xff,0xff,0x14,0xda,0xff,0x02,0x00,0x00
+
+# GFX90A: ds_or_b32 v1, a255 offset:65535 ; encoding: [0xff,0xff,0x14,0xda,0x01,0xff,0x00,0x00]
+0xff,0xff,0x14,0xda,0x01,0xff,0x00,0x00
+
+# GFX90A: ds_or_b32 v1, a2                ; encoding: [0x00,0x00,0x14,0xda,0x01,0x02,0x00,0x00]
+0x00,0x00,0x14,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_or_b32 v1, a2                ; encoding: [0x00,0x00,0x14,0xda,0x01,0x02,0x00,0x00]
+0x00,0x00,0x14,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_or_b32 v1, a2 offset:4       ; encoding: [0x04,0x00,0x14,0xda,0x01,0x02,0x00,0x00]
+0x04,0x00,0x14,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_or_b32 v1, a2 offset:65535 gds ; encoding: [0xff,0xff,0x15,0xda,0x01,0x02,0x00,0x00]
+0xff,0xff,0x15,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_xor_b32 v1, a2 offset:65535  ; encoding: [0xff,0xff,0x16,0xda,0x01,0x02,0x00,0x00]
+0xff,0xff,0x16,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_xor_b32 v255, a2 offset:65535 ; encoding: [0xff,0xff,0x16,0xda,0xff,0x02,0x00,0x00]
+0xff,0xff,0x16,0xda,0xff,0x02,0x00,0x00
+
+# GFX90A: ds_xor_b32 v1, a255 offset:65535 ; encoding: [0xff,0xff,0x16,0xda,0x01,0xff,0x00,0x00]
+0xff,0xff,0x16,0xda,0x01,0xff,0x00,0x00
+
+# GFX90A: ds_xor_b32 v1, a2               ; encoding: [0x00,0x00,0x16,0xda,0x01,0x02,0x00,0x00]
+0x00,0x00,0x16,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_xor_b32 v1, a2               ; encoding: [0x00,0x00,0x16,0xda,0x01,0x02,0x00,0x00]
+0x00,0x00,0x16,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_xor_b32 v1, a2 offset:4      ; encoding: [0x04,0x00,0x16,0xda,0x01,0x02,0x00,0x00]
+0x04,0x00,0x16,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_xor_b32 v1, a2 offset:65535 gds ; encoding: [0xff,0xff,0x17,0xda,0x01,0x02,0x00,0x00]
+0xff,0xff,0x17,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_mskor_b32 v1, a2, a3 offset:65535 ; encoding: [0xff,0xff,0x18,0xda,0x01,0x02,0x03,0x00]
+0xff,0xff,0x18,0xda,0x01,0x02,0x03,0x00
+
+# GFX90A: ds_mskor_b32 v255, a2, a3 offset:65535 ; encoding: [0xff,0xff,0x18,0xda,0xff,0x02,0x03,0x00]
+0xff,0xff,0x18,0xda,0xff,0x02,0x03,0x00
+
+# GFX90A: ds_mskor_b32 v1, a255, a3 offset:65535 ; encoding: [0xff,0xff,0x18,0xda,0x01,0xff,0x03,0x00]
+0xff,0xff,0x18,0xda,0x01,0xff,0x03,0x00
+
+# GFX90A: ds_mskor_b32 v1, a2, a255 offset:65535 ; encoding: [0xff,0xff,0x18,0xda,0x01,0x02,0xff,0x00]
+0xff,0xff,0x18,0xda,0x01,0x02,0xff,0x00
+
+# GFX90A: ds_mskor_b32 v1, a2, a3         ; encoding: [0x00,0x00,0x18,0xda,0x01,0x02,0x03,0x00]
+0x00,0x00,0x18,0xda,0x01,0x02,0x03,0x00
+
+# GFX90A: ds_mskor_b32 v1, a2, a3         ; encoding: [0x00,0x00,0x18,0xda,0x01,0x02,0x03,0x00]
+0x00,0x00,0x18,0xda,0x01,0x02,0x03,0x00
+
+# GFX90A: ds_mskor_b32 v1, a2, a3 offset:4 ; encoding: [0x04,0x00,0x18,0xda,0x01,0x02,0x03,0x00]
+0x04,0x00,0x18,0xda,0x01,0x02,0x03,0x00
+
+# GFX90A: ds_mskor_b32 v1, a2, a3 offset:65535 gds ; encoding: [0xff,0xff,0x19,0xda,0x01,0x02,0x03,0x00]
+0xff,0xff,0x19,0xda,0x01,0x02,0x03,0x00
+
+# GFX90A: ds_write_b32 v1, a2 offset:65535 ; encoding: [0xff,0xff,0x1a,0xda,0x01,0x02,0x00,0x00]
+0xff,0xff,0x1a,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_write_b32 v255, a2 offset:65535 ; encoding: [0xff,0xff,0x1a,0xda,0xff,0x02,0x00,0x00]
+0xff,0xff,0x1a,0xda,0xff,0x02,0x00,0x00
+
+# GFX90A: ds_write_b32 v1, a255 offset:65535 ; encoding: [0xff,0xff,0x1a,0xda,0x01,0xff,0x00,0x00]
+0xff,0xff,0x1a,0xda,0x01,0xff,0x00,0x00
+
+# GFX90A: ds_write_b32 v1, a2             ; encoding: [0x00,0x00,0x1a,0xda,0x01,0x02,0x00,0x00]
+0x00,0x00,0x1a,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_write_b32 v1, a2             ; encoding: [0x00,0x00,0x1a,0xda,0x01,0x02,0x00,0x00]
+0x00,0x00,0x1a,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_write_b32 v1, a2 offset:4    ; encoding: [0x04,0x00,0x1a,0xda,0x01,0x02,0x00,0x00]
+0x04,0x00,0x1a,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_write_b32 v1, a2 offset:65535 gds ; encoding: [0xff,0xff,0x1b,0xda,0x01,0x02,0x00,0x00]
+0xff,0xff,0x1b,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_write2_b32 v1, a2, a3 offset0:127 offset1:255 ; encoding: [0x7f,0xff,0x1c,0xda,0x01,0x02,0x03,0x00]
+0x7f,0xff,0x1c,0xda,0x01,0x02,0x03,0x00
+
+# GFX90A: ds_write2_b32 v255, a2, a3 offset0:127 offset1:255 ; encoding: [0x7f,0xff,0x1c,0xda,0xff,0x02,0x03,0x00]
+0x7f,0xff,0x1c,0xda,0xff,0x02,0x03,0x00
+
+# GFX90A: ds_write2_b32 v1, a255, a3 offset0:127 offset1:255 ; encoding: [0x7f,0xff,0x1c,0xda,0x01,0xff,0x03,0x00]
+0x7f,0xff,0x1c,0xda,0x01,0xff,0x03,0x00
+
+# GFX90A: ds_write2_b32 v1, a2, a255 offset0:127 offset1:255 ; encoding: [0x7f,0xff,0x1c,0xda,0x01,0x02,0xff,0x00]
+0x7f,0xff,0x1c,0xda,0x01,0x02,0xff,0x00
+
+# GFX90A: ds_write2_b32 v1, a2, a3 offset1:255 ; encoding: [0x00,0xff,0x1c,0xda,0x01,0x02,0x03,0x00]
+0x00,0xff,0x1c,0xda,0x01,0x02,0x03,0x00
+
+# GFX90A: ds_write2_b32 v1, a2, a3 offset1:255 ; encoding: [0x00,0xff,0x1c,0xda,0x01,0x02,0x03,0x00]
+0x00,0xff,0x1c,0xda,0x01,0x02,0x03,0x00
+
+# GFX90A: ds_write2_b32 v1, a2, a3 offset0:16 offset1:255 ; encoding: [0x10,0xff,0x1c,0xda,0x01,0x02,0x03,0x00]
+0x10,0xff,0x1c,0xda,0x01,0x02,0x03,0x00
+
+# GFX90A: ds_write2_b32 v1, a2, a3 offset0:127 ; encoding: [0x7f,0x00,0x1c,0xda,0x01,0x02,0x03,0x00]
+0x7f,0x00,0x1c,0xda,0x01,0x02,0x03,0x00
+
+# GFX90A: ds_write2_b32 v1, a2, a3 offset0:127 ; encoding: [0x7f,0x00,0x1c,0xda,0x01,0x02,0x03,0x00]
+0x7f,0x00,0x1c,0xda,0x01,0x02,0x03,0x00
+
+# GFX90A: ds_write2_b32 v1, a2, a3 offset0:127 offset1:1 ; encoding: [0x7f,0x01,0x1c,0xda,0x01,0x02,0x03,0x00]
+0x7f,0x01,0x1c,0xda,0x01,0x02,0x03,0x00
+
+# GFX90A: ds_write2_b32 v1, a2, a3 offset0:127 offset1:255 gds ; encoding: [0x7f,0xff,0x1d,0xda,0x01,0x02,0x03,0x00]
+0x7f,0xff,0x1d,0xda,0x01,0x02,0x03,0x00
+
+# GFX90A: ds_write2st64_b32 v1, a2, a3 offset0:127 offset1:255 ; encoding: [0x7f,0xff,0x1e,0xda,0x01,0x02,0x03,0x00]
+0x7f,0xff,0x1e,0xda,0x01,0x02,0x03,0x00
+
+# GFX90A: ds_write2st64_b32 v255, a2, a3 offset0:127 offset1:255 ; encoding: [0x7f,0xff,0x1e,0xda,0xff,0x02,0x03,0x00]
+0x7f,0xff,0x1e,0xda,0xff,0x02,0x03,0x00
+
+# GFX90A: ds_write2st64_b32 v1, a255, a3 offset0:127 offset1:255 ; encoding: [0x7f,0xff,0x1e,0xda,0x01,0xff,0x03,0x00]
+0x7f,0xff,0x1e,0xda,0x01,0xff,0x03,0x00
+
+# GFX90A: ds_write2st64_b32 v1, a2, a255 offset0:127 offset1:255 ; encoding: [0x7f,0xff,0x1e,0xda,0x01,0x02,0xff,0x00]
+0x7f,0xff,0x1e,0xda,0x01,0x02,0xff,0x00
+
+# GFX90A: ds_write2st64_b32 v1, a2, a3 offset1:255 ; encoding: [0x00,0xff,0x1e,0xda,0x01,0x02,0x03,0x00]
+0x00,0xff,0x1e,0xda,0x01,0x02,0x03,0x00
+
+# GFX90A: ds_write2st64_b32 v1, a2, a3 offset1:255 ; encoding: [0x00,0xff,0x1e,0xda,0x01,0x02,0x03,0x00]
+0x00,0xff,0x1e,0xda,0x01,0x02,0x03,0x00
+
+# GFX90A: ds_write2st64_b32 v1, a2, a3 offset0:16 offset1:255 ; encoding: [0x10,0xff,0x1e,0xda,0x01,0x02,0x03,0x00]
+0x10,0xff,0x1e,0xda,0x01,0x02,0x03,0x00
+
+# GFX90A: ds_write2st64_b32 v1, a2, a3 offset0:127 ; encoding: [0x7f,0x00,0x1e,0xda,0x01,0x02,0x03,0x00]
+0x7f,0x00,0x1e,0xda,0x01,0x02,0x03,0x00
+
+# GFX90A: ds_write2st64_b32 v1, a2, a3 offset0:127 ; encoding: [0x7f,0x00,0x1e,0xda,0x01,0x02,0x03,0x00]
+0x7f,0x00,0x1e,0xda,0x01,0x02,0x03,0x00
+
+# GFX90A: ds_write2st64_b32 v1, a2, a3 offset0:127 offset1:1 ; encoding: [0x7f,0x01,0x1e,0xda,0x01,0x02,0x03,0x00]
+0x7f,0x01,0x1e,0xda,0x01,0x02,0x03,0x00
+
+# GFX90A: ds_write2st64_b32 v1, a2, a3 offset0:127 offset1:255 gds ; encoding: [0x7f,0xff,0x1f,0xda,0x01,0x02,0x03,0x00]
+0x7f,0xff,0x1f,0xda,0x01,0x02,0x03,0x00
+
+# GFX90A: ds_cmpst_b32 v1, a2, a3 offset:65535 ; encoding: [0xff,0xff,0x20,0xda,0x01,0x02,0x03,0x00]
+0xff,0xff,0x20,0xda,0x01,0x02,0x03,0x00
+
+# GFX90A: ds_cmpst_b32 v255, a2, a3 offset:65535 ; encoding: [0xff,0xff,0x20,0xda,0xff,0x02,0x03,0x00]
+0xff,0xff,0x20,0xda,0xff,0x02,0x03,0x00
+
+# GFX90A: ds_cmpst_b32 v1, a255, a3 offset:65535 ; encoding: [0xff,0xff,0x20,0xda,0x01,0xff,0x03,0x00]
+0xff,0xff,0x20,0xda,0x01,0xff,0x03,0x00
+
+# GFX90A: ds_cmpst_b32 v1, a2, a255 offset:65535 ; encoding: [0xff,0xff,0x20,0xda,0x01,0x02,0xff,0x00]
+0xff,0xff,0x20,0xda,0x01,0x02,0xff,0x00
+
+# GFX90A: ds_cmpst_b32 v1, a2, a3         ; encoding: [0x00,0x00,0x20,0xda,0x01,0x02,0x03,0x00]
+0x00,0x00,0x20,0xda,0x01,0x02,0x03,0x00
+
+# GFX90A: ds_cmpst_b32 v1, a2, a3         ; encoding: [0x00,0x00,0x20,0xda,0x01,0x02,0x03,0x00]
+0x00,0x00,0x20,0xda,0x01,0x02,0x03,0x00
+
+# GFX90A: ds_cmpst_b32 v1, a2, a3 offset:4 ; encoding: [0x04,0x00,0x20,0xda,0x01,0x02,0x03,0x00]
+0x04,0x00,0x20,0xda,0x01,0x02,0x03,0x00
+
+# GFX90A: ds_cmpst_b32 v1, a2, a3 offset:65535 gds ; encoding: [0xff,0xff,0x21,0xda,0x01,0x02,0x03,0x00]
+0xff,0xff,0x21,0xda,0x01,0x02,0x03,0x00
+
+# GFX90A: ds_cmpst_f32 v1, a2, a3 offset:65535 ; encoding: [0xff,0xff,0x22,0xda,0x01,0x02,0x03,0x00]
+0xff,0xff,0x22,0xda,0x01,0x02,0x03,0x00
+
+# GFX90A: ds_cmpst_f32 v255, a2, a3 offset:65535 ; encoding: [0xff,0xff,0x22,0xda,0xff,0x02,0x03,0x00]
+0xff,0xff,0x22,0xda,0xff,0x02,0x03,0x00
+
+# GFX90A: ds_cmpst_f32 v1, a255, a3 offset:65535 ; encoding: [0xff,0xff,0x22,0xda,0x01,0xff,0x03,0x00]
+0xff,0xff,0x22,0xda,0x01,0xff,0x03,0x00
+
+# GFX90A: ds_cmpst_f32 v1, a2, a255 offset:65535 ; encoding: [0xff,0xff,0x22,0xda,0x01,0x02,0xff,0x00]
+0xff,0xff,0x22,0xda,0x01,0x02,0xff,0x00
+
+# GFX90A: ds_cmpst_f32 v1, a2, a3         ; encoding: [0x00,0x00,0x22,0xda,0x01,0x02,0x03,0x00]
+0x00,0x00,0x22,0xda,0x01,0x02,0x03,0x00
+
+# GFX90A: ds_cmpst_f32 v1, a2, a3         ; encoding: [0x00,0x00,0x22,0xda,0x01,0x02,0x03,0x00]
+0x00,0x00,0x22,0xda,0x01,0x02,0x03,0x00
+
+# GFX90A: ds_cmpst_f32 v1, a2, a3 offset:4 ; encoding: [0x04,0x00,0x22,0xda,0x01,0x02,0x03,0x00]
+0x04,0x00,0x22,0xda,0x01,0x02,0x03,0x00
+
+# GFX90A: ds_cmpst_f32 v1, a2, a3 offset:65535 gds ; encoding: [0xff,0xff,0x23,0xda,0x01,0x02,0x03,0x00]
+0xff,0xff,0x23,0xda,0x01,0x02,0x03,0x00
+
+# GFX90A: ds_min_f32 v1, a2 offset:65535  ; encoding: [0xff,0xff,0x24,0xda,0x01,0x02,0x00,0x00]
+0xff,0xff,0x24,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_min_f32 v255, a2 offset:65535 ; encoding: [0xff,0xff,0x24,0xda,0xff,0x02,0x00,0x00]
+0xff,0xff,0x24,0xda,0xff,0x02,0x00,0x00
+
+# GFX90A: ds_min_f32 v1, a255 offset:65535 ; encoding: [0xff,0xff,0x24,0xda,0x01,0xff,0x00,0x00]
+0xff,0xff,0x24,0xda,0x01,0xff,0x00,0x00
+
+# GFX90A: ds_min_f32 v1, a2               ; encoding: [0x00,0x00,0x24,0xda,0x01,0x02,0x00,0x00]
+0x00,0x00,0x24,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_min_f32 v1, a2               ; encoding: [0x00,0x00,0x24,0xda,0x01,0x02,0x00,0x00]
+0x00,0x00,0x24,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_min_f32 v1, a2 offset:4      ; encoding: [0x04,0x00,0x24,0xda,0x01,0x02,0x00,0x00]
+0x04,0x00,0x24,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_min_f32 v1, a2 offset:65535 gds ; encoding: [0xff,0xff,0x25,0xda,0x01,0x02,0x00,0x00]
+0xff,0xff,0x25,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_max_f32 v1, a2 offset:65535  ; encoding: [0xff,0xff,0x26,0xda,0x01,0x02,0x00,0x00]
+0xff,0xff,0x26,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_max_f32 v255, a2 offset:65535 ; encoding: [0xff,0xff,0x26,0xda,0xff,0x02,0x00,0x00]
+0xff,0xff,0x26,0xda,0xff,0x02,0x00,0x00
+
+# GFX90A: ds_max_f32 v1, a255 offset:65535 ; encoding: [0xff,0xff,0x26,0xda,0x01,0xff,0x00,0x00]
+0xff,0xff,0x26,0xda,0x01,0xff,0x00,0x00
+
+# GFX90A: ds_max_f32 v1, a2               ; encoding: [0x00,0x00,0x26,0xda,0x01,0x02,0x00,0x00]
+0x00,0x00,0x26,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_max_f32 v1, a2               ; encoding: [0x00,0x00,0x26,0xda,0x01,0x02,0x00,0x00]
+0x00,0x00,0x26,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_max_f32 v1, a2 offset:4      ; encoding: [0x04,0x00,0x26,0xda,0x01,0x02,0x00,0x00]
+0x04,0x00,0x26,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_max_f32 v1, a2 offset:65535 gds ; encoding: [0xff,0xff,0x27,0xda,0x01,0x02,0x00,0x00]
+0xff,0xff,0x27,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_add_f32 v1, a2 offset:65535  ; encoding: [0xff,0xff,0x2a,0xda,0x01,0x02,0x00,0x00]
+0xff,0xff,0x2a,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_add_f32 v255, a2 offset:65535 ; encoding: [0xff,0xff,0x2a,0xda,0xff,0x02,0x00,0x00]
+0xff,0xff,0x2a,0xda,0xff,0x02,0x00,0x00
+
+# GFX90A: ds_add_f32 v1, a255 offset:65535 ; encoding: [0xff,0xff,0x2a,0xda,0x01,0xff,0x00,0x00]
+0xff,0xff,0x2a,0xda,0x01,0xff,0x00,0x00
+
+# GFX90A: ds_add_f32 v1, a2               ; encoding: [0x00,0x00,0x2a,0xda,0x01,0x02,0x00,0x00]
+0x00,0x00,0x2a,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_add_f32 v1, a2               ; encoding: [0x00,0x00,0x2a,0xda,0x01,0x02,0x00,0x00]
+0x00,0x00,0x2a,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_add_f32 v1, a2 offset:4      ; encoding: [0x04,0x00,0x2a,0xda,0x01,0x02,0x00,0x00]
+0x04,0x00,0x2a,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_add_f32 v1, a2 offset:65535 gds ; encoding: [0xff,0xff,0x2b,0xda,0x01,0x02,0x00,0x00]
+0xff,0xff,0x2b,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_write_b8 v1, a2 offset:65535 ; encoding: [0xff,0xff,0x3c,0xda,0x01,0x02,0x00,0x00]
+0xff,0xff,0x3c,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_write_b8 v255, a2 offset:65535 ; encoding: [0xff,0xff,0x3c,0xda,0xff,0x02,0x00,0x00]
+0xff,0xff,0x3c,0xda,0xff,0x02,0x00,0x00
+
+# GFX90A: ds_write_b8 v1, a255 offset:65535 ; encoding: [0xff,0xff,0x3c,0xda,0x01,0xff,0x00,0x00]
+0xff,0xff,0x3c,0xda,0x01,0xff,0x00,0x00
+
+# GFX90A: ds_write_b8 v1, a2              ; encoding: [0x00,0x00,0x3c,0xda,0x01,0x02,0x00,0x00]
+0x00,0x00,0x3c,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_write_b8 v1, a2              ; encoding: [0x00,0x00,0x3c,0xda,0x01,0x02,0x00,0x00]
+0x00,0x00,0x3c,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_write_b8 v1, a2 offset:4     ; encoding: [0x04,0x00,0x3c,0xda,0x01,0x02,0x00,0x00]
+0x04,0x00,0x3c,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_write_b8 v1, a2 offset:65535 gds ; encoding: [0xff,0xff,0x3d,0xda,0x01,0x02,0x00,0x00]
+0xff,0xff,0x3d,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_write_b16 v1, a2 offset:65535 ; encoding: [0xff,0xff,0x3e,0xda,0x01,0x02,0x00,0x00]
+0xff,0xff,0x3e,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_write_b16 v255, a2 offset:65535 ; encoding: [0xff,0xff,0x3e,0xda,0xff,0x02,0x00,0x00]
+0xff,0xff,0x3e,0xda,0xff,0x02,0x00,0x00
+
+# GFX90A: ds_write_b16 v1, a255 offset:65535 ; encoding: [0xff,0xff,0x3e,0xda,0x01,0xff,0x00,0x00]
+0xff,0xff,0x3e,0xda,0x01,0xff,0x00,0x00
+
+# GFX90A: ds_write_b16 v1, a2             ; encoding: [0x00,0x00,0x3e,0xda,0x01,0x02,0x00,0x00]
+0x00,0x00,0x3e,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_write_b16 v1, a2             ; encoding: [0x00,0x00,0x3e,0xda,0x01,0x02,0x00,0x00]
+0x00,0x00,0x3e,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_write_b16 v1, a2 offset:4    ; encoding: [0x04,0x00,0x3e,0xda,0x01,0x02,0x00,0x00]
+0x04,0x00,0x3e,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_write_b16 v1, a2 offset:65535 gds ; encoding: [0xff,0xff,0x3f,0xda,0x01,0x02,0x00,0x00]
+0xff,0xff,0x3f,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_add_rtn_u32 a5, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x40,0xda,0x01,0x02,0x00,0x05]
+0xff,0xff,0x40,0xda,0x01,0x02,0x00,0x05
+
+# GFX90A: ds_add_rtn_u32 a255, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x40,0xda,0x01,0x02,0x00,0xff]
+0xff,0xff,0x40,0xda,0x01,0x02,0x00,0xff
+
+# GFX90A: ds_add_rtn_u32 a5, v255, a2 offset:65535 ; encoding: [0xff,0xff,0x40,0xda,0xff,0x02,0x00,0x05]
+0xff,0xff,0x40,0xda,0xff,0x02,0x00,0x05
+
+# GFX90A: ds_add_rtn_u32 a5, v1, a255 offset:65535 ; encoding: [0xff,0xff,0x40,0xda,0x01,0xff,0x00,0x05]
+0xff,0xff,0x40,0xda,0x01,0xff,0x00,0x05
+
+# GFX90A: ds_add_rtn_u32 a5, v1, a2       ; encoding: [0x00,0x00,0x40,0xda,0x01,0x02,0x00,0x05]
+0x00,0x00,0x40,0xda,0x01,0x02,0x00,0x05
+
+# GFX90A: ds_add_rtn_u32 a5, v1, a2       ; encoding: [0x00,0x00,0x40,0xda,0x01,0x02,0x00,0x05]
+0x00,0x00,0x40,0xda,0x01,0x02,0x00,0x05
+
+# GFX90A: ds_add_rtn_u32 a5, v1, a2 offset:4 ; encoding: [0x04,0x00,0x40,0xda,0x01,0x02,0x00,0x05]
+0x04,0x00,0x40,0xda,0x01,0x02,0x00,0x05
+
+# GFX90A: ds_add_rtn_u32 a5, v1, a2 offset:65535 gds ; encoding: [0xff,0xff,0x41,0xda,0x01,0x02,0x00,0x05]
+0xff,0xff,0x41,0xda,0x01,0x02,0x00,0x05
+
+# GFX90A: ds_sub_rtn_u32 a5, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x42,0xda,0x01,0x02,0x00,0x05]
+0xff,0xff,0x42,0xda,0x01,0x02,0x00,0x05
+
+# GFX90A: ds_sub_rtn_u32 a255, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x42,0xda,0x01,0x02,0x00,0xff]
+0xff,0xff,0x42,0xda,0x01,0x02,0x00,0xff
+
+# GFX90A: ds_sub_rtn_u32 a5, v255, a2 offset:65535 ; encoding: [0xff,0xff,0x42,0xda,0xff,0x02,0x00,0x05]
+0xff,0xff,0x42,0xda,0xff,0x02,0x00,0x05
+
+# GFX90A: ds_sub_rtn_u32 a5, v1, a255 offset:65535 ; encoding: [0xff,0xff,0x42,0xda,0x01,0xff,0x00,0x05]
+0xff,0xff,0x42,0xda,0x01,0xff,0x00,0x05
+
+# GFX90A: ds_sub_rtn_u32 a5, v1, a2       ; encoding: [0x00,0x00,0x42,0xda,0x01,0x02,0x00,0x05]
+0x00,0x00,0x42,0xda,0x01,0x02,0x00,0x05
+
+# GFX90A: ds_sub_rtn_u32 a5, v1, a2       ; encoding: [0x00,0x00,0x42,0xda,0x01,0x02,0x00,0x05]
+0x00,0x00,0x42,0xda,0x01,0x02,0x00,0x05
+
+# GFX90A: ds_sub_rtn_u32 a5, v1, a2 offset:4 ; encoding: [0x04,0x00,0x42,0xda,0x01,0x02,0x00,0x05]
+0x04,0x00,0x42,0xda,0x01,0x02,0x00,0x05
+
+# GFX90A: ds_sub_rtn_u32 a5, v1, a2 offset:65535 gds ; encoding: [0xff,0xff,0x43,0xda,0x01,0x02,0x00,0x05]
+0xff,0xff,0x43,0xda,0x01,0x02,0x00,0x05
+
+# GFX90A: ds_rsub_rtn_u32 a5, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x44,0xda,0x01,0x02,0x00,0x05]
+0xff,0xff,0x44,0xda,0x01,0x02,0x00,0x05
+
+# GFX90A: ds_rsub_rtn_u32 a255, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x44,0xda,0x01,0x02,0x00,0xff]
+0xff,0xff,0x44,0xda,0x01,0x02,0x00,0xff
+
+# GFX90A: ds_rsub_rtn_u32 a5, v255, a2 offset:65535 ; encoding: [0xff,0xff,0x44,0xda,0xff,0x02,0x00,0x05]
+0xff,0xff,0x44,0xda,0xff,0x02,0x00,0x05
+
+# GFX90A: ds_rsub_rtn_u32 a5, v1, a255 offset:65535 ; encoding: [0xff,0xff,0x44,0xda,0x01,0xff,0x00,0x05]
+0xff,0xff,0x44,0xda,0x01,0xff,0x00,0x05
+
+# GFX90A: ds_rsub_rtn_u32 a5, v1, a2      ; encoding: [0x00,0x00,0x44,0xda,0x01,0x02,0x00,0x05]
+0x00,0x00,0x44,0xda,0x01,0x02,0x00,0x05
+
+# GFX90A: ds_rsub_rtn_u32 a5, v1, a2      ; encoding: [0x00,0x00,0x44,0xda,0x01,0x02,0x00,0x05]
+0x00,0x00,0x44,0xda,0x01,0x02,0x00,0x05
+
+# GFX90A: ds_rsub_rtn_u32 a5, v1, a2 offset:4 ; encoding: [0x04,0x00,0x44,0xda,0x01,0x02,0x00,0x05]
+0x04,0x00,0x44,0xda,0x01,0x02,0x00,0x05
+
+# GFX90A: ds_rsub_rtn_u32 a5, v1, a2 offset:65535 gds ; encoding: [0xff,0xff,0x45,0xda,0x01,0x02,0x00,0x05]
+0xff,0xff,0x45,0xda,0x01,0x02,0x00,0x05
+
+# GFX90A: ds_inc_rtn_u32 a5, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x46,0xda,0x01,0x02,0x00,0x05]
+0xff,0xff,0x46,0xda,0x01,0x02,0x00,0x05
+
+# GFX90A: ds_inc_rtn_u32 a255, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x46,0xda,0x01,0x02,0x00,0xff]
+0xff,0xff,0x46,0xda,0x01,0x02,0x00,0xff
+
+# GFX90A: ds_inc_rtn_u32 a5, v255, a2 offset:65535 ; encoding: [0xff,0xff,0x46,0xda,0xff,0x02,0x00,0x05]
+0xff,0xff,0x46,0xda,0xff,0x02,0x00,0x05
+
+# GFX90A: ds_inc_rtn_u32 a5, v1, a255 offset:65535 ; encoding: [0xff,0xff,0x46,0xda,0x01,0xff,0x00,0x05]
+0xff,0xff,0x46,0xda,0x01,0xff,0x00,0x05
+
+# GFX90A: ds_inc_rtn_u32 a5, v1, a2       ; encoding: [0x00,0x00,0x46,0xda,0x01,0x02,0x00,0x05]
+0x00,0x00,0x46,0xda,0x01,0x02,0x00,0x05
+
+# GFX90A: ds_inc_rtn_u32 a5, v1, a2       ; encoding: [0x00,0x00,0x46,0xda,0x01,0x02,0x00,0x05]
+0x00,0x00,0x46,0xda,0x01,0x02,0x00,0x05
+
+# GFX90A: ds_inc_rtn_u32 a5, v1, a2 offset:4 ; encoding: [0x04,0x00,0x46,0xda,0x01,0x02,0x00,0x05]
+0x04,0x00,0x46,0xda,0x01,0x02,0x00,0x05
+
+# GFX90A: ds_inc_rtn_u32 a5, v1, a2 offset:65535 gds ; encoding: [0xff,0xff,0x47,0xda,0x01,0x02,0x00,0x05]
+0xff,0xff,0x47,0xda,0x01,0x02,0x00,0x05
+
+# GFX90A: ds_dec_rtn_u32 a5, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x48,0xda,0x01,0x02,0x00,0x05]
+0xff,0xff,0x48,0xda,0x01,0x02,0x00,0x05
+
+# GFX90A: ds_dec_rtn_u32 a255, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x48,0xda,0x01,0x02,0x00,0xff]
+0xff,0xff,0x48,0xda,0x01,0x02,0x00,0xff
+
+# GFX90A: ds_dec_rtn_u32 a5, v255, a2 offset:65535 ; encoding: [0xff,0xff,0x48,0xda,0xff,0x02,0x00,0x05]
+0xff,0xff,0x48,0xda,0xff,0x02,0x00,0x05
+
+# GFX90A: ds_dec_rtn_u32 a5, v1, a255 offset:65535 ; encoding: [0xff,0xff,0x48,0xda,0x01,0xff,0x00,0x05]
+0xff,0xff,0x48,0xda,0x01,0xff,0x00,0x05
+
+# GFX90A: ds_dec_rtn_u32 a5, v1, a2       ; encoding: [0x00,0x00,0x48,0xda,0x01,0x02,0x00,0x05]
+0x00,0x00,0x48,0xda,0x01,0x02,0x00,0x05
+
+# GFX90A: ds_dec_rtn_u32 a5, v1, a2       ; encoding: [0x00,0x00,0x48,0xda,0x01,0x02,0x00,0x05]
+0x00,0x00,0x48,0xda,0x01,0x02,0x00,0x05
+
+# GFX90A: ds_dec_rtn_u32 a5, v1, a2 offset:4 ; encoding: [0x04,0x00,0x48,0xda,0x01,0x02,0x00,0x05]
+0x04,0x00,0x48,0xda,0x01,0x02,0x00,0x05
+
+# GFX90A: ds_dec_rtn_u32 a5, v1, a2 offset:65535 gds ; encoding: [0xff,0xff,0x49,0xda,0x01,0x02,0x00,0x05]
+0xff,0xff,0x49,0xda,0x01,0x02,0x00,0x05
+
+# GFX90A: ds_min_rtn_i32 a5, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x4a,0xda,0x01,0x02,0x00,0x05]
+0xff,0xff,0x4a,0xda,0x01,0x02,0x00,0x05
+
+# GFX90A: ds_min_rtn_i32 a255, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x4a,0xda,0x01,0x02,0x00,0xff]
+0xff,0xff,0x4a,0xda,0x01,0x02,0x00,0xff
+
+# GFX90A: ds_min_rtn_i32 a5, v255, a2 offset:65535 ; encoding: [0xff,0xff,0x4a,0xda,0xff,0x02,0x00,0x05]
+0xff,0xff,0x4a,0xda,0xff,0x02,0x00,0x05
+
+# GFX90A: ds_min_rtn_i32 a5, v1, a255 offset:65535 ; encoding: [0xff,0xff,0x4a,0xda,0x01,0xff,0x00,0x05]
+0xff,0xff,0x4a,0xda,0x01,0xff,0x00,0x05
+
+# GFX90A: ds_min_rtn_i32 a5, v1, a2       ; encoding: [0x00,0x00,0x4a,0xda,0x01,0x02,0x00,0x05]
+0x00,0x00,0x4a,0xda,0x01,0x02,0x00,0x05
+
+# GFX90A: ds_min_rtn_i32 a5, v1, a2       ; encoding: [0x00,0x00,0x4a,0xda,0x01,0x02,0x00,0x05]
+0x00,0x00,0x4a,0xda,0x01,0x02,0x00,0x05
+
+# GFX90A: ds_min_rtn_i32 a5, v1, a2 offset:4 ; encoding: [0x04,0x00,0x4a,0xda,0x01,0x02,0x00,0x05]
+0x04,0x00,0x4a,0xda,0x01,0x02,0x00,0x05
+
+# GFX90A: ds_min_rtn_i32 a5, v1, a2 offset:65535 gds ; encoding: [0xff,0xff,0x4b,0xda,0x01,0x02,0x00,0x05]
+0xff,0xff,0x4b,0xda,0x01,0x02,0x00,0x05
+
+# GFX90A: ds_max_rtn_i32 a5, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x4c,0xda,0x01,0x02,0x00,0x05]
+0xff,0xff,0x4c,0xda,0x01,0x02,0x00,0x05
+
+# GFX90A: ds_max_rtn_i32 a255, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x4c,0xda,0x01,0x02,0x00,0xff]
+0xff,0xff,0x4c,0xda,0x01,0x02,0x00,0xff
+
+# GFX90A: ds_max_rtn_i32 a5, v255, a2 offset:65535 ; encoding: [0xff,0xff,0x4c,0xda,0xff,0x02,0x00,0x05]
+0xff,0xff,0x4c,0xda,0xff,0x02,0x00,0x05
+
+# GFX90A: ds_max_rtn_i32 a5, v1, a255 offset:65535 ; encoding: [0xff,0xff,0x4c,0xda,0x01,0xff,0x00,0x05]
+0xff,0xff,0x4c,0xda,0x01,0xff,0x00,0x05
+
+# GFX90A: ds_max_rtn_i32 a5, v1, a2       ; encoding: [0x00,0x00,0x4c,0xda,0x01,0x02,0x00,0x05]
+0x00,0x00,0x4c,0xda,0x01,0x02,0x00,0x05
+
+# GFX90A: ds_max_rtn_i32 a5, v1, a2       ; encoding: [0x00,0x00,0x4c,0xda,0x01,0x02,0x00,0x05]
+0x00,0x00,0x4c,0xda,0x01,0x02,0x00,0x05
+
+# GFX90A: ds_max_rtn_i32 a5, v1, a2 offset:4 ; encoding: [0x04,0x00,0x4c,0xda,0x01,0x02,0x00,0x05]
+0x04,0x00,0x4c,0xda,0x01,0x02,0x00,0x05
+
+# GFX90A: ds_max_rtn_i32 a5, v1, a2 offset:65535 gds ; encoding: [0xff,0xff,0x4d,0xda,0x01,0x02,0x00,0x05]
+0xff,0xff,0x4d,0xda,0x01,0x02,0x00,0x05
+
+# GFX90A: ds_min_rtn_u32 a5, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x4e,0xda,0x01,0x02,0x00,0x05]
+0xff,0xff,0x4e,0xda,0x01,0x02,0x00,0x05
+
+# GFX90A: ds_min_rtn_u32 a255, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x4e,0xda,0x01,0x02,0x00,0xff]
+0xff,0xff,0x4e,0xda,0x01,0x02,0x00,0xff
+
+# GFX90A: ds_min_rtn_u32 a5, v255, a2 offset:65535 ; encoding: [0xff,0xff,0x4e,0xda,0xff,0x02,0x00,0x05]
+0xff,0xff,0x4e,0xda,0xff,0x02,0x00,0x05
+
+# GFX90A: ds_min_rtn_u32 a5, v1, a255 offset:65535 ; encoding: [0xff,0xff,0x4e,0xda,0x01,0xff,0x00,0x05]
+0xff,0xff,0x4e,0xda,0x01,0xff,0x00,0x05
+
+# GFX90A: ds_min_rtn_u32 a5, v1, a2       ; encoding: [0x00,0x00,0x4e,0xda,0x01,0x02,0x00,0x05]
+0x00,0x00,0x4e,0xda,0x01,0x02,0x00,0x05
+
+# GFX90A: ds_min_rtn_u32 a5, v1, a2       ; encoding: [0x00,0x00,0x4e,0xda,0x01,0x02,0x00,0x05]
+0x00,0x00,0x4e,0xda,0x01,0x02,0x00,0x05
+
+# GFX90A: ds_min_rtn_u32 a5, v1, a2 offset:4 ; encoding: [0x04,0x00,0x4e,0xda,0x01,0x02,0x00,0x05]
+0x04,0x00,0x4e,0xda,0x01,0x02,0x00,0x05
+
+# GFX90A: ds_min_rtn_u32 a5, v1, a2 offset:65535 gds ; encoding: [0xff,0xff,0x4f,0xda,0x01,0x02,0x00,0x05]
+0xff,0xff,0x4f,0xda,0x01,0x02,0x00,0x05
+
+# GFX90A: ds_max_rtn_u32 a5, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x50,0xda,0x01,0x02,0x00,0x05]
+0xff,0xff,0x50,0xda,0x01,0x02,0x00,0x05
+
+# GFX90A: ds_max_rtn_u32 a255, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x50,0xda,0x01,0x02,0x00,0xff]
+0xff,0xff,0x50,0xda,0x01,0x02,0x00,0xff
+
+# GFX90A: ds_max_rtn_u32 a5, v255, a2 offset:65535 ; encoding: [0xff,0xff,0x50,0xda,0xff,0x02,0x00,0x05]
+0xff,0xff,0x50,0xda,0xff,0x02,0x00,0x05
+
+# GFX90A: ds_max_rtn_u32 a5, v1, a255 offset:65535 ; encoding: [0xff,0xff,0x50,0xda,0x01,0xff,0x00,0x05]
+0xff,0xff,0x50,0xda,0x01,0xff,0x00,0x05
+
+# GFX90A: ds_max_rtn_u32 a5, v1, a2       ; encoding: [0x00,0x00,0x50,0xda,0x01,0x02,0x00,0x05]
+0x00,0x00,0x50,0xda,0x01,0x02,0x00,0x05
+
+# GFX90A: ds_max_rtn_u32 a5, v1, a2       ; encoding: [0x00,0x00,0x50,0xda,0x01,0x02,0x00,0x05]
+0x00,0x00,0x50,0xda,0x01,0x02,0x00,0x05
+
+# GFX90A: ds_max_rtn_u32 a5, v1, a2 offset:4 ; encoding: [0x04,0x00,0x50,0xda,0x01,0x02,0x00,0x05]
+0x04,0x00,0x50,0xda,0x01,0x02,0x00,0x05
+
+# GFX90A: ds_max_rtn_u32 a5, v1, a2 offset:65535 gds ; encoding: [0xff,0xff,0x51,0xda,0x01,0x02,0x00,0x05]
+0xff,0xff,0x51,0xda,0x01,0x02,0x00,0x05
+
+# GFX90A: ds_and_rtn_b32 a5, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x52,0xda,0x01,0x02,0x00,0x05]
+0xff,0xff,0x52,0xda,0x01,0x02,0x00,0x05
+
+# GFX90A: ds_and_rtn_b32 a255, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x52,0xda,0x01,0x02,0x00,0xff]
+0xff,0xff,0x52,0xda,0x01,0x02,0x00,0xff
+
+# GFX90A: ds_and_rtn_b32 a5, v255, a2 offset:65535 ; encoding: [0xff,0xff,0x52,0xda,0xff,0x02,0x00,0x05]
+0xff,0xff,0x52,0xda,0xff,0x02,0x00,0x05
+
+# GFX90A: ds_and_rtn_b32 a5, v1, a255 offset:65535 ; encoding: [0xff,0xff,0x52,0xda,0x01,0xff,0x00,0x05]
+0xff,0xff,0x52,0xda,0x01,0xff,0x00,0x05
+
+# GFX90A: ds_and_rtn_b32 a5, v1, a2       ; encoding: [0x00,0x00,0x52,0xda,0x01,0x02,0x00,0x05]
+0x00,0x00,0x52,0xda,0x01,0x02,0x00,0x05
+
+# GFX90A: ds_and_rtn_b32 a5, v1, a2       ; encoding: [0x00,0x00,0x52,0xda,0x01,0x02,0x00,0x05]
+0x00,0x00,0x52,0xda,0x01,0x02,0x00,0x05
+
+# GFX90A: ds_and_rtn_b32 a5, v1, a2 offset:4 ; encoding: [0x04,0x00,0x52,0xda,0x01,0x02,0x00,0x05]
+0x04,0x00,0x52,0xda,0x01,0x02,0x00,0x05
+
+# GFX90A: ds_and_rtn_b32 a5, v1, a2 offset:65535 gds ; encoding: [0xff,0xff,0x53,0xda,0x01,0x02,0x00,0x05]
+0xff,0xff,0x53,0xda,0x01,0x02,0x00,0x05
+
+# GFX90A: ds_or_rtn_b32 a5, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x54,0xda,0x01,0x02,0x00,0x05]
+0xff,0xff,0x54,0xda,0x01,0x02,0x00,0x05
+
+# GFX90A: ds_or_rtn_b32 a255, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x54,0xda,0x01,0x02,0x00,0xff]
+0xff,0xff,0x54,0xda,0x01,0x02,0x00,0xff
+
+# GFX90A: ds_or_rtn_b32 a5, v255, a2 offset:65535 ; encoding: [0xff,0xff,0x54,0xda,0xff,0x02,0x00,0x05]
+0xff,0xff,0x54,0xda,0xff,0x02,0x00,0x05
+
+# GFX90A: ds_or_rtn_b32 a5, v1, a255 offset:65535 ; encoding: [0xff,0xff,0x54,0xda,0x01,0xff,0x00,0x05]
+0xff,0xff,0x54,0xda,0x01,0xff,0x00,0x05
+
+# GFX90A: ds_or_rtn_b32 a5, v1, a2        ; encoding: [0x00,0x00,0x54,0xda,0x01,0x02,0x00,0x05]
+0x00,0x00,0x54,0xda,0x01,0x02,0x00,0x05
+
+# GFX90A: ds_or_rtn_b32 a5, v1, a2        ; encoding: [0x00,0x00,0x54,0xda,0x01,0x02,0x00,0x05]
+0x00,0x00,0x54,0xda,0x01,0x02,0x00,0x05
+
+# GFX90A: ds_or_rtn_b32 a5, v1, a2 offset:4 ; encoding: [0x04,0x00,0x54,0xda,0x01,0x02,0x00,0x05]
+0x04,0x00,0x54,0xda,0x01,0x02,0x00,0x05
+
+# GFX90A: ds_or_rtn_b32 a5, v1, a2 offset:65535 gds ; encoding: [0xff,0xff,0x55,0xda,0x01,0x02,0x00,0x05]
+0xff,0xff,0x55,0xda,0x01,0x02,0x00,0x05
+
+# GFX90A: ds_xor_rtn_b32 a5, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x56,0xda,0x01,0x02,0x00,0x05]
+0xff,0xff,0x56,0xda,0x01,0x02,0x00,0x05
+
+# GFX90A: ds_xor_rtn_b32 a255, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x56,0xda,0x01,0x02,0x00,0xff]
+0xff,0xff,0x56,0xda,0x01,0x02,0x00,0xff
+
+# GFX90A: ds_xor_rtn_b32 a5, v255, a2 offset:65535 ; encoding: [0xff,0xff,0x56,0xda,0xff,0x02,0x00,0x05]
+0xff,0xff,0x56,0xda,0xff,0x02,0x00,0x05
+
+# GFX90A: ds_xor_rtn_b32 a5, v1, a255 offset:65535 ; encoding: [0xff,0xff,0x56,0xda,0x01,0xff,0x00,0x05]
+0xff,0xff,0x56,0xda,0x01,0xff,0x00,0x05
+
+# GFX90A: ds_xor_rtn_b32 a5, v1, a2       ; encoding: [0x00,0x00,0x56,0xda,0x01,0x02,0x00,0x05]
+0x00,0x00,0x56,0xda,0x01,0x02,0x00,0x05
+
+# GFX90A: ds_xor_rtn_b32 a5, v1, a2       ; encoding: [0x00,0x00,0x56,0xda,0x01,0x02,0x00,0x05]
+0x00,0x00,0x56,0xda,0x01,0x02,0x00,0x05
+
+# GFX90A: ds_xor_rtn_b32 a5, v1, a2 offset:4 ; encoding: [0x04,0x00,0x56,0xda,0x01,0x02,0x00,0x05]
+0x04,0x00,0x56,0xda,0x01,0x02,0x00,0x05
+
+# GFX90A: ds_xor_rtn_b32 a5, v1, a2 offset:65535 gds ; encoding: [0xff,0xff,0x57,0xda,0x01,0x02,0x00,0x05]
+0xff,0xff,0x57,0xda,0x01,0x02,0x00,0x05
+
+# GFX90A: ds_mskor_rtn_b32 a5, v1, a2, a5 offset:65535 ; encoding: [0xff,0xff,0x58,0xda,0x01,0x02,0x05,0x05]
+0xff,0xff,0x58,0xda,0x01,0x02,0x05,0x05
+
+# GFX90A: ds_mskor_rtn_b32 a255, v1, a2, a5 offset:65535 ; encoding: [0xff,0xff,0x58,0xda,0x01,0x02,0x05,0xff]
+0xff,0xff,0x58,0xda,0x01,0x02,0x05,0xff
+
+# GFX90A: ds_mskor_rtn_b32 a5, v255, a2, a5 offset:65535 ; encoding: [0xff,0xff,0x58,0xda,0xff,0x02,0x05,0x05]
+0xff,0xff,0x58,0xda,0xff,0x02,0x05,0x05
+
+# GFX90A: ds_mskor_rtn_b32 a5, v1, a255, a3 offset:65535 ; encoding: [0xff,0xff,0x58,0xda,0x01,0xff,0x03,0x05]
+0xff,0xff,0x58,0xda,0x01,0xff,0x03,0x05
+
+# GFX90A: ds_mskor_rtn_b32 a5, v1, a2, a5 offset:65535 ; encoding: [0xff,0xff,0x58,0xda,0x01,0x02,0x05,0x05]
+0xff,0xff,0x58,0xda,0x01,0x02,0x05,0x05
+
+# GFX90A: ds_mskor_rtn_b32 a5, v1, a2, a5 ; encoding: [0x00,0x00,0x58,0xda,0x01,0x02,0x05,0x05]
+0x00,0x00,0x58,0xda,0x01,0x02,0x05,0x05
+
+# GFX90A: ds_mskor_rtn_b32 a5, v1, a2, a5 ; encoding: [0x00,0x00,0x58,0xda,0x01,0x02,0x05,0x05]
+0x00,0x00,0x58,0xda,0x01,0x02,0x05,0x05
+
+# GFX90A: ds_mskor_rtn_b32 a5, v1, a2, a5 offset:4 ; encoding: [0x04,0x00,0x58,0xda,0x01,0x02,0x05,0x05]
+0x04,0x00,0x58,0xda,0x01,0x02,0x05,0x05
+
+# GFX90A: ds_mskor_rtn_b32 a5, v1, a2, a5 offset:65535 gds ; encoding: [0xff,0xff,0x59,0xda,0x01,0x02,0x05,0x05]
+0xff,0xff,0x59,0xda,0x01,0x02,0x05,0x05
+
+# GFX90A: ds_wrxchg_rtn_b32 a5, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x5a,0xda,0x01,0x02,0x00,0x05]
+0xff,0xff,0x5a,0xda,0x01,0x02,0x00,0x05
+
+# GFX90A: ds_wrxchg_rtn_b32 a255, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x5a,0xda,0x01,0x02,0x00,0xff]
+0xff,0xff,0x5a,0xda,0x01,0x02,0x00,0xff
+
+# GFX90A: ds_wrxchg_rtn_b32 a5, v255, a2 offset:65535 ; encoding: [0xff,0xff,0x5a,0xda,0xff,0x02,0x00,0x05]
+0xff,0xff,0x5a,0xda,0xff,0x02,0x00,0x05
+
+# GFX90A: ds_wrxchg_rtn_b32 a5, v1, a255 offset:65535 ; encoding: [0xff,0xff,0x5a,0xda,0x01,0xff,0x00,0x05]
+0xff,0xff,0x5a,0xda,0x01,0xff,0x00,0x05
+
+# GFX90A: ds_wrxchg_rtn_b32 a5, v1, a2    ; encoding: [0x00,0x00,0x5a,0xda,0x01,0x02,0x00,0x05]
+0x00,0x00,0x5a,0xda,0x01,0x02,0x00,0x05
+
+# GFX90A: ds_wrxchg_rtn_b32 a5, v1, a2    ; encoding: [0x00,0x00,0x5a,0xda,0x01,0x02,0x00,0x05]
+0x00,0x00,0x5a,0xda,0x01,0x02,0x00,0x05
+
+# GFX90A: ds_wrxchg_rtn_b32 a5, v1, a2 offset:4 ; encoding: [0x04,0x00,0x5a,0xda,0x01,0x02,0x00,0x05]
+0x04,0x00,0x5a,0xda,0x01,0x02,0x00,0x05
+
+# GFX90A: ds_wrxchg_rtn_b32 a5, v1, a2 offset:65535 gds ; encoding: [0xff,0xff,0x5b,0xda,0x01,0x02,0x00,0x05]
+0xff,0xff,0x5b,0xda,0x01,0x02,0x00,0x05
+
+# GFX90A: ds_wrxchg2_rtn_b32 a[6:7], v1, a2, a3 offset0:127 offset1:255 ; encoding: [0x7f,0xff,0x5c,0xda,0x01,0x02,0x03,0x06]
+0x7f,0xff,0x5c,0xda,0x01,0x02,0x03,0x06
+
+# GFX90A: ds_wrxchg2_rtn_b32 a[254:255], v1, a2, a3 offset0:127 offset1:255 ; encoding: [0x7f,0xff,0x5c,0xda,0x01,0x02,0x03,0xfe]
+0x7f,0xff,0x5c,0xda,0x01,0x02,0x03,0xfe
+
+# GFX90A: ds_wrxchg2_rtn_b32 a[6:7], v255, a2, a3 offset0:127 offset1:255 ; encoding: [0x7f,0xff,0x5c,0xda,0xff,0x02,0x03,0x06]
+0x7f,0xff,0x5c,0xda,0xff,0x02,0x03,0x06
+
+# GFX90A: ds_wrxchg2_rtn_b32 a[6:7], v1, a255, a3 offset0:127 offset1:255 ; encoding: [0x7f,0xff,0x5c,0xda,0x01,0xff,0x03,0x06]
+0x7f,0xff,0x5c,0xda,0x01,0xff,0x03,0x06
+
+# GFX90A: ds_wrxchg2_rtn_b32 a[6:7], v1, a2, a255 offset0:127 offset1:255 ; encoding: [0x7f,0xff,0x5c,0xda,0x01,0x02,0xff,0x06]
+0x7f,0xff,0x5c,0xda,0x01,0x02,0xff,0x06
+
+# GFX90A: ds_wrxchg2_rtn_b32 a[6:7], v1, a2, a3 offset1:255 ; encoding: [0x00,0xff,0x5c,0xda,0x01,0x02,0x03,0x06]
+0x00,0xff,0x5c,0xda,0x01,0x02,0x03,0x06
+
+# GFX90A: ds_wrxchg2_rtn_b32 a[6:7], v1, a2, a3 offset1:255 ; encoding: [0x00,0xff,0x5c,0xda,0x01,0x02,0x03,0x06]
+0x00,0xff,0x5c,0xda,0x01,0x02,0x03,0x06
+
+# GFX90A: ds_wrxchg2_rtn_b32 a[6:7], v1, a2, a3 offset0:16 offset1:255 ; encoding: [0x10,0xff,0x5c,0xda,0x01,0x02,0x03,0x06]
+0x10,0xff,0x5c,0xda,0x01,0x02,0x03,0x06
+
+# GFX90A: ds_wrxchg2_rtn_b32 a[6:7], v1, a2, a3 offset0:127 ; encoding: [0x7f,0x00,0x5c,0xda,0x01,0x02,0x03,0x06]
+0x7f,0x00,0x5c,0xda,0x01,0x02,0x03,0x06
+
+# GFX90A: ds_wrxchg2_rtn_b32 a[6:7], v1, a2, a3 offset0:127 ; encoding: [0x7f,0x00,0x5c,0xda,0x01,0x02,0x03,0x06]
+0x7f,0x00,0x5c,0xda,0x01,0x02,0x03,0x06
+
+# GFX90A: ds_wrxchg2_rtn_b32 a[6:7], v1, a2, a3 offset0:127 offset1:1 ; encoding: [0x7f,0x01,0x5c,0xda,0x01,0x02,0x03,0x06]
+0x7f,0x01,0x5c,0xda,0x01,0x02,0x03,0x06
+
+# GFX90A: ds_wrxchg2_rtn_b32 a[6:7], v1, a2, a3 offset0:127 offset1:255 gds ; encoding: [0x7f,0xff,0x5d,0xda,0x01,0x02,0x03,0x06]
+0x7f,0xff,0x5d,0xda,0x01,0x02,0x03,0x06
+
+# GFX90A: ds_wrxchg2st64_rtn_b32 a[6:7], v1, a2, a3 offset0:127 offset1:255 ; encoding: [0x7f,0xff,0x5e,0xda,0x01,0x02,0x03,0x06]
+0x7f,0xff,0x5e,0xda,0x01,0x02,0x03,0x06
+
+# GFX90A: ds_wrxchg2st64_rtn_b32 a[254:255], v1, a2, a3 offset0:127 offset1:255 ; encoding: [0x7f,0xff,0x5e,0xda,0x01,0x02,0x03,0xfe]
+0x7f,0xff,0x5e,0xda,0x01,0x02,0x03,0xfe
+
+# GFX90A: ds_wrxchg2st64_rtn_b32 a[6:7], v255, a2, a3 offset0:127 offset1:255 ; encoding: [0x7f,0xff,0x5e,0xda,0xff,0x02,0x03,0x06]
+0x7f,0xff,0x5e,0xda,0xff,0x02,0x03,0x06
+
+# GFX90A: ds_wrxchg2st64_rtn_b32 a[6:7], v1, a255, a3 offset0:127 offset1:255 ; encoding: [0x7f,0xff,0x5e,0xda,0x01,0xff,0x03,0x06]
+0x7f,0xff,0x5e,0xda,0x01,0xff,0x03,0x06
+
+# GFX90A: ds_wrxchg2st64_rtn_b32 a[6:7], v1, a2, a255 offset0:127 offset1:255 ; encoding: [0x7f,0xff,0x5e,0xda,0x01,0x02,0xff,0x06]
+0x7f,0xff,0x5e,0xda,0x01,0x02,0xff,0x06
+
+# GFX90A: ds_wrxchg2st64_rtn_b32 a[6:7], v1, a2, a3 offset1:255 ; encoding: [0x00,0xff,0x5e,0xda,0x01,0x02,0x03,0x06]
+0x00,0xff,0x5e,0xda,0x01,0x02,0x03,0x06
+
+# GFX90A: ds_wrxchg2st64_rtn_b32 a[6:7], v1, a2, a3 offset1:255 ; encoding: [0x00,0xff,0x5e,0xda,0x01,0x02,0x03,0x06]
+0x00,0xff,0x5e,0xda,0x01,0x02,0x03,0x06
+
+# GFX90A: ds_wrxchg2st64_rtn_b32 a[6:7], v1, a2, a3 offset0:16 offset1:255 ; encoding: [0x10,0xff,0x5e,0xda,0x01,0x02,0x03,0x06]
+0x10,0xff,0x5e,0xda,0x01,0x02,0x03,0x06
+
+# GFX90A: ds_wrxchg2st64_rtn_b32 a[6:7], v1, a2, a3 offset0:127 ; encoding: [0x7f,0x00,0x5e,0xda,0x01,0x02,0x03,0x06]
+0x7f,0x00,0x5e,0xda,0x01,0x02,0x03,0x06
+
+# GFX90A: ds_wrxchg2st64_rtn_b32 a[6:7], v1, a2, a3 offset0:127 ; encoding: [0x7f,0x00,0x5e,0xda,0x01,0x02,0x03,0x06]
+0x7f,0x00,0x5e,0xda,0x01,0x02,0x03,0x06
+
+# GFX90A: ds_wrxchg2st64_rtn_b32 a[6:7], v1, a2, a3 offset0:127 offset1:1 ; encoding: [0x7f,0x01,0x5e,0xda,0x01,0x02,0x03,0x06]
+0x7f,0x01,0x5e,0xda,0x01,0x02,0x03,0x06
+
+# GFX90A: ds_wrxchg2st64_rtn_b32 a[6:7], v1, a2, a3 offset0:127 offset1:255 gds ; encoding: [0x7f,0xff,0x5f,0xda,0x01,0x02,0x03,0x06]
+0x7f,0xff,0x5f,0xda,0x01,0x02,0x03,0x06
+
+# GFX90A: ds_cmpst_rtn_b32 a5, v1, a2, a3 offset:65535 ; encoding: [0xff,0xff,0x60,0xda,0x01,0x02,0x03,0x05]
+0xff,0xff,0x60,0xda,0x01,0x02,0x03,0x05
+
+# GFX90A: ds_cmpst_rtn_b32 a255, v1, a2, a3 offset:65535 ; encoding: [0xff,0xff,0x60,0xda,0x01,0x02,0x03,0xff]
+0xff,0xff,0x60,0xda,0x01,0x02,0x03,0xff
+
+# GFX90A: ds_cmpst_rtn_b32 a5, v255, a2, a3 offset:65535 ; encoding: [0xff,0xff,0x60,0xda,0xff,0x02,0x03,0x05]
+0xff,0xff,0x60,0xda,0xff,0x02,0x03,0x05
+
+# GFX90A: ds_cmpst_rtn_b32 a5, v1, a255, a3 offset:65535 ; encoding: [0xff,0xff,0x60,0xda,0x01,0xff,0x03,0x05]
+0xff,0xff,0x60,0xda,0x01,0xff,0x03,0x05
+
+# GFX90A: ds_cmpst_rtn_b32 a5, v1, a2, a255 offset:65535 ; encoding: [0xff,0xff,0x60,0xda,0x01,0x02,0xff,0x05]
+0xff,0xff,0x60,0xda,0x01,0x02,0xff,0x05
+
+# GFX90A: ds_cmpst_rtn_b32 a5, v1, a2, a3 ; encoding: [0x00,0x00,0x60,0xda,0x01,0x02,0x03,0x05]
+0x00,0x00,0x60,0xda,0x01,0x02,0x03,0x05
+
+# GFX90A: ds_cmpst_rtn_b32 a5, v1, a2, a3 ; encoding: [0x00,0x00,0x60,0xda,0x01,0x02,0x03,0x05]
+0x00,0x00,0x60,0xda,0x01,0x02,0x03,0x05
+
+# GFX90A: ds_cmpst_rtn_b32 a5, v1, a2, a3 offset:4 ; encoding: [0x04,0x00,0x60,0xda,0x01,0x02,0x03,0x05]
+0x04,0x00,0x60,0xda,0x01,0x02,0x03,0x05
+
+# GFX90A: ds_cmpst_rtn_b32 a5, v1, a2, a3 offset:65535 gds ; encoding: [0xff,0xff,0x61,0xda,0x01,0x02,0x03,0x05]
+0xff,0xff,0x61,0xda,0x01,0x02,0x03,0x05
+
+# GFX90A: ds_cmpst_rtn_f32 a5, v1, a2, a3 offset:65535 ; encoding: [0xff,0xff,0x62,0xda,0x01,0x02,0x03,0x05]
+0xff,0xff,0x62,0xda,0x01,0x02,0x03,0x05
+
+# GFX90A: ds_cmpst_rtn_f32 a255, v1, a2, a3 offset:65535 ; encoding: [0xff,0xff,0x62,0xda,0x01,0x02,0x03,0xff]
+0xff,0xff,0x62,0xda,0x01,0x02,0x03,0xff
+
+# GFX90A: ds_cmpst_rtn_f32 a5, v255, a2, a3 offset:65535 ; encoding: [0xff,0xff,0x62,0xda,0xff,0x02,0x03,0x05]
+0xff,0xff,0x62,0xda,0xff,0x02,0x03,0x05
+
+# GFX90A: ds_cmpst_rtn_f32 a5, v1, a255, a3 offset:65535 ; encoding: [0xff,0xff,0x62,0xda,0x01,0xff,0x03,0x05]
+0xff,0xff,0x62,0xda,0x01,0xff,0x03,0x05
+
+# GFX90A: ds_cmpst_rtn_f32 a5, v1, a2, a255 offset:65535 ; encoding: [0xff,0xff,0x62,0xda,0x01,0x02,0xff,0x05]
+0xff,0xff,0x62,0xda,0x01,0x02,0xff,0x05
+
+# GFX90A: ds_cmpst_rtn_f32 a5, v1, a2, a3 ; encoding: [0x00,0x00,0x62,0xda,0x01,0x02,0x03,0x05]
+0x00,0x00,0x62,0xda,0x01,0x02,0x03,0x05
+
+# GFX90A: ds_cmpst_rtn_f32 a5, v1, a2, a3 ; encoding: [0x00,0x00,0x62,0xda,0x01,0x02,0x03,0x05]
+0x00,0x00,0x62,0xda,0x01,0x02,0x03,0x05
+
+# GFX90A: ds_cmpst_rtn_f32 a5, v1, a2, a3 offset:4 ; encoding: [0x04,0x00,0x62,0xda,0x01,0x02,0x03,0x05]
+0x04,0x00,0x62,0xda,0x01,0x02,0x03,0x05
+
+# GFX90A: ds_cmpst_rtn_f32 a5, v1, a2, a3 offset:65535 gds ; encoding: [0xff,0xff,0x63,0xda,0x01,0x02,0x03,0x05]
+0xff,0xff,0x63,0xda,0x01,0x02,0x03,0x05
+
+# GFX90A: ds_min_rtn_f32 a5, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x64,0xda,0x01,0x02,0x00,0x05]
+0xff,0xff,0x64,0xda,0x01,0x02,0x00,0x05
+
+# GFX90A: ds_min_rtn_f32 a255, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x64,0xda,0x01,0x02,0x00,0xff]
+0xff,0xff,0x64,0xda,0x01,0x02,0x00,0xff
+
+# GFX90A: ds_min_rtn_f32 a5, v255, a2 offset:65535 ; encoding: [0xff,0xff,0x64,0xda,0xff,0x02,0x00,0x05]
+0xff,0xff,0x64,0xda,0xff,0x02,0x00,0x05
+
+# GFX90A: ds_min_rtn_f32 a5, v1, a255 offset:65535 ; encoding: [0xff,0xff,0x64,0xda,0x01,0xff,0x00,0x05]
+0xff,0xff,0x64,0xda,0x01,0xff,0x00,0x05
+
+# GFX90A: ds_min_rtn_f32 a5, v1, a2       ; encoding: [0x00,0x00,0x64,0xda,0x01,0x02,0x00,0x05]
+0x00,0x00,0x64,0xda,0x01,0x02,0x00,0x05
+
+# GFX90A: ds_min_rtn_f32 a5, v1, a2       ; encoding: [0x00,0x00,0x64,0xda,0x01,0x02,0x00,0x05]
+0x00,0x00,0x64,0xda,0x01,0x02,0x00,0x05
+
+# GFX90A: ds_min_rtn_f32 a5, v1, a2 offset:4 ; encoding: [0x04,0x00,0x64,0xda,0x01,0x02,0x00,0x05]
+0x04,0x00,0x64,0xda,0x01,0x02,0x00,0x05
+
+# GFX90A: ds_min_rtn_f32 a5, v1, a2 offset:65535 gds ; encoding: [0xff,0xff,0x65,0xda,0x01,0x02,0x00,0x05]
+0xff,0xff,0x65,0xda,0x01,0x02,0x00,0x05
+
+# GFX90A: ds_max_rtn_f32 a5, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x66,0xda,0x01,0x02,0x00,0x05]
+0xff,0xff,0x66,0xda,0x01,0x02,0x00,0x05
+
+# GFX90A: ds_max_rtn_f32 a255, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x66,0xda,0x01,0x02,0x00,0xff]
+0xff,0xff,0x66,0xda,0x01,0x02,0x00,0xff
+
+# GFX90A: ds_max_rtn_f32 a5, v255, a2 offset:65535 ; encoding: [0xff,0xff,0x66,0xda,0xff,0x02,0x00,0x05]
+0xff,0xff,0x66,0xda,0xff,0x02,0x00,0x05
+
+# GFX90A: ds_max_rtn_f32 a5, v1, a255 offset:65535 ; encoding: [0xff,0xff,0x66,0xda,0x01,0xff,0x00,0x05]
+0xff,0xff,0x66,0xda,0x01,0xff,0x00,0x05
+
+# GFX90A: ds_max_rtn_f32 a5, v1, a2       ; encoding: [0x00,0x00,0x66,0xda,0x01,0x02,0x00,0x05]
+0x00,0x00,0x66,0xda,0x01,0x02,0x00,0x05
+
+# GFX90A: ds_max_rtn_f32 a5, v1, a2       ; encoding: [0x00,0x00,0x66,0xda,0x01,0x02,0x00,0x05]
+0x00,0x00,0x66,0xda,0x01,0x02,0x00,0x05
+
+# GFX90A: ds_max_rtn_f32 a5, v1, a2 offset:4 ; encoding: [0x04,0x00,0x66,0xda,0x01,0x02,0x00,0x05]
+0x04,0x00,0x66,0xda,0x01,0x02,0x00,0x05
+
+# GFX90A: ds_max_rtn_f32 a5, v1, a2 offset:65535 gds ; encoding: [0xff,0xff,0x67,0xda,0x01,0x02,0x00,0x05]
+0xff,0xff,0x67,0xda,0x01,0x02,0x00,0x05
+
+# GFX90A: ds_wrap_rtn_b32 a5, v1, a2, a5 offset:65535 ; encoding: [0xff,0xff,0x68,0xda,0x01,0x02,0x05,0x05]
+0xff,0xff,0x68,0xda,0x01,0x02,0x05,0x05
+
+# GFX90A: ds_wrap_rtn_b32 a255, v1, a2, a5 offset:65535 ; encoding: [0xff,0xff,0x68,0xda,0x01,0x02,0x05,0xff]
+0xff,0xff,0x68,0xda,0x01,0x02,0x05,0xff
+
+# GFX90A: ds_wrap_rtn_b32 a5, v255, a2, a5 offset:65535 ; encoding: [0xff,0xff,0x68,0xda,0xff,0x02,0x05,0x05]
+0xff,0xff,0x68,0xda,0xff,0x02,0x05,0x05
+
+# GFX90A: ds_wrap_rtn_b32 a5, v1, a255, a3 offset:65535 ; encoding: [0xff,0xff,0x68,0xda,0x01,0xff,0x03,0x05]
+0xff,0xff,0x68,0xda,0x01,0xff,0x03,0x05
+
+# GFX90A: ds_wrap_rtn_b32 a5, v1, a2, a5 offset:65535 ; encoding: [0xff,0xff,0x68,0xda,0x01,0x02,0x05,0x05]
+0xff,0xff,0x68,0xda,0x01,0x02,0x05,0x05
+
+# GFX90A: ds_wrap_rtn_b32 a5, v1, a2, a5  ; encoding: [0x00,0x00,0x68,0xda,0x01,0x02,0x05,0x05]
+0x00,0x00,0x68,0xda,0x01,0x02,0x05,0x05
+
+# GFX90A: ds_wrap_rtn_b32 a5, v1, a2, a5  ; encoding: [0x00,0x00,0x68,0xda,0x01,0x02,0x05,0x05]
+0x00,0x00,0x68,0xda,0x01,0x02,0x05,0x05
+
+# GFX90A: ds_wrap_rtn_b32 a5, v1, a2, a5 offset:4 ; encoding: [0x04,0x00,0x68,0xda,0x01,0x02,0x05,0x05]
+0x04,0x00,0x68,0xda,0x01,0x02,0x05,0x05
+
+# GFX90A: ds_wrap_rtn_b32 a5, v1, a2, a5 offset:65535 gds ; encoding: [0xff,0xff,0x69,0xda,0x01,0x02,0x05,0x05]
+0xff,0xff,0x69,0xda,0x01,0x02,0x05,0x05
+
+# GFX90A: ds_add_rtn_f32 a5, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x6a,0xda,0x01,0x02,0x00,0x05]
+0xff,0xff,0x6a,0xda,0x01,0x02,0x00,0x05
+
+# GFX90A: ds_add_rtn_f32 a255, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x6a,0xda,0x01,0x02,0x00,0xff]
+0xff,0xff,0x6a,0xda,0x01,0x02,0x00,0xff
+
+# GFX90A: ds_add_rtn_f32 a5, v255, a2 offset:65535 ; encoding: [0xff,0xff,0x6a,0xda,0xff,0x02,0x00,0x05]
+0xff,0xff,0x6a,0xda,0xff,0x02,0x00,0x05
+
+# GFX90A: ds_add_rtn_f32 a5, v1, a255 offset:65535 ; encoding: [0xff,0xff,0x6a,0xda,0x01,0xff,0x00,0x05]
+0xff,0xff,0x6a,0xda,0x01,0xff,0x00,0x05
+
+# GFX90A: ds_add_rtn_f32 a5, v1, a2       ; encoding: [0x00,0x00,0x6a,0xda,0x01,0x02,0x00,0x05]
+0x00,0x00,0x6a,0xda,0x01,0x02,0x00,0x05
+
+# GFX90A: ds_add_rtn_f32 a5, v1, a2       ; encoding: [0x00,0x00,0x6a,0xda,0x01,0x02,0x00,0x05]
+0x00,0x00,0x6a,0xda,0x01,0x02,0x00,0x05
+
+# GFX90A: ds_add_rtn_f32 a5, v1, a2 offset:4 ; encoding: [0x04,0x00,0x6a,0xda,0x01,0x02,0x00,0x05]
+0x04,0x00,0x6a,0xda,0x01,0x02,0x00,0x05
+
+# GFX90A: ds_add_rtn_f32 a5, v1, a2 offset:65535 gds ; encoding: [0xff,0xff,0x6b,0xda,0x01,0x02,0x00,0x05]
+0xff,0xff,0x6b,0xda,0x01,0x02,0x00,0x05
+
+# GFX90A: ds_read_b32 a5, v1 offset:65535 ; encoding: [0xff,0xff,0x6c,0xda,0x01,0x00,0x00,0x05]
+0xff,0xff,0x6c,0xda,0x01,0x00,0x00,0x05
+
+# GFX90A: ds_read_b32 a255, v1 offset:65535 ; encoding: [0xff,0xff,0x6c,0xda,0x01,0x00,0x00,0xff]
+0xff,0xff,0x6c,0xda,0x01,0x00,0x00,0xff
+
+# GFX90A: ds_read_b32 a5, v255 offset:65535 ; encoding: [0xff,0xff,0x6c,0xda,0xff,0x00,0x00,0x05]
+0xff,0xff,0x6c,0xda,0xff,0x00,0x00,0x05
+
+# GFX90A: ds_read_b32 a5, v1              ; encoding: [0x00,0x00,0x6c,0xda,0x01,0x00,0x00,0x05]
+0x00,0x00,0x6c,0xda,0x01,0x00,0x00,0x05
+
+# GFX90A: ds_read_b32 a5, v1              ; encoding: [0x00,0x00,0x6c,0xda,0x01,0x00,0x00,0x05]
+0x00,0x00,0x6c,0xda,0x01,0x00,0x00,0x05
+
+# GFX90A: ds_read_b32 a5, v1 offset:4     ; encoding: [0x04,0x00,0x6c,0xda,0x01,0x00,0x00,0x05]
+0x04,0x00,0x6c,0xda,0x01,0x00,0x00,0x05
+
+# GFX90A: ds_read_b32 a5, v1 offset:65535 gds ; encoding: [0xff,0xff,0x6d,0xda,0x01,0x00,0x00,0x05]
+0xff,0xff,0x6d,0xda,0x01,0x00,0x00,0x05
+
+# GFX90A: ds_read2_b32 a[6:7], v1 offset0:127 offset1:255 ; encoding: [0x7f,0xff,0x6e,0xda,0x01,0x00,0x00,0x06]
+0x7f,0xff,0x6e,0xda,0x01,0x00,0x00,0x06
+
+# GFX90A: ds_read2_b32 a[254:255], v1 offset0:127 offset1:255 ; encoding: [0x7f,0xff,0x6e,0xda,0x01,0x00,0x00,0xfe]
+0x7f,0xff,0x6e,0xda,0x01,0x00,0x00,0xfe
+
+# GFX90A: ds_read2_b32 a[6:7], v255 offset0:127 offset1:255 ; encoding: [0x7f,0xff,0x6e,0xda,0xff,0x00,0x00,0x06]
+0x7f,0xff,0x6e,0xda,0xff,0x00,0x00,0x06
+
+# GFX90A: ds_read2_b32 a[6:7], v1 offset1:255 ; encoding: [0x00,0xff,0x6e,0xda,0x01,0x00,0x00,0x06]
+0x00,0xff,0x6e,0xda,0x01,0x00,0x00,0x06
+
+# GFX90A: ds_read2_b32 a[6:7], v1 offset1:255 ; encoding: [0x00,0xff,0x6e,0xda,0x01,0x00,0x00,0x06]
+0x00,0xff,0x6e,0xda,0x01,0x00,0x00,0x06
+
+# GFX90A: ds_read2_b32 a[6:7], v1 offset0:16 offset1:255 ; encoding: [0x10,0xff,0x6e,0xda,0x01,0x00,0x00,0x06]
+0x10,0xff,0x6e,0xda,0x01,0x00,0x00,0x06
+
+# GFX90A: ds_read2_b32 a[6:7], v1 offset0:127 ; encoding: [0x7f,0x00,0x6e,0xda,0x01,0x00,0x00,0x06]
+0x7f,0x00,0x6e,0xda,0x01,0x00,0x00,0x06
+
+# GFX90A: ds_read2_b32 a[6:7], v1 offset0:127 ; encoding: [0x7f,0x00,0x6e,0xda,0x01,0x00,0x00,0x06]
+0x7f,0x00,0x6e,0xda,0x01,0x00,0x00,0x06
+
+# GFX90A: ds_read2_b32 a[6:7], v1 offset0:127 offset1:1 ; encoding: [0x7f,0x01,0x6e,0xda,0x01,0x00,0x00,0x06]
+0x7f,0x01,0x6e,0xda,0x01,0x00,0x00,0x06
+
+# GFX90A: ds_read2_b32 a[6:7], v1 offset0:127 offset1:255 gds ; encoding: [0x7f,0xff,0x6f,0xda,0x01,0x00,0x00,0x06]
+0x7f,0xff,0x6f,0xda,0x01,0x00,0x00,0x06
+
+# GFX90A: ds_read2st64_b32 a[6:7], v1 offset0:127 offset1:255 ; encoding: [0x7f,0xff,0x70,0xda,0x01,0x00,0x00,0x06]
+0x7f,0xff,0x70,0xda,0x01,0x00,0x00,0x06
+
+# GFX90A: ds_read2st64_b32 a[254:255], v1 offset0:127 offset1:255 ; encoding: [0x7f,0xff,0x70,0xda,0x01,0x00,0x00,0xfe]
+0x7f,0xff,0x70,0xda,0x01,0x00,0x00,0xfe
+
+# GFX90A: ds_read2st64_b32 a[6:7], v255 offset0:127 offset1:255 ; encoding: [0x7f,0xff,0x70,0xda,0xff,0x00,0x00,0x06]
+0x7f,0xff,0x70,0xda,0xff,0x00,0x00,0x06
+
+# GFX90A: ds_read2st64_b32 a[6:7], v1 offset1:255 ; encoding: [0x00,0xff,0x70,0xda,0x01,0x00,0x00,0x06]
+0x00,0xff,0x70,0xda,0x01,0x00,0x00,0x06
+
+# GFX90A: ds_read2st64_b32 a[6:7], v1 offset1:255 ; encoding: [0x00,0xff,0x70,0xda,0x01,0x00,0x00,0x06]
+0x00,0xff,0x70,0xda,0x01,0x00,0x00,0x06
+
+# GFX90A: ds_read2st64_b32 a[6:7], v1 offset0:16 offset1:255 ; encoding: [0x10,0xff,0x70,0xda,0x01,0x00,0x00,0x06]
+0x10,0xff,0x70,0xda,0x01,0x00,0x00,0x06
+
+# GFX90A: ds_read2st64_b32 a[6:7], v1 offset0:127 ; encoding: [0x7f,0x00,0x70,0xda,0x01,0x00,0x00,0x06]
+0x7f,0x00,0x70,0xda,0x01,0x00,0x00,0x06
+
+# GFX90A: ds_read2st64_b32 a[6:7], v1 offset0:127 ; encoding: [0x7f,0x00,0x70,0xda,0x01,0x00,0x00,0x06]
+0x7f,0x00,0x70,0xda,0x01,0x00,0x00,0x06
+
+# GFX90A: ds_read2st64_b32 a[6:7], v1 offset0:127 offset1:1 ; encoding: [0x7f,0x01,0x70,0xda,0x01,0x00,0x00,0x06]
+0x7f,0x01,0x70,0xda,0x01,0x00,0x00,0x06
+
+# GFX90A: ds_read2st64_b32 a[6:7], v1 offset0:127 offset1:255 gds ; encoding: [0x7f,0xff,0x71,0xda,0x01,0x00,0x00,0x06]
+0x7f,0xff,0x71,0xda,0x01,0x00,0x00,0x06
+
+# GFX90A: ds_read_i8 a5, v1 offset:65535  ; encoding: [0xff,0xff,0x72,0xda,0x01,0x00,0x00,0x05]
+0xff,0xff,0x72,0xda,0x01,0x00,0x00,0x05
+
+# GFX90A: ds_read_i8 a255, v1 offset:65535 ; encoding: [0xff,0xff,0x72,0xda,0x01,0x00,0x00,0xff]
+0xff,0xff,0x72,0xda,0x01,0x00,0x00,0xff
+
+# GFX90A: ds_read_i8 a5, v255 offset:65535 ; encoding: [0xff,0xff,0x72,0xda,0xff,0x00,0x00,0x05]
+0xff,0xff,0x72,0xda,0xff,0x00,0x00,0x05
+
+# GFX90A: ds_read_i8 a5, v1               ; encoding: [0x00,0x00,0x72,0xda,0x01,0x00,0x00,0x05]
+0x00,0x00,0x72,0xda,0x01,0x00,0x00,0x05
+
+# GFX90A: ds_read_i8 a5, v1               ; encoding: [0x00,0x00,0x72,0xda,0x01,0x00,0x00,0x05]
+0x00,0x00,0x72,0xda,0x01,0x00,0x00,0x05
+
+# GFX90A: ds_read_i8 a5, v1 offset:4      ; encoding: [0x04,0x00,0x72,0xda,0x01,0x00,0x00,0x05]
+0x04,0x00,0x72,0xda,0x01,0x00,0x00,0x05
+
+# GFX90A: ds_read_i8 a5, v1 offset:65535 gds ; encoding: [0xff,0xff,0x73,0xda,0x01,0x00,0x00,0x05]
+0xff,0xff,0x73,0xda,0x01,0x00,0x00,0x05
+
+# GFX90A: ds_read_u8 a5, v1 offset:65535  ; encoding: [0xff,0xff,0x74,0xda,0x01,0x00,0x00,0x05]
+0xff,0xff,0x74,0xda,0x01,0x00,0x00,0x05
+
+# GFX90A: ds_read_u8 a255, v1 offset:65535 ; encoding: [0xff,0xff,0x74,0xda,0x01,0x00,0x00,0xff]
+0xff,0xff,0x74,0xda,0x01,0x00,0x00,0xff
+
+# GFX90A: ds_read_u8 a5, v255 offset:65535 ; encoding: [0xff,0xff,0x74,0xda,0xff,0x00,0x00,0x05]
+0xff,0xff,0x74,0xda,0xff,0x00,0x00,0x05
+
+# GFX90A: ds_read_u8 a5, v1               ; encoding: [0x00,0x00,0x74,0xda,0x01,0x00,0x00,0x05]
+0x00,0x00,0x74,0xda,0x01,0x00,0x00,0x05
+
+# GFX90A: ds_read_u8 a5, v1               ; encoding: [0x00,0x00,0x74,0xda,0x01,0x00,0x00,0x05]
+0x00,0x00,0x74,0xda,0x01,0x00,0x00,0x05
+
+# GFX90A: ds_read_u8 a5, v1 offset:4      ; encoding: [0x04,0x00,0x74,0xda,0x01,0x00,0x00,0x05]
+0x04,0x00,0x74,0xda,0x01,0x00,0x00,0x05
+
+# GFX90A: ds_read_u8 a5, v1 offset:65535 gds ; encoding: [0xff,0xff,0x75,0xda,0x01,0x00,0x00,0x05]
+0xff,0xff,0x75,0xda,0x01,0x00,0x00,0x05
+
+# GFX90A: ds_read_i16 a5, v1 offset:65535 ; encoding: [0xff,0xff,0x76,0xda,0x01,0x00,0x00,0x05]
+0xff,0xff,0x76,0xda,0x01,0x00,0x00,0x05
+
+# GFX90A: ds_read_i16 a255, v1 offset:65535 ; encoding: [0xff,0xff,0x76,0xda,0x01,0x00,0x00,0xff]
+0xff,0xff,0x76,0xda,0x01,0x00,0x00,0xff
+
+# GFX90A: ds_read_i16 a5, v255 offset:65535 ; encoding: [0xff,0xff,0x76,0xda,0xff,0x00,0x00,0x05]
+0xff,0xff,0x76,0xda,0xff,0x00,0x00,0x05
+
+# GFX90A: ds_read_i16 a5, v1              ; encoding: [0x00,0x00,0x76,0xda,0x01,0x00,0x00,0x05]
+0x00,0x00,0x76,0xda,0x01,0x00,0x00,0x05
+
+# GFX90A: ds_read_i16 a5, v1              ; encoding: [0x00,0x00,0x76,0xda,0x01,0x00,0x00,0x05]
+0x00,0x00,0x76,0xda,0x01,0x00,0x00,0x05
+
+# GFX90A: ds_read_i16 a5, v1 offset:4     ; encoding: [0x04,0x00,0x76,0xda,0x01,0x00,0x00,0x05]
+0x04,0x00,0x76,0xda,0x01,0x00,0x00,0x05
+
+# GFX90A: ds_read_i16 a5, v1 offset:65535 gds ; encoding: [0xff,0xff,0x77,0xda,0x01,0x00,0x00,0x05]
+0xff,0xff,0x77,0xda,0x01,0x00,0x00,0x05
+
+# GFX90A: ds_read_u16 a5, v1 offset:65535 ; encoding: [0xff,0xff,0x78,0xda,0x01,0x00,0x00,0x05]
+0xff,0xff,0x78,0xda,0x01,0x00,0x00,0x05
+
+# GFX90A: ds_read_u16 a255, v1 offset:65535 ; encoding: [0xff,0xff,0x78,0xda,0x01,0x00,0x00,0xff]
+0xff,0xff,0x78,0xda,0x01,0x00,0x00,0xff
+
+# GFX90A: ds_read_u16 a5, v255 offset:65535 ; encoding: [0xff,0xff,0x78,0xda,0xff,0x00,0x00,0x05]
+0xff,0xff,0x78,0xda,0xff,0x00,0x00,0x05
+
+# GFX90A: ds_read_u16 a5, v1              ; encoding: [0x00,0x00,0x78,0xda,0x01,0x00,0x00,0x05]
+0x00,0x00,0x78,0xda,0x01,0x00,0x00,0x05
+
+# GFX90A: ds_read_u16 a5, v1              ; encoding: [0x00,0x00,0x78,0xda,0x01,0x00,0x00,0x05]
+0x00,0x00,0x78,0xda,0x01,0x00,0x00,0x05
+
+# GFX90A: ds_read_u16 a5, v1 offset:4     ; encoding: [0x04,0x00,0x78,0xda,0x01,0x00,0x00,0x05]
+0x04,0x00,0x78,0xda,0x01,0x00,0x00,0x05
+
+# GFX90A: ds_read_u16 a5, v1 offset:65535 gds ; encoding: [0xff,0xff,0x79,0xda,0x01,0x00,0x00,0x05]
+0xff,0xff,0x79,0xda,0x01,0x00,0x00,0x05
+
+# GFX90A: ds_swizzle_b32 a5, v1 offset:65535 ; encoding: [0xff,0xff,0x7a,0xda,0x01,0x00,0x00,0x05]
+0xff,0xff,0x7a,0xda,0x01,0x00,0x00,0x05
+
+# GFX90A: ds_swizzle_b32 a255, v1 offset:65535 ; encoding: [0xff,0xff,0x7a,0xda,0x01,0x00,0x00,0xff]
+0xff,0xff,0x7a,0xda,0x01,0x00,0x00,0xff
+
+# GFX90A: ds_swizzle_b32 a5, v255 offset:65535 ; encoding: [0xff,0xff,0x7a,0xda,0xff,0x00,0x00,0x05]
+0xff,0xff,0x7a,0xda,0xff,0x00,0x00,0x05
+
+# GFX90A: ds_swizzle_b32 a5, v1           ; encoding: [0x00,0x00,0x7a,0xda,0x01,0x00,0x00,0x05]
+0x00,0x00,0x7a,0xda,0x01,0x00,0x00,0x05
+
+# GFX90A: ds_swizzle_b32 a5, v1           ; encoding: [0x00,0x00,0x7a,0xda,0x01,0x00,0x00,0x05]
+0x00,0x00,0x7a,0xda,0x01,0x00,0x00,0x05
+
+# GFX90A: ds_swizzle_b32 a5, v1 offset:swizzle(BITMASK_PERM,"00p00") ; encoding: [0x04,0x00,0x7a,0xda,0x01,0x00,0x00,0x05]
+0x04,0x00,0x7a,0xda,0x01,0x00,0x00,0x05
+
+# GFX90A: ds_swizzle_b32 a5, v1 offset:65535 gds ; encoding: [0xff,0xff,0x7b,0xda,0x01,0x00,0x00,0x05]
+0xff,0xff,0x7b,0xda,0x01,0x00,0x00,0x05
+
+# GFX90A: ds_permute_b32 a5, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x7c,0xda,0x01,0x02,0x00,0x05]
+0xff,0xff,0x7c,0xda,0x01,0x02,0x00,0x05
+
+# GFX90A: ds_permute_b32 a255, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x7c,0xda,0x01,0x02,0x00,0xff]
+0xff,0xff,0x7c,0xda,0x01,0x02,0x00,0xff
+
+# GFX90A: ds_permute_b32 a5, v255, a2 offset:65535 ; encoding: [0xff,0xff,0x7c,0xda,0xff,0x02,0x00,0x05]
+0xff,0xff,0x7c,0xda,0xff,0x02,0x00,0x05
+
+# GFX90A: ds_permute_b32 a5, v1, a255 offset:65535 ; encoding: [0xff,0xff,0x7c,0xda,0x01,0xff,0x00,0x05]
+0xff,0xff,0x7c,0xda,0x01,0xff,0x00,0x05
+
+# GFX90A: ds_permute_b32 a5, v1, a2       ; encoding: [0x00,0x00,0x7c,0xda,0x01,0x02,0x00,0x05]
+0x00,0x00,0x7c,0xda,0x01,0x02,0x00,0x05
+
+# GFX90A: ds_permute_b32 a5, v1, a2       ; encoding: [0x00,0x00,0x7c,0xda,0x01,0x02,0x00,0x05]
+0x00,0x00,0x7c,0xda,0x01,0x02,0x00,0x05
+
+# GFX90A: ds_permute_b32 a5, v1, a2 offset:4 ; encoding: [0x04,0x00,0x7c,0xda,0x01,0x02,0x00,0x05]
+0x04,0x00,0x7c,0xda,0x01,0x02,0x00,0x05
+
+# GFX90A: ds_bpermute_b32 a5, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x7e,0xda,0x01,0x02,0x00,0x05]
+0xff,0xff,0x7e,0xda,0x01,0x02,0x00,0x05
+
+# GFX90A: ds_bpermute_b32 a255, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x7e,0xda,0x01,0x02,0x00,0xff]
+0xff,0xff,0x7e,0xda,0x01,0x02,0x00,0xff
+
+# GFX90A: ds_bpermute_b32 a5, v255, a2 offset:65535 ; encoding: [0xff,0xff,0x7e,0xda,0xff,0x02,0x00,0x05]
+0xff,0xff,0x7e,0xda,0xff,0x02,0x00,0x05
+
+# GFX90A: ds_bpermute_b32 a5, v1, a255 offset:65535 ; encoding: [0xff,0xff,0x7e,0xda,0x01,0xff,0x00,0x05]
+0xff,0xff,0x7e,0xda,0x01,0xff,0x00,0x05
+
+# GFX90A: ds_bpermute_b32 a5, v1, a2      ; encoding: [0x00,0x00,0x7e,0xda,0x01,0x02,0x00,0x05]
+0x00,0x00,0x7e,0xda,0x01,0x02,0x00,0x05
+
+# GFX90A: ds_bpermute_b32 a5, v1, a2      ; encoding: [0x00,0x00,0x7e,0xda,0x01,0x02,0x00,0x05]
+0x00,0x00,0x7e,0xda,0x01,0x02,0x00,0x05
+
+# GFX90A: ds_bpermute_b32 a5, v1, a2 offset:4 ; encoding: [0x04,0x00,0x7e,0xda,0x01,0x02,0x00,0x05]
+0x04,0x00,0x7e,0xda,0x01,0x02,0x00,0x05
+
+# GFX90A: ds_add_u64 v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0x80,0xda,0x01,0x02,0x00,0x00]
+0xff,0xff,0x80,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_add_u64 v255, a[2:3] offset:65535 ; encoding: [0xff,0xff,0x80,0xda,0xff,0x02,0x00,0x00]
+0xff,0xff,0x80,0xda,0xff,0x02,0x00,0x00
+
+# GFX90A: ds_add_u64 v1, a[254:255] offset:65535 ; encoding: [0xff,0xff,0x80,0xda,0x01,0xfe,0x00,0x00]
+0xff,0xff,0x80,0xda,0x01,0xfe,0x00,0x00
+
+# GFX90A: ds_add_u64 v1, a[2:3]           ; encoding: [0x00,0x00,0x80,0xda,0x01,0x02,0x00,0x00]
+0x00,0x00,0x80,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_add_u64 v1, a[2:3]           ; encoding: [0x00,0x00,0x80,0xda,0x01,0x02,0x00,0x00]
+0x00,0x00,0x80,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_add_u64 v1, a[2:3] offset:4  ; encoding: [0x04,0x00,0x80,0xda,0x01,0x02,0x00,0x00]
+0x04,0x00,0x80,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_add_u64 v1, a[2:3] offset:65535 gds ; encoding: [0xff,0xff,0x81,0xda,0x01,0x02,0x00,0x00]
+0xff,0xff,0x81,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_sub_u64 v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0x82,0xda,0x01,0x02,0x00,0x00]
+0xff,0xff,0x82,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_sub_u64 v255, a[2:3] offset:65535 ; encoding: [0xff,0xff,0x82,0xda,0xff,0x02,0x00,0x00]
+0xff,0xff,0x82,0xda,0xff,0x02,0x00,0x00
+
+# GFX90A: ds_sub_u64 v1, a[254:255] offset:65535 ; encoding: [0xff,0xff,0x82,0xda,0x01,0xfe,0x00,0x00]
+0xff,0xff,0x82,0xda,0x01,0xfe,0x00,0x00
+
+# GFX90A: ds_sub_u64 v1, a[2:3]           ; encoding: [0x00,0x00,0x82,0xda,0x01,0x02,0x00,0x00]
+0x00,0x00,0x82,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_sub_u64 v1, a[2:3]           ; encoding: [0x00,0x00,0x82,0xda,0x01,0x02,0x00,0x00]
+0x00,0x00,0x82,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_sub_u64 v1, a[2:3] offset:4  ; encoding: [0x04,0x00,0x82,0xda,0x01,0x02,0x00,0x00]
+0x04,0x00,0x82,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_sub_u64 v1, a[2:3] offset:65535 gds ; encoding: [0xff,0xff,0x83,0xda,0x01,0x02,0x00,0x00]
+0xff,0xff,0x83,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_rsub_u64 v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0x84,0xda,0x01,0x02,0x00,0x00]
+0xff,0xff,0x84,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_rsub_u64 v255, a[2:3] offset:65535 ; encoding: [0xff,0xff,0x84,0xda,0xff,0x02,0x00,0x00]
+0xff,0xff,0x84,0xda,0xff,0x02,0x00,0x00
+
+# GFX90A: ds_rsub_u64 v1, a[254:255] offset:65535 ; encoding: [0xff,0xff,0x84,0xda,0x01,0xfe,0x00,0x00]
+0xff,0xff,0x84,0xda,0x01,0xfe,0x00,0x00
+
+# GFX90A: ds_rsub_u64 v1, a[2:3]          ; encoding: [0x00,0x00,0x84,0xda,0x01,0x02,0x00,0x00]
+0x00,0x00,0x84,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_rsub_u64 v1, a[2:3]          ; encoding: [0x00,0x00,0x84,0xda,0x01,0x02,0x00,0x00]
+0x00,0x00,0x84,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_rsub_u64 v1, a[2:3] offset:4 ; encoding: [0x04,0x00,0x84,0xda,0x01,0x02,0x00,0x00]
+0x04,0x00,0x84,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_rsub_u64 v1, a[2:3] offset:65535 gds ; encoding: [0xff,0xff,0x85,0xda,0x01,0x02,0x00,0x00]
+0xff,0xff,0x85,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_inc_u64 v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0x86,0xda,0x01,0x02,0x00,0x00]
+0xff,0xff,0x86,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_inc_u64 v255, a[2:3] offset:65535 ; encoding: [0xff,0xff,0x86,0xda,0xff,0x02,0x00,0x00]
+0xff,0xff,0x86,0xda,0xff,0x02,0x00,0x00
+
+# GFX90A: ds_inc_u64 v1, a[254:255] offset:65535 ; encoding: [0xff,0xff,0x86,0xda,0x01,0xfe,0x00,0x00]
+0xff,0xff,0x86,0xda,0x01,0xfe,0x00,0x00
+
+# GFX90A: ds_inc_u64 v1, a[2:3]           ; encoding: [0x00,0x00,0x86,0xda,0x01,0x02,0x00,0x00]
+0x00,0x00,0x86,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_inc_u64 v1, a[2:3]           ; encoding: [0x00,0x00,0x86,0xda,0x01,0x02,0x00,0x00]
+0x00,0x00,0x86,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_inc_u64 v1, a[2:3] offset:4  ; encoding: [0x04,0x00,0x86,0xda,0x01,0x02,0x00,0x00]
+0x04,0x00,0x86,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_inc_u64 v1, a[2:3] offset:65535 gds ; encoding: [0xff,0xff,0x87,0xda,0x01,0x02,0x00,0x00]
+0xff,0xff,0x87,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_dec_u64 v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0x88,0xda,0x01,0x02,0x00,0x00]
+0xff,0xff,0x88,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_dec_u64 v255, a[2:3] offset:65535 ; encoding: [0xff,0xff,0x88,0xda,0xff,0x02,0x00,0x00]
+0xff,0xff,0x88,0xda,0xff,0x02,0x00,0x00
+
+# GFX90A: ds_dec_u64 v1, a[254:255] offset:65535 ; encoding: [0xff,0xff,0x88,0xda,0x01,0xfe,0x00,0x00]
+0xff,0xff,0x88,0xda,0x01,0xfe,0x00,0x00
+
+# GFX90A: ds_dec_u64 v1, a[2:3]           ; encoding: [0x00,0x00,0x88,0xda,0x01,0x02,0x00,0x00]
+0x00,0x00,0x88,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_dec_u64 v1, a[2:3]           ; encoding: [0x00,0x00,0x88,0xda,0x01,0x02,0x00,0x00]
+0x00,0x00,0x88,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_dec_u64 v1, a[2:3] offset:4  ; encoding: [0x04,0x00,0x88,0xda,0x01,0x02,0x00,0x00]
+0x04,0x00,0x88,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_dec_u64 v1, a[2:3] offset:65535 gds ; encoding: [0xff,0xff,0x89,0xda,0x01,0x02,0x00,0x00]
+0xff,0xff,0x89,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_min_i64 v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0x8a,0xda,0x01,0x02,0x00,0x00]
+0xff,0xff,0x8a,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_min_i64 v255, a[2:3] offset:65535 ; encoding: [0xff,0xff,0x8a,0xda,0xff,0x02,0x00,0x00]
+0xff,0xff,0x8a,0xda,0xff,0x02,0x00,0x00
+
+# GFX90A: ds_min_i64 v1, a[254:255] offset:65535 ; encoding: [0xff,0xff,0x8a,0xda,0x01,0xfe,0x00,0x00]
+0xff,0xff,0x8a,0xda,0x01,0xfe,0x00,0x00
+
+# GFX90A: ds_min_i64 v1, a[2:3]           ; encoding: [0x00,0x00,0x8a,0xda,0x01,0x02,0x00,0x00]
+0x00,0x00,0x8a,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_min_i64 v1, a[2:3]           ; encoding: [0x00,0x00,0x8a,0xda,0x01,0x02,0x00,0x00]
+0x00,0x00,0x8a,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_min_i64 v1, a[2:3] offset:4  ; encoding: [0x04,0x00,0x8a,0xda,0x01,0x02,0x00,0x00]
+0x04,0x00,0x8a,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_min_i64 v1, a[2:3] offset:65535 gds ; encoding: [0xff,0xff,0x8b,0xda,0x01,0x02,0x00,0x00]
+0xff,0xff,0x8b,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_max_i64 v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0x8c,0xda,0x01,0x02,0x00,0x00]
+0xff,0xff,0x8c,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_max_i64 v255, a[2:3] offset:65535 ; encoding: [0xff,0xff,0x8c,0xda,0xff,0x02,0x00,0x00]
+0xff,0xff,0x8c,0xda,0xff,0x02,0x00,0x00
+
+# GFX90A: ds_max_i64 v1, a[254:255] offset:65535 ; encoding: [0xff,0xff,0x8c,0xda,0x01,0xfe,0x00,0x00]
+0xff,0xff,0x8c,0xda,0x01,0xfe,0x00,0x00
+
+# GFX90A: ds_max_i64 v1, a[2:3]           ; encoding: [0x00,0x00,0x8c,0xda,0x01,0x02,0x00,0x00]
+0x00,0x00,0x8c,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_max_i64 v1, a[2:3]           ; encoding: [0x00,0x00,0x8c,0xda,0x01,0x02,0x00,0x00]
+0x00,0x00,0x8c,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_max_i64 v1, a[2:3] offset:4  ; encoding: [0x04,0x00,0x8c,0xda,0x01,0x02,0x00,0x00]
+0x04,0x00,0x8c,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_max_i64 v1, a[2:3] offset:65535 gds ; encoding: [0xff,0xff,0x8d,0xda,0x01,0x02,0x00,0x00]
+0xff,0xff,0x8d,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_min_u64 v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0x8e,0xda,0x01,0x02,0x00,0x00]
+0xff,0xff,0x8e,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_min_u64 v255, a[2:3] offset:65535 ; encoding: [0xff,0xff,0x8e,0xda,0xff,0x02,0x00,0x00]
+0xff,0xff,0x8e,0xda,0xff,0x02,0x00,0x00
+
+# GFX90A: ds_min_u64 v1, a[254:255] offset:65535 ; encoding: [0xff,0xff,0x8e,0xda,0x01,0xfe,0x00,0x00]
+0xff,0xff,0x8e,0xda,0x01,0xfe,0x00,0x00
+
+# GFX90A: ds_min_u64 v1, a[2:3]           ; encoding: [0x00,0x00,0x8e,0xda,0x01,0x02,0x00,0x00]
+0x00,0x00,0x8e,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_min_u64 v1, a[2:3]           ; encoding: [0x00,0x00,0x8e,0xda,0x01,0x02,0x00,0x00]
+0x00,0x00,0x8e,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_min_u64 v1, a[2:3] offset:4  ; encoding: [0x04,0x00,0x8e,0xda,0x01,0x02,0x00,0x00]
+0x04,0x00,0x8e,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_min_u64 v1, a[2:3] offset:65535 gds ; encoding: [0xff,0xff,0x8f,0xda,0x01,0x02,0x00,0x00]
+0xff,0xff,0x8f,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_max_u64 v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0x90,0xda,0x01,0x02,0x00,0x00]
+0xff,0xff,0x90,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_max_u64 v255, a[2:3] offset:65535 ; encoding: [0xff,0xff,0x90,0xda,0xff,0x02,0x00,0x00]
+0xff,0xff,0x90,0xda,0xff,0x02,0x00,0x00
+
+# GFX90A: ds_max_u64 v1, a[254:255] offset:65535 ; encoding: [0xff,0xff,0x90,0xda,0x01,0xfe,0x00,0x00]
+0xff,0xff,0x90,0xda,0x01,0xfe,0x00,0x00
+
+# GFX90A: ds_max_u64 v1, a[2:3]           ; encoding: [0x00,0x00,0x90,0xda,0x01,0x02,0x00,0x00]
+0x00,0x00,0x90,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_max_u64 v1, a[2:3]           ; encoding: [0x00,0x00,0x90,0xda,0x01,0x02,0x00,0x00]
+0x00,0x00,0x90,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_max_u64 v1, a[2:3] offset:4  ; encoding: [0x04,0x00,0x90,0xda,0x01,0x02,0x00,0x00]
+0x04,0x00,0x90,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_max_u64 v1, a[2:3] offset:65535 gds ; encoding: [0xff,0xff,0x91,0xda,0x01,0x02,0x00,0x00]
+0xff,0xff,0x91,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_and_b64 v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0x92,0xda,0x01,0x02,0x00,0x00]
+0xff,0xff,0x92,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_and_b64 v255, a[2:3] offset:65535 ; encoding: [0xff,0xff,0x92,0xda,0xff,0x02,0x00,0x00]
+0xff,0xff,0x92,0xda,0xff,0x02,0x00,0x00
+
+# GFX90A: ds_and_b64 v1, a[254:255] offset:65535 ; encoding: [0xff,0xff,0x92,0xda,0x01,0xfe,0x00,0x00]
+0xff,0xff,0x92,0xda,0x01,0xfe,0x00,0x00
+
+# GFX90A: ds_and_b64 v1, a[2:3]           ; encoding: [0x00,0x00,0x92,0xda,0x01,0x02,0x00,0x00]
+0x00,0x00,0x92,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_and_b64 v1, a[2:3]           ; encoding: [0x00,0x00,0x92,0xda,0x01,0x02,0x00,0x00]
+0x00,0x00,0x92,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_and_b64 v1, a[2:3] offset:4  ; encoding: [0x04,0x00,0x92,0xda,0x01,0x02,0x00,0x00]
+0x04,0x00,0x92,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_and_b64 v1, a[2:3] offset:65535 gds ; encoding: [0xff,0xff,0x93,0xda,0x01,0x02,0x00,0x00]
+0xff,0xff,0x93,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_or_b64 v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0x94,0xda,0x01,0x02,0x00,0x00]
+0xff,0xff,0x94,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_or_b64 v255, a[2:3] offset:65535 ; encoding: [0xff,0xff,0x94,0xda,0xff,0x02,0x00,0x00]
+0xff,0xff,0x94,0xda,0xff,0x02,0x00,0x00
+
+# GFX90A: ds_or_b64 v1, a[254:255] offset:65535 ; encoding: [0xff,0xff,0x94,0xda,0x01,0xfe,0x00,0x00]
+0xff,0xff,0x94,0xda,0x01,0xfe,0x00,0x00
+
+# GFX90A: ds_or_b64 v1, a[2:3]            ; encoding: [0x00,0x00,0x94,0xda,0x01,0x02,0x00,0x00]
+0x00,0x00,0x94,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_or_b64 v1, a[2:3]            ; encoding: [0x00,0x00,0x94,0xda,0x01,0x02,0x00,0x00]
+0x00,0x00,0x94,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_or_b64 v1, a[2:3] offset:4   ; encoding: [0x04,0x00,0x94,0xda,0x01,0x02,0x00,0x00]
+0x04,0x00,0x94,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_or_b64 v1, a[2:3] offset:65535 gds ; encoding: [0xff,0xff,0x95,0xda,0x01,0x02,0x00,0x00]
+0xff,0xff,0x95,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_xor_b64 v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0x96,0xda,0x01,0x02,0x00,0x00]
+0xff,0xff,0x96,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_xor_b64 v255, a[2:3] offset:65535 ; encoding: [0xff,0xff,0x96,0xda,0xff,0x02,0x00,0x00]
+0xff,0xff,0x96,0xda,0xff,0x02,0x00,0x00
+
+# GFX90A: ds_xor_b64 v1, a[254:255] offset:65535 ; encoding: [0xff,0xff,0x96,0xda,0x01,0xfe,0x00,0x00]
+0xff,0xff,0x96,0xda,0x01,0xfe,0x00,0x00
+
+# GFX90A: ds_xor_b64 v1, a[2:3]           ; encoding: [0x00,0x00,0x96,0xda,0x01,0x02,0x00,0x00]
+0x00,0x00,0x96,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_xor_b64 v1, a[2:3]           ; encoding: [0x00,0x00,0x96,0xda,0x01,0x02,0x00,0x00]
+0x00,0x00,0x96,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_xor_b64 v1, a[2:3] offset:4  ; encoding: [0x04,0x00,0x96,0xda,0x01,0x02,0x00,0x00]
+0x04,0x00,0x96,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_xor_b64 v1, a[2:3] offset:65535 gds ; encoding: [0xff,0xff,0x97,0xda,0x01,0x02,0x00,0x00]
+0xff,0xff,0x97,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_mskor_b64 v1, a[2:3], a[4:5] offset:65535 ; encoding: [0xff,0xff,0x98,0xda,0x01,0x02,0x04,0x00]
+0xff,0xff,0x98,0xda,0x01,0x02,0x04,0x00
+
+# GFX90A: ds_mskor_b64 v255, a[2:3], a[4:5] offset:65535 ; encoding: [0xff,0xff,0x98,0xda,0xff,0x02,0x04,0x00]
+0xff,0xff,0x98,0xda,0xff,0x02,0x04,0x00
+
+# GFX90A: ds_mskor_b64 v1, a[254:255], a[4:5] offset:65535 ; encoding: [0xff,0xff,0x98,0xda,0x01,0xfe,0x04,0x00]
+0xff,0xff,0x98,0xda,0x01,0xfe,0x04,0x00
+
+# GFX90A: ds_mskor_b64 v1, a[2:3], a[254:255] offset:65535 ; encoding: [0xff,0xff,0x98,0xda,0x01,0x02,0xfe,0x00]
+0xff,0xff,0x98,0xda,0x01,0x02,0xfe,0x00
+
+# GFX90A: ds_mskor_b64 v1, a[2:3], a[4:5] ; encoding: [0x00,0x00,0x98,0xda,0x01,0x02,0x04,0x00]
+0x00,0x00,0x98,0xda,0x01,0x02,0x04,0x00
+
+# GFX90A: ds_mskor_b64 v1, a[2:3], a[4:5] ; encoding: [0x00,0x00,0x98,0xda,0x01,0x02,0x04,0x00]
+0x00,0x00,0x98,0xda,0x01,0x02,0x04,0x00
+
+# GFX90A: ds_mskor_b64 v1, a[2:3], a[4:5] offset:4 ; encoding: [0x04,0x00,0x98,0xda,0x01,0x02,0x04,0x00]
+0x04,0x00,0x98,0xda,0x01,0x02,0x04,0x00
+
+# GFX90A: ds_mskor_b64 v1, a[2:3], a[4:5] offset:65535 gds ; encoding: [0xff,0xff,0x99,0xda,0x01,0x02,0x04,0x00]
+0xff,0xff,0x99,0xda,0x01,0x02,0x04,0x00
+
+# GFX90A: ds_write_b64 v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0x9a,0xda,0x01,0x02,0x00,0x00]
+0xff,0xff,0x9a,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_write_b64 v255, a[2:3] offset:65535 ; encoding: [0xff,0xff,0x9a,0xda,0xff,0x02,0x00,0x00]
+0xff,0xff,0x9a,0xda,0xff,0x02,0x00,0x00
+
+# GFX90A: ds_write_b64 v1, a[254:255] offset:65535 ; encoding: [0xff,0xff,0x9a,0xda,0x01,0xfe,0x00,0x00]
+0xff,0xff,0x9a,0xda,0x01,0xfe,0x00,0x00
+
+# GFX90A: ds_write_b64 v1, a[2:3]         ; encoding: [0x00,0x00,0x9a,0xda,0x01,0x02,0x00,0x00]
+0x00,0x00,0x9a,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_write_b64 v1, a[2:3]         ; encoding: [0x00,0x00,0x9a,0xda,0x01,0x02,0x00,0x00]
+0x00,0x00,0x9a,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_write_b64 v1, a[2:3] offset:4 ; encoding: [0x04,0x00,0x9a,0xda,0x01,0x02,0x00,0x00]
+0x04,0x00,0x9a,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_write_b64 v1, a[2:3] offset:65535 gds ; encoding: [0xff,0xff,0x9b,0xda,0x01,0x02,0x00,0x00]
+0xff,0xff,0x9b,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_write2_b64 v1, a[2:3], a[4:5] offset0:127 offset1:255 ; encoding: [0x7f,0xff,0x9c,0xda,0x01,0x02,0x04,0x00]
+0x7f,0xff,0x9c,0xda,0x01,0x02,0x04,0x00
+
+# GFX90A: ds_write2_b64 v255, a[2:3], a[4:5] offset0:127 offset1:255 ; encoding: [0x7f,0xff,0x9c,0xda,0xff,0x02,0x04,0x00]
+0x7f,0xff,0x9c,0xda,0xff,0x02,0x04,0x00
+
+# GFX90A: ds_write2_b64 v1, a[254:255], a[4:5] offset0:127 offset1:255 ; encoding: [0x7f,0xff,0x9c,0xda,0x01,0xfe,0x04,0x00]
+0x7f,0xff,0x9c,0xda,0x01,0xfe,0x04,0x00
+
+# GFX90A: ds_write2_b64 v1, a[2:3], a[254:255] offset0:127 offset1:255 ; encoding: [0x7f,0xff,0x9c,0xda,0x01,0x02,0xfe,0x00]
+0x7f,0xff,0x9c,0xda,0x01,0x02,0xfe,0x00
+
+# GFX90A: ds_write2_b64 v1, a[2:3], a[4:5] offset1:255 ; encoding: [0x00,0xff,0x9c,0xda,0x01,0x02,0x04,0x00]
+0x00,0xff,0x9c,0xda,0x01,0x02,0x04,0x00
+
+# GFX90A: ds_write2_b64 v1, a[2:3], a[4:5] offset1:255 ; encoding: [0x00,0xff,0x9c,0xda,0x01,0x02,0x04,0x00]
+0x00,0xff,0x9c,0xda,0x01,0x02,0x04,0x00
+
+# GFX90A: ds_write2_b64 v1, a[2:3], a[4:5] offset0:16 offset1:255 ; encoding: [0x10,0xff,0x9c,0xda,0x01,0x02,0x04,0x00]
+0x10,0xff,0x9c,0xda,0x01,0x02,0x04,0x00
+
+# GFX90A: ds_write2_b64 v1, a[2:3], a[4:5] offset0:127 ; encoding: [0x7f,0x00,0x9c,0xda,0x01,0x02,0x04,0x00]
+0x7f,0x00,0x9c,0xda,0x01,0x02,0x04,0x00
+
+# GFX90A: ds_write2_b64 v1, a[2:3], a[4:5] offset0:127 ; encoding: [0x7f,0x00,0x9c,0xda,0x01,0x02,0x04,0x00]
+0x7f,0x00,0x9c,0xda,0x01,0x02,0x04,0x00
+
+# GFX90A: ds_write2_b64 v1, a[2:3], a[4:5] offset0:127 offset1:1 ; encoding: [0x7f,0x01,0x9c,0xda,0x01,0x02,0x04,0x00]
+0x7f,0x01,0x9c,0xda,0x01,0x02,0x04,0x00
+
+# GFX90A: ds_write2_b64 v1, a[2:3], a[4:5] offset0:127 offset1:255 gds ; encoding: [0x7f,0xff,0x9d,0xda,0x01,0x02,0x04,0x00]
+0x7f,0xff,0x9d,0xda,0x01,0x02,0x04,0x00
+
+# GFX90A: ds_write2st64_b64 v1, a[2:3], a[4:5] offset0:127 offset1:255 ; encoding: [0x7f,0xff,0x9e,0xda,0x01,0x02,0x04,0x00]
+0x7f,0xff,0x9e,0xda,0x01,0x02,0x04,0x00
+
+# GFX90A: ds_write2st64_b64 v255, a[2:3], a[4:5] offset0:127 offset1:255 ; encoding: [0x7f,0xff,0x9e,0xda,0xff,0x02,0x04,0x00]
+0x7f,0xff,0x9e,0xda,0xff,0x02,0x04,0x00
+
+# GFX90A: ds_write2st64_b64 v1, a[254:255], a[4:5] offset0:127 offset1:255 ; encoding: [0x7f,0xff,0x9e,0xda,0x01,0xfe,0x04,0x00]
+0x7f,0xff,0x9e,0xda,0x01,0xfe,0x04,0x00
+
+# GFX90A: ds_write2st64_b64 v1, a[2:3], a[254:255] offset0:127 offset1:255 ; encoding: [0x7f,0xff,0x9e,0xda,0x01,0x02,0xfe,0x00]
+0x7f,0xff,0x9e,0xda,0x01,0x02,0xfe,0x00
+
+# GFX90A: ds_write2st64_b64 v1, a[2:3], a[4:5] offset1:255 ; encoding: [0x00,0xff,0x9e,0xda,0x01,0x02,0x04,0x00]
+0x00,0xff,0x9e,0xda,0x01,0x02,0x04,0x00
+
+# GFX90A: ds_write2st64_b64 v1, a[2:3], a[4:5] offset1:255 ; encoding: [0x00,0xff,0x9e,0xda,0x01,0x02,0x04,0x00]
+0x00,0xff,0x9e,0xda,0x01,0x02,0x04,0x00
+
+# GFX90A: ds_write2st64_b64 v1, a[2:3], a[4:5] offset0:16 offset1:255 ; encoding: [0x10,0xff,0x9e,0xda,0x01,0x02,0x04,0x00]
+0x10,0xff,0x9e,0xda,0x01,0x02,0x04,0x00
+
+# GFX90A: ds_write2st64_b64 v1, a[2:3], a[4:5] offset0:127 ; encoding: [0x7f,0x00,0x9e,0xda,0x01,0x02,0x04,0x00]
+0x7f,0x00,0x9e,0xda,0x01,0x02,0x04,0x00
+
+# GFX90A: ds_write2st64_b64 v1, a[2:3], a[4:5] offset0:127 ; encoding: [0x7f,0x00,0x9e,0xda,0x01,0x02,0x04,0x00]
+0x7f,0x00,0x9e,0xda,0x01,0x02,0x04,0x00
+
+# GFX90A: ds_write2st64_b64 v1, a[2:3], a[4:5] offset0:127 offset1:1 ; encoding: [0x7f,0x01,0x9e,0xda,0x01,0x02,0x04,0x00]
+0x7f,0x01,0x9e,0xda,0x01,0x02,0x04,0x00
+
+# GFX90A: ds_write2st64_b64 v1, a[2:3], a[4:5] offset0:127 offset1:255 gds ; encoding: [0x7f,0xff,0x9f,0xda,0x01,0x02,0x04,0x00]
+0x7f,0xff,0x9f,0xda,0x01,0x02,0x04,0x00
+
+# GFX90A: ds_cmpst_b64 v1, a[2:3], a[4:5] offset:65535 ; encoding: [0xff,0xff,0xa0,0xda,0x01,0x02,0x04,0x00]
+0xff,0xff,0xa0,0xda,0x01,0x02,0x04,0x00
+
+# GFX90A: ds_cmpst_b64 v255, a[2:3], a[4:5] offset:65535 ; encoding: [0xff,0xff,0xa0,0xda,0xff,0x02,0x04,0x00]
+0xff,0xff,0xa0,0xda,0xff,0x02,0x04,0x00
+
+# GFX90A: ds_cmpst_b64 v1, a[254:255], a[4:5] offset:65535 ; encoding: [0xff,0xff,0xa0,0xda,0x01,0xfe,0x04,0x00]
+0xff,0xff,0xa0,0xda,0x01,0xfe,0x04,0x00
+
+# GFX90A: ds_cmpst_b64 v1, a[2:3], a[254:255] offset:65535 ; encoding: [0xff,0xff,0xa0,0xda,0x01,0x02,0xfe,0x00]
+0xff,0xff,0xa0,0xda,0x01,0x02,0xfe,0x00
+
+# GFX90A: ds_cmpst_b64 v1, a[2:3], a[4:5] ; encoding: [0x00,0x00,0xa0,0xda,0x01,0x02,0x04,0x00]
+0x00,0x00,0xa0,0xda,0x01,0x02,0x04,0x00
+
+# GFX90A: ds_cmpst_b64 v1, a[2:3], a[4:5] ; encoding: [0x00,0x00,0xa0,0xda,0x01,0x02,0x04,0x00]
+0x00,0x00,0xa0,0xda,0x01,0x02,0x04,0x00
+
+# GFX90A: ds_cmpst_b64 v1, a[2:3], a[4:5] offset:4 ; encoding: [0x04,0x00,0xa0,0xda,0x01,0x02,0x04,0x00]
+0x04,0x00,0xa0,0xda,0x01,0x02,0x04,0x00
+
+# GFX90A: ds_cmpst_b64 v1, a[2:3], a[4:5] offset:65535 gds ; encoding: [0xff,0xff,0xa1,0xda,0x01,0x02,0x04,0x00]
+0xff,0xff,0xa1,0xda,0x01,0x02,0x04,0x00
+
+# GFX90A: ds_cmpst_f64 v1, a[2:3], a[4:5] offset:65535 ; encoding: [0xff,0xff,0xa2,0xda,0x01,0x02,0x04,0x00]
+0xff,0xff,0xa2,0xda,0x01,0x02,0x04,0x00
+
+# GFX90A: ds_cmpst_f64 v255, a[2:3], a[4:5] offset:65535 ; encoding: [0xff,0xff,0xa2,0xda,0xff,0x02,0x04,0x00]
+0xff,0xff,0xa2,0xda,0xff,0x02,0x04,0x00
+
+# GFX90A: ds_cmpst_f64 v1, a[254:255], a[4:5] offset:65535 ; encoding: [0xff,0xff,0xa2,0xda,0x01,0xfe,0x04,0x00]
+0xff,0xff,0xa2,0xda,0x01,0xfe,0x04,0x00
+
+# GFX90A: ds_cmpst_f64 v1, a[2:3], a[254:255] offset:65535 ; encoding: [0xff,0xff,0xa2,0xda,0x01,0x02,0xfe,0x00]
+0xff,0xff,0xa2,0xda,0x01,0x02,0xfe,0x00
+
+# GFX90A: ds_cmpst_f64 v1, a[2:3], a[4:5] ; encoding: [0x00,0x00,0xa2,0xda,0x01,0x02,0x04,0x00]
+0x00,0x00,0xa2,0xda,0x01,0x02,0x04,0x00
+
+# GFX90A: ds_cmpst_f64 v1, a[2:3], a[4:5] ; encoding: [0x00,0x00,0xa2,0xda,0x01,0x02,0x04,0x00]
+0x00,0x00,0xa2,0xda,0x01,0x02,0x04,0x00
+
+# GFX90A: ds_cmpst_f64 v1, a[2:3], a[4:5] offset:4 ; encoding: [0x04,0x00,0xa2,0xda,0x01,0x02,0x04,0x00]
+0x04,0x00,0xa2,0xda,0x01,0x02,0x04,0x00
+
+# GFX90A: ds_cmpst_f64 v1, a[2:3], a[4:5] offset:65535 gds ; encoding: [0xff,0xff,0xa3,0xda,0x01,0x02,0x04,0x00]
+0xff,0xff,0xa3,0xda,0x01,0x02,0x04,0x00
+
+# GFX90A: ds_min_f64 v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xa4,0xda,0x01,0x02,0x00,0x00]
+0xff,0xff,0xa4,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_min_f64 v255, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xa4,0xda,0xff,0x02,0x00,0x00]
+0xff,0xff,0xa4,0xda,0xff,0x02,0x00,0x00
+
+# GFX90A: ds_min_f64 v1, a[254:255] offset:65535 ; encoding: [0xff,0xff,0xa4,0xda,0x01,0xfe,0x00,0x00]
+0xff,0xff,0xa4,0xda,0x01,0xfe,0x00,0x00
+
+# GFX90A: ds_min_f64 v1, a[2:3]           ; encoding: [0x00,0x00,0xa4,0xda,0x01,0x02,0x00,0x00]
+0x00,0x00,0xa4,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_min_f64 v1, a[2:3]           ; encoding: [0x00,0x00,0xa4,0xda,0x01,0x02,0x00,0x00]
+0x00,0x00,0xa4,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_min_f64 v1, a[2:3] offset:4  ; encoding: [0x04,0x00,0xa4,0xda,0x01,0x02,0x00,0x00]
+0x04,0x00,0xa4,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_min_f64 v1, a[2:3] offset:65535 gds ; encoding: [0xff,0xff,0xa5,0xda,0x01,0x02,0x00,0x00]
+0xff,0xff,0xa5,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_max_f64 v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xa6,0xda,0x01,0x02,0x00,0x00]
+0xff,0xff,0xa6,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_max_f64 v255, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xa6,0xda,0xff,0x02,0x00,0x00]
+0xff,0xff,0xa6,0xda,0xff,0x02,0x00,0x00
+
+# GFX90A: ds_max_f64 v1, a[254:255] offset:65535 ; encoding: [0xff,0xff,0xa6,0xda,0x01,0xfe,0x00,0x00]
+0xff,0xff,0xa6,0xda,0x01,0xfe,0x00,0x00
+
+# GFX90A: ds_max_f64 v1, a[2:3]           ; encoding: [0x00,0x00,0xa6,0xda,0x01,0x02,0x00,0x00]
+0x00,0x00,0xa6,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_max_f64 v1, a[2:3]           ; encoding: [0x00,0x00,0xa6,0xda,0x01,0x02,0x00,0x00]
+0x00,0x00,0xa6,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_max_f64 v1, a[2:3] offset:4  ; encoding: [0x04,0x00,0xa6,0xda,0x01,0x02,0x00,0x00]
+0x04,0x00,0xa6,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_max_f64 v1, a[2:3] offset:65535 gds ; encoding: [0xff,0xff,0xa7,0xda,0x01,0x02,0x00,0x00]
+0xff,0xff,0xa7,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_write_b8_d16_hi v1, a2 offset:65535 ; encoding: [0xff,0xff,0xa8,0xda,0x01,0x02,0x00,0x00]
+0xff,0xff,0xa8,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_write_b8_d16_hi v255, a2 offset:65535 ; encoding: [0xff,0xff,0xa8,0xda,0xff,0x02,0x00,0x00]
+0xff,0xff,0xa8,0xda,0xff,0x02,0x00,0x00
+
+# GFX90A: ds_write_b8_d16_hi v1, a255 offset:65535 ; encoding: [0xff,0xff,0xa8,0xda,0x01,0xff,0x00,0x00]
+0xff,0xff,0xa8,0xda,0x01,0xff,0x00,0x00
+
+# GFX90A: ds_write_b8_d16_hi v1, a2       ; encoding: [0x00,0x00,0xa8,0xda,0x01,0x02,0x00,0x00]
+0x00,0x00,0xa8,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_write_b8_d16_hi v1, a2       ; encoding: [0x00,0x00,0xa8,0xda,0x01,0x02,0x00,0x00]
+0x00,0x00,0xa8,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_write_b8_d16_hi v1, a2 offset:4 ; encoding: [0x04,0x00,0xa8,0xda,0x01,0x02,0x00,0x00]
+0x04,0x00,0xa8,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_write_b8_d16_hi v1, a2 offset:65535 gds ; encoding: [0xff,0xff,0xa9,0xda,0x01,0x02,0x00,0x00]
+0xff,0xff,0xa9,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_write_b16_d16_hi v1, a2 offset:65535 ; encoding: [0xff,0xff,0xaa,0xda,0x01,0x02,0x00,0x00]
+0xff,0xff,0xaa,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_write_b16_d16_hi v255, a2 offset:65535 ; encoding: [0xff,0xff,0xaa,0xda,0xff,0x02,0x00,0x00]
+0xff,0xff,0xaa,0xda,0xff,0x02,0x00,0x00
+
+# GFX90A: ds_write_b16_d16_hi v1, a255 offset:65535 ; encoding: [0xff,0xff,0xaa,0xda,0x01,0xff,0x00,0x00]
+0xff,0xff,0xaa,0xda,0x01,0xff,0x00,0x00
+
+# GFX90A: ds_write_b16_d16_hi v1, a2      ; encoding: [0x00,0x00,0xaa,0xda,0x01,0x02,0x00,0x00]
+0x00,0x00,0xaa,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_write_b16_d16_hi v1, a2      ; encoding: [0x00,0x00,0xaa,0xda,0x01,0x02,0x00,0x00]
+0x00,0x00,0xaa,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_write_b16_d16_hi v1, a2 offset:4 ; encoding: [0x04,0x00,0xaa,0xda,0x01,0x02,0x00,0x00]
+0x04,0x00,0xaa,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_write_b16_d16_hi v1, a2 offset:65535 gds ; encoding: [0xff,0xff,0xab,0xda,0x01,0x02,0x00,0x00]
+0xff,0xff,0xab,0xda,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_read_u8_d16 a5, v1 offset:65535 ; encoding: [0xff,0xff,0xac,0xda,0x01,0x00,0x00,0x05]
+0xff,0xff,0xac,0xda,0x01,0x00,0x00,0x05
+
+# GFX90A: ds_read_u8_d16 a255, v1 offset:65535 ; encoding: [0xff,0xff,0xac,0xda,0x01,0x00,0x00,0xff]
+0xff,0xff,0xac,0xda,0x01,0x00,0x00,0xff
+
+# GFX90A: ds_read_u8_d16 a5, v255 offset:65535 ; encoding: [0xff,0xff,0xac,0xda,0xff,0x00,0x00,0x05]
+0xff,0xff,0xac,0xda,0xff,0x00,0x00,0x05
+
+# GFX90A: ds_read_u8_d16 a5, v1           ; encoding: [0x00,0x00,0xac,0xda,0x01,0x00,0x00,0x05]
+0x00,0x00,0xac,0xda,0x01,0x00,0x00,0x05
+
+# GFX90A: ds_read_u8_d16 a5, v1           ; encoding: [0x00,0x00,0xac,0xda,0x01,0x00,0x00,0x05]
+0x00,0x00,0xac,0xda,0x01,0x00,0x00,0x05
+
+# GFX90A: ds_read_u8_d16 a5, v1 offset:4  ; encoding: [0x04,0x00,0xac,0xda,0x01,0x00,0x00,0x05]
+0x04,0x00,0xac,0xda,0x01,0x00,0x00,0x05
+
+# GFX90A: ds_read_u8_d16 a5, v1 offset:65535 gds ; encoding: [0xff,0xff,0xad,0xda,0x01,0x00,0x00,0x05]
+0xff,0xff,0xad,0xda,0x01,0x00,0x00,0x05
+
+# GFX90A: ds_read_u8_d16_hi a5, v1 offset:65535 ; encoding: [0xff,0xff,0xae,0xda,0x01,0x00,0x00,0x05]
+0xff,0xff,0xae,0xda,0x01,0x00,0x00,0x05
+
+# GFX90A: ds_read_u8_d16_hi a255, v1 offset:65535 ; encoding: [0xff,0xff,0xae,0xda,0x01,0x00,0x00,0xff]
+0xff,0xff,0xae,0xda,0x01,0x00,0x00,0xff
+
+# GFX90A: ds_read_u8_d16_hi a5, v255 offset:65535 ; encoding: [0xff,0xff,0xae,0xda,0xff,0x00,0x00,0x05]
+0xff,0xff,0xae,0xda,0xff,0x00,0x00,0x05
+
+# GFX90A: ds_read_u8_d16_hi a5, v1        ; encoding: [0x00,0x00,0xae,0xda,0x01,0x00,0x00,0x05]
+0x00,0x00,0xae,0xda,0x01,0x00,0x00,0x05
+
+# GFX90A: ds_read_u8_d16_hi a5, v1        ; encoding: [0x00,0x00,0xae,0xda,0x01,0x00,0x00,0x05]
+0x00,0x00,0xae,0xda,0x01,0x00,0x00,0x05
+
+# GFX90A: ds_read_u8_d16_hi a5, v1 offset:4 ; encoding: [0x04,0x00,0xae,0xda,0x01,0x00,0x00,0x05]
+0x04,0x00,0xae,0xda,0x01,0x00,0x00,0x05
+
+# GFX90A: ds_read_u8_d16_hi a5, v1 offset:65535 gds ; encoding: [0xff,0xff,0xaf,0xda,0x01,0x00,0x00,0x05]
+0xff,0xff,0xaf,0xda,0x01,0x00,0x00,0x05
+
+# GFX90A: ds_read_i8_d16 a5, v1 offset:65535 ; encoding: [0xff,0xff,0xb0,0xda,0x01,0x00,0x00,0x05]
+0xff,0xff,0xb0,0xda,0x01,0x00,0x00,0x05
+
+# GFX90A: ds_read_i8_d16 a255, v1 offset:65535 ; encoding: [0xff,0xff,0xb0,0xda,0x01,0x00,0x00,0xff]
+0xff,0xff,0xb0,0xda,0x01,0x00,0x00,0xff
+
+# GFX90A: ds_read_i8_d16 a5, v255 offset:65535 ; encoding: [0xff,0xff,0xb0,0xda,0xff,0x00,0x00,0x05]
+0xff,0xff,0xb0,0xda,0xff,0x00,0x00,0x05
+
+# GFX90A: ds_read_i8_d16 a5, v1           ; encoding: [0x00,0x00,0xb0,0xda,0x01,0x00,0x00,0x05]
+0x00,0x00,0xb0,0xda,0x01,0x00,0x00,0x05
+
+# GFX90A: ds_read_i8_d16 a5, v1           ; encoding: [0x00,0x00,0xb0,0xda,0x01,0x00,0x00,0x05]
+0x00,0x00,0xb0,0xda,0x01,0x00,0x00,0x05
+
+# GFX90A: ds_read_i8_d16 a5, v1 offset:4  ; encoding: [0x04,0x00,0xb0,0xda,0x01,0x00,0x00,0x05]
+0x04,0x00,0xb0,0xda,0x01,0x00,0x00,0x05
+
+# GFX90A: ds_read_i8_d16 a5, v1 offset:65535 gds ; encoding: [0xff,0xff,0xb1,0xda,0x01,0x00,0x00,0x05]
+0xff,0xff,0xb1,0xda,0x01,0x00,0x00,0x05
+
+# GFX90A: ds_read_i8_d16_hi a5, v1 offset:65535 ; encoding: [0xff,0xff,0xb2,0xda,0x01,0x00,0x00,0x05]
+0xff,0xff,0xb2,0xda,0x01,0x00,0x00,0x05
+
+# GFX90A: ds_read_i8_d16_hi a255, v1 offset:65535 ; encoding: [0xff,0xff,0xb2,0xda,0x01,0x00,0x00,0xff]
+0xff,0xff,0xb2,0xda,0x01,0x00,0x00,0xff
+
+# GFX90A: ds_read_i8_d16_hi a5, v255 offset:65535 ; encoding: [0xff,0xff,0xb2,0xda,0xff,0x00,0x00,0x05]
+0xff,0xff,0xb2,0xda,0xff,0x00,0x00,0x05
+
+# GFX90A: ds_read_i8_d16_hi a5, v1        ; encoding: [0x00,0x00,0xb2,0xda,0x01,0x00,0x00,0x05]
+0x00,0x00,0xb2,0xda,0x01,0x00,0x00,0x05
+
+# GFX90A: ds_read_i8_d16_hi a5, v1        ; encoding: [0x00,0x00,0xb2,0xda,0x01,0x00,0x00,0x05]
+0x00,0x00,0xb2,0xda,0x01,0x00,0x00,0x05
+
+# GFX90A: ds_read_i8_d16_hi a5, v1 offset:4 ; encoding: [0x04,0x00,0xb2,0xda,0x01,0x00,0x00,0x05]
+0x04,0x00,0xb2,0xda,0x01,0x00,0x00,0x05
+
+# GFX90A: ds_read_i8_d16_hi a5, v1 offset:65535 gds ; encoding: [0xff,0xff,0xb3,0xda,0x01,0x00,0x00,0x05]
+0xff,0xff,0xb3,0xda,0x01,0x00,0x00,0x05
+
+# GFX90A: ds_read_u16_d16 a5, v1 offset:65535 ; encoding: [0xff,0xff,0xb4,0xda,0x01,0x00,0x00,0x05]
+0xff,0xff,0xb4,0xda,0x01,0x00,0x00,0x05
+
+# GFX90A: ds_read_u16_d16 a255, v1 offset:65535 ; encoding: [0xff,0xff,0xb4,0xda,0x01,0x00,0x00,0xff]
+0xff,0xff,0xb4,0xda,0x01,0x00,0x00,0xff
+
+# GFX90A: ds_read_u16_d16 a5, v255 offset:65535 ; encoding: [0xff,0xff,0xb4,0xda,0xff,0x00,0x00,0x05]
+0xff,0xff,0xb4,0xda,0xff,0x00,0x00,0x05
+
+# GFX90A: ds_read_u16_d16 a5, v1          ; encoding: [0x00,0x00,0xb4,0xda,0x01,0x00,0x00,0x05]
+0x00,0x00,0xb4,0xda,0x01,0x00,0x00,0x05
+
+# GFX90A: ds_read_u16_d16 a5, v1          ; encoding: [0x00,0x00,0xb4,0xda,0x01,0x00,0x00,0x05]
+0x00,0x00,0xb4,0xda,0x01,0x00,0x00,0x05
+
+# GFX90A: ds_read_u16_d16 a5, v1 offset:4 ; encoding: [0x04,0x00,0xb4,0xda,0x01,0x00,0x00,0x05]
+0x04,0x00,0xb4,0xda,0x01,0x00,0x00,0x05
+
+# GFX90A: ds_read_u16_d16 a5, v1 offset:65535 gds ; encoding: [0xff,0xff,0xb5,0xda,0x01,0x00,0x00,0x05]
+0xff,0xff,0xb5,0xda,0x01,0x00,0x00,0x05
+
+# GFX90A: ds_read_u16_d16_hi a5, v1 offset:65535 ; encoding: [0xff,0xff,0xb6,0xda,0x01,0x00,0x00,0x05]
+0xff,0xff,0xb6,0xda,0x01,0x00,0x00,0x05
+
+# GFX90A: ds_read_u16_d16_hi a255, v1 offset:65535 ; encoding: [0xff,0xff,0xb6,0xda,0x01,0x00,0x00,0xff]
+0xff,0xff,0xb6,0xda,0x01,0x00,0x00,0xff
+
+# GFX90A: ds_read_u16_d16_hi a5, v255 offset:65535 ; encoding: [0xff,0xff,0xb6,0xda,0xff,0x00,0x00,0x05]
+0xff,0xff,0xb6,0xda,0xff,0x00,0x00,0x05
+
+# GFX90A: ds_read_u16_d16_hi a5, v1       ; encoding: [0x00,0x00,0xb6,0xda,0x01,0x00,0x00,0x05]
+0x00,0x00,0xb6,0xda,0x01,0x00,0x00,0x05
+
+# GFX90A: ds_read_u16_d16_hi a5, v1       ; encoding: [0x00,0x00,0xb6,0xda,0x01,0x00,0x00,0x05]
+0x00,0x00,0xb6,0xda,0x01,0x00,0x00,0x05
+
+# GFX90A: ds_read_u16_d16_hi a5, v1 offset:4 ; encoding: [0x04,0x00,0xb6,0xda,0x01,0x00,0x00,0x05]
+0x04,0x00,0xb6,0xda,0x01,0x00,0x00,0x05
+
+# GFX90A: ds_read_u16_d16_hi a5, v1 offset:65535 gds ; encoding: [0xff,0xff,0xb7,0xda,0x01,0x00,0x00,0x05]
+0xff,0xff,0xb7,0xda,0x01,0x00,0x00,0x05
+
+# GFX90A: ds_add_rtn_u64 a[6:7], v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xc0,0xda,0x01,0x02,0x00,0x06]
+0xff,0xff,0xc0,0xda,0x01,0x02,0x00,0x06
+
+# GFX90A: ds_add_rtn_u64 a[254:255], v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xc0,0xda,0x01,0x02,0x00,0xfe]
+0xff,0xff,0xc0,0xda,0x01,0x02,0x00,0xfe
+
+# GFX90A: ds_add_rtn_u64 a[6:7], v255, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xc0,0xda,0xff,0x02,0x00,0x06]
+0xff,0xff,0xc0,0xda,0xff,0x02,0x00,0x06
+
+# GFX90A: ds_add_rtn_u64 a[6:7], v1, a[254:255] offset:65535 ; encoding: [0xff,0xff,0xc0,0xda,0x01,0xfe,0x00,0x06]
+0xff,0xff,0xc0,0xda,0x01,0xfe,0x00,0x06
+
+# GFX90A: ds_add_rtn_u64 a[6:7], v1, a[2:3] ; encoding: [0x00,0x00,0xc0,0xda,0x01,0x02,0x00,0x06]
+0x00,0x00,0xc0,0xda,0x01,0x02,0x00,0x06
+
+# GFX90A: ds_add_rtn_u64 a[6:7], v1, a[2:3] ; encoding: [0x00,0x00,0xc0,0xda,0x01,0x02,0x00,0x06]
+0x00,0x00,0xc0,0xda,0x01,0x02,0x00,0x06
+
+# GFX90A: ds_add_rtn_u64 a[6:7], v1, a[2:3] offset:4 ; encoding: [0x04,0x00,0xc0,0xda,0x01,0x02,0x00,0x06]
+0x04,0x00,0xc0,0xda,0x01,0x02,0x00,0x06
+
+# GFX90A: ds_add_rtn_u64 a[6:7], v1, a[2:3] offset:65535 gds ; encoding: [0xff,0xff,0xc1,0xda,0x01,0x02,0x00,0x06]
+0xff,0xff,0xc1,0xda,0x01,0x02,0x00,0x06
+
+# GFX90A: ds_sub_rtn_u64 a[6:7], v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xc2,0xda,0x01,0x02,0x00,0x06]
+0xff,0xff,0xc2,0xda,0x01,0x02,0x00,0x06
+
+# GFX90A: ds_sub_rtn_u64 a[254:255], v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xc2,0xda,0x01,0x02,0x00,0xfe]
+0xff,0xff,0xc2,0xda,0x01,0x02,0x00,0xfe
+
+# GFX90A: ds_sub_rtn_u64 a[6:7], v255, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xc2,0xda,0xff,0x02,0x00,0x06]
+0xff,0xff,0xc2,0xda,0xff,0x02,0x00,0x06
+
+# GFX90A: ds_sub_rtn_u64 a[6:7], v1, a[254:255] offset:65535 ; encoding: [0xff,0xff,0xc2,0xda,0x01,0xfe,0x00,0x06]
+0xff,0xff,0xc2,0xda,0x01,0xfe,0x00,0x06
+
+# GFX90A: ds_sub_rtn_u64 a[6:7], v1, a[2:3] ; encoding: [0x00,0x00,0xc2,0xda,0x01,0x02,0x00,0x06]
+0x00,0x00,0xc2,0xda,0x01,0x02,0x00,0x06
+
+# GFX90A: ds_sub_rtn_u64 a[6:7], v1, a[2:3] ; encoding: [0x00,0x00,0xc2,0xda,0x01,0x02,0x00,0x06]
+0x00,0x00,0xc2,0xda,0x01,0x02,0x00,0x06
+
+# GFX90A: ds_sub_rtn_u64 a[6:7], v1, a[2:3] offset:4 ; encoding: [0x04,0x00,0xc2,0xda,0x01,0x02,0x00,0x06]
+0x04,0x00,0xc2,0xda,0x01,0x02,0x00,0x06
+
+# GFX90A: ds_sub_rtn_u64 a[6:7], v1, a[2:3] offset:65535 gds ; encoding: [0xff,0xff,0xc3,0xda,0x01,0x02,0x00,0x06]
+0xff,0xff,0xc3,0xda,0x01,0x02,0x00,0x06
+
+# GFX90A: ds_rsub_rtn_u64 a[6:7], v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xc4,0xda,0x01,0x02,0x00,0x06]
+0xff,0xff,0xc4,0xda,0x01,0x02,0x00,0x06
+
+# GFX90A: ds_rsub_rtn_u64 a[254:255], v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xc4,0xda,0x01,0x02,0x00,0xfe]
+0xff,0xff,0xc4,0xda,0x01,0x02,0x00,0xfe
+
+# GFX90A: ds_rsub_rtn_u64 a[6:7], v255, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xc4,0xda,0xff,0x02,0x00,0x06]
+0xff,0xff,0xc4,0xda,0xff,0x02,0x00,0x06
+
+# GFX90A: ds_rsub_rtn_u64 a[6:7], v1, a[254:255] offset:65535 ; encoding: [0xff,0xff,0xc4,0xda,0x01,0xfe,0x00,0x06]
+0xff,0xff,0xc4,0xda,0x01,0xfe,0x00,0x06
+
+# GFX90A: ds_rsub_rtn_u64 a[6:7], v1, a[2:3] ; encoding: [0x00,0x00,0xc4,0xda,0x01,0x02,0x00,0x06]
+0x00,0x00,0xc4,0xda,0x01,0x02,0x00,0x06
+
+# GFX90A: ds_rsub_rtn_u64 a[6:7], v1, a[2:3] ; encoding: [0x00,0x00,0xc4,0xda,0x01,0x02,0x00,0x06]
+0x00,0x00,0xc4,0xda,0x01,0x02,0x00,0x06
+
+# GFX90A: ds_rsub_rtn_u64 a[6:7], v1, a[2:3] offset:4 ; encoding: [0x04,0x00,0xc4,0xda,0x01,0x02,0x00,0x06]
+0x04,0x00,0xc4,0xda,0x01,0x02,0x00,0x06
+
+# GFX90A: ds_rsub_rtn_u64 a[6:7], v1, a[2:3] offset:65535 gds ; encoding: [0xff,0xff,0xc5,0xda,0x01,0x02,0x00,0x06]
+0xff,0xff,0xc5,0xda,0x01,0x02,0x00,0x06
+
+# GFX90A: ds_inc_rtn_u64 a[6:7], v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xc6,0xda,0x01,0x02,0x00,0x06]
+0xff,0xff,0xc6,0xda,0x01,0x02,0x00,0x06
+
+# GFX90A: ds_inc_rtn_u64 a[254:255], v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xc6,0xda,0x01,0x02,0x00,0xfe]
+0xff,0xff,0xc6,0xda,0x01,0x02,0x00,0xfe
+
+# GFX90A: ds_inc_rtn_u64 a[6:7], v255, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xc6,0xda,0xff,0x02,0x00,0x06]
+0xff,0xff,0xc6,0xda,0xff,0x02,0x00,0x06
+
+# GFX90A: ds_inc_rtn_u64 a[6:7], v1, a[254:255] offset:65535 ; encoding: [0xff,0xff,0xc6,0xda,0x01,0xfe,0x00,0x06]
+0xff,0xff,0xc6,0xda,0x01,0xfe,0x00,0x06
+
+# GFX90A: ds_inc_rtn_u64 a[6:7], v1, a[2:3] ; encoding: [0x00,0x00,0xc6,0xda,0x01,0x02,0x00,0x06]
+0x00,0x00,0xc6,0xda,0x01,0x02,0x00,0x06
+
+# GFX90A: ds_inc_rtn_u64 a[6:7], v1, a[2:3] ; encoding: [0x00,0x00,0xc6,0xda,0x01,0x02,0x00,0x06]
+0x00,0x00,0xc6,0xda,0x01,0x02,0x00,0x06
+
+# GFX90A: ds_inc_rtn_u64 a[6:7], v1, a[2:3] offset:4 ; encoding: [0x04,0x00,0xc6,0xda,0x01,0x02,0x00,0x06]
+0x04,0x00,0xc6,0xda,0x01,0x02,0x00,0x06
+
+# GFX90A: ds_inc_rtn_u64 a[6:7], v1, a[2:3] offset:65535 gds ; encoding: [0xff,0xff,0xc7,0xda,0x01,0x02,0x00,0x06]
+0xff,0xff,0xc7,0xda,0x01,0x02,0x00,0x06
+
+# GFX90A: ds_dec_rtn_u64 a[6:7], v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xc8,0xda,0x01,0x02,0x00,0x06]
+0xff,0xff,0xc8,0xda,0x01,0x02,0x00,0x06
+
+# GFX90A: ds_dec_rtn_u64 a[254:255], v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xc8,0xda,0x01,0x02,0x00,0xfe]
+0xff,0xff,0xc8,0xda,0x01,0x02,0x00,0xfe
+
+# GFX90A: ds_dec_rtn_u64 a[6:7], v255, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xc8,0xda,0xff,0x02,0x00,0x06]
+0xff,0xff,0xc8,0xda,0xff,0x02,0x00,0x06
+
+# GFX90A: ds_dec_rtn_u64 a[6:7], v1, a[254:255] offset:65535 ; encoding: [0xff,0xff,0xc8,0xda,0x01,0xfe,0x00,0x06]
+0xff,0xff,0xc8,0xda,0x01,0xfe,0x00,0x06
+
+# GFX90A: ds_dec_rtn_u64 a[6:7], v1, a[2:3] ; encoding: [0x00,0x00,0xc8,0xda,0x01,0x02,0x00,0x06]
+0x00,0x00,0xc8,0xda,0x01,0x02,0x00,0x06
+
+# GFX90A: ds_dec_rtn_u64 a[6:7], v1, a[2:3] ; encoding: [0x00,0x00,0xc8,0xda,0x01,0x02,0x00,0x06]
+0x00,0x00,0xc8,0xda,0x01,0x02,0x00,0x06
+
+# GFX90A: ds_dec_rtn_u64 a[6:7], v1, a[2:3] offset:4 ; encoding: [0x04,0x00,0xc8,0xda,0x01,0x02,0x00,0x06]
+0x04,0x00,0xc8,0xda,0x01,0x02,0x00,0x06
+
+# GFX90A: ds_dec_rtn_u64 a[6:7], v1, a[2:3] offset:65535 gds ; encoding: [0xff,0xff,0xc9,0xda,0x01,0x02,0x00,0x06]
+0xff,0xff,0xc9,0xda,0x01,0x02,0x00,0x06
+
+# GFX90A: ds_min_rtn_i64 a[6:7], v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xca,0xda,0x01,0x02,0x00,0x06]
+0xff,0xff,0xca,0xda,0x01,0x02,0x00,0x06
+
+# GFX90A: ds_min_rtn_i64 a[254:255], v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xca,0xda,0x01,0x02,0x00,0xfe]
+0xff,0xff,0xca,0xda,0x01,0x02,0x00,0xfe
+
+# GFX90A: ds_min_rtn_i64 a[6:7], v255, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xca,0xda,0xff,0x02,0x00,0x06]
+0xff,0xff,0xca,0xda,0xff,0x02,0x00,0x06
+
+# GFX90A: ds_min_rtn_i64 a[6:7], v1, a[254:255] offset:65535 ; encoding: [0xff,0xff,0xca,0xda,0x01,0xfe,0x00,0x06]
+0xff,0xff,0xca,0xda,0x01,0xfe,0x00,0x06
+
+# GFX90A: ds_min_rtn_i64 a[6:7], v1, a[2:3] ; encoding: [0x00,0x00,0xca,0xda,0x01,0x02,0x00,0x06]
+0x00,0x00,0xca,0xda,0x01,0x02,0x00,0x06
+
+# GFX90A: ds_min_rtn_i64 a[6:7], v1, a[2:3] ; encoding: [0x00,0x00,0xca,0xda,0x01,0x02,0x00,0x06]
+0x00,0x00,0xca,0xda,0x01,0x02,0x00,0x06
+
+# GFX90A: ds_min_rtn_i64 a[6:7], v1, a[2:3] offset:4 ; encoding: [0x04,0x00,0xca,0xda,0x01,0x02,0x00,0x06]
+0x04,0x00,0xca,0xda,0x01,0x02,0x00,0x06
+
+# GFX90A: ds_min_rtn_i64 a[6:7], v1, a[2:3] offset:65535 gds ; encoding: [0xff,0xff,0xcb,0xda,0x01,0x02,0x00,0x06]
+0xff,0xff,0xcb,0xda,0x01,0x02,0x00,0x06
+
+# GFX90A: ds_max_rtn_i64 a[6:7], v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xcc,0xda,0x01,0x02,0x00,0x06]
+0xff,0xff,0xcc,0xda,0x01,0x02,0x00,0x06
+
+# GFX90A: ds_max_rtn_i64 a[254:255], v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xcc,0xda,0x01,0x02,0x00,0xfe]
+0xff,0xff,0xcc,0xda,0x01,0x02,0x00,0xfe
+
+# GFX90A: ds_max_rtn_i64 a[6:7], v255, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xcc,0xda,0xff,0x02,0x00,0x06]
+0xff,0xff,0xcc,0xda,0xff,0x02,0x00,0x06
+
+# GFX90A: ds_max_rtn_i64 a[6:7], v1, a[254:255] offset:65535 ; encoding: [0xff,0xff,0xcc,0xda,0x01,0xfe,0x00,0x06]
+0xff,0xff,0xcc,0xda,0x01,0xfe,0x00,0x06
+
+# GFX90A: ds_max_rtn_i64 a[6:7], v1, a[2:3] ; encoding: [0x00,0x00,0xcc,0xda,0x01,0x02,0x00,0x06]
+0x00,0x00,0xcc,0xda,0x01,0x02,0x00,0x06
+
+# GFX90A: ds_max_rtn_i64 a[6:7], v1, a[2:3] ; encoding: [0x00,0x00,0xcc,0xda,0x01,0x02,0x00,0x06]
+0x00,0x00,0xcc,0xda,0x01,0x02,0x00,0x06
+
+# GFX90A: ds_max_rtn_i64 a[6:7], v1, a[2:3] offset:4 ; encoding: [0x04,0x00,0xcc,0xda,0x01,0x02,0x00,0x06]
+0x04,0x00,0xcc,0xda,0x01,0x02,0x00,0x06
+
+# GFX90A: ds_max_rtn_i64 a[6:7], v1, a[2:3] offset:65535 gds ; encoding: [0xff,0xff,0xcd,0xda,0x01,0x02,0x00,0x06]
+0xff,0xff,0xcd,0xda,0x01,0x02,0x00,0x06
+
+# GFX90A: ds_min_rtn_u64 a[6:7], v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xce,0xda,0x01,0x02,0x00,0x06]
+0xff,0xff,0xce,0xda,0x01,0x02,0x00,0x06
+
+# GFX90A: ds_min_rtn_u64 a[254:255], v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xce,0xda,0x01,0x02,0x00,0xfe]
+0xff,0xff,0xce,0xda,0x01,0x02,0x00,0xfe
+
+# GFX90A: ds_min_rtn_u64 a[6:7], v255, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xce,0xda,0xff,0x02,0x00,0x06]
+0xff,0xff,0xce,0xda,0xff,0x02,0x00,0x06
+
+# GFX90A: ds_min_rtn_u64 a[6:7], v1, a[254:255] offset:65535 ; encoding: [0xff,0xff,0xce,0xda,0x01,0xfe,0x00,0x06]
+0xff,0xff,0xce,0xda,0x01,0xfe,0x00,0x06
+
+# GFX90A: ds_min_rtn_u64 a[6:7], v1, a[2:3] ; encoding: [0x00,0x00,0xce,0xda,0x01,0x02,0x00,0x06]
+0x00,0x00,0xce,0xda,0x01,0x02,0x00,0x06
+
+# GFX90A: ds_min_rtn_u64 a[6:7], v1, a[2:3] ; encoding: [0x00,0x00,0xce,0xda,0x01,0x02,0x00,0x06]
+0x00,0x00,0xce,0xda,0x01,0x02,0x00,0x06
+
+# GFX90A: ds_min_rtn_u64 a[6:7], v1, a[2:3] offset:4 ; encoding: [0x04,0x00,0xce,0xda,0x01,0x02,0x00,0x06]
+0x04,0x00,0xce,0xda,0x01,0x02,0x00,0x06
+
+# GFX90A: ds_min_rtn_u64 a[6:7], v1, a[2:3] offset:65535 gds ; encoding: [0xff,0xff,0xcf,0xda,0x01,0x02,0x00,0x06]
+0xff,0xff,0xcf,0xda,0x01,0x02,0x00,0x06
+
+# GFX90A: ds_max_rtn_u64 a[6:7], v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xd0,0xda,0x01,0x02,0x00,0x06]
+0xff,0xff,0xd0,0xda,0x01,0x02,0x00,0x06
+
+# GFX90A: ds_max_rtn_u64 a[254:255], v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xd0,0xda,0x01,0x02,0x00,0xfe]
+0xff,0xff,0xd0,0xda,0x01,0x02,0x00,0xfe
+
+# GFX90A: ds_max_rtn_u64 a[6:7], v255, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xd0,0xda,0xff,0x02,0x00,0x06]
+0xff,0xff,0xd0,0xda,0xff,0x02,0x00,0x06
+
+# GFX90A: ds_max_rtn_u64 a[6:7], v1, a[254:255] offset:65535 ; encoding: [0xff,0xff,0xd0,0xda,0x01,0xfe,0x00,0x06]
+0xff,0xff,0xd0,0xda,0x01,0xfe,0x00,0x06
+
+# GFX90A: ds_max_rtn_u64 a[6:7], v1, a[2:3] ; encoding: [0x00,0x00,0xd0,0xda,0x01,0x02,0x00,0x06]
+0x00,0x00,0xd0,0xda,0x01,0x02,0x00,0x06
+
+# GFX90A: ds_max_rtn_u64 a[6:7], v1, a[2:3] ; encoding: [0x00,0x00,0xd0,0xda,0x01,0x02,0x00,0x06]
+0x00,0x00,0xd0,0xda,0x01,0x02,0x00,0x06
+
+# GFX90A: ds_max_rtn_u64 a[6:7], v1, a[2:3] offset:4 ; encoding: [0x04,0x00,0xd0,0xda,0x01,0x02,0x00,0x06]
+0x04,0x00,0xd0,0xda,0x01,0x02,0x00,0x06
+
+# GFX90A: ds_max_rtn_u64 a[6:7], v1, a[2:3] offset:65535 gds ; encoding: [0xff,0xff,0xd1,0xda,0x01,0x02,0x00,0x06]
+0xff,0xff,0xd1,0xda,0x01,0x02,0x00,0x06
+
+# GFX90A: ds_and_rtn_b64 a[6:7], v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xd2,0xda,0x01,0x02,0x00,0x06]
+0xff,0xff,0xd2,0xda,0x01,0x02,0x00,0x06
+
+# GFX90A: ds_and_rtn_b64 a[254:255], v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xd2,0xda,0x01,0x02,0x00,0xfe]
+0xff,0xff,0xd2,0xda,0x01,0x02,0x00,0xfe
+
+# GFX90A: ds_and_rtn_b64 a[6:7], v255, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xd2,0xda,0xff,0x02,0x00,0x06]
+0xff,0xff,0xd2,0xda,0xff,0x02,0x00,0x06
+
+# GFX90A: ds_and_rtn_b64 a[6:7], v1, a[254:255] offset:65535 ; encoding: [0xff,0xff,0xd2,0xda,0x01,0xfe,0x00,0x06]
+0xff,0xff,0xd2,0xda,0x01,0xfe,0x00,0x06
+
+# GFX90A: ds_and_rtn_b64 a[6:7], v1, a[2:3] ; encoding: [0x00,0x00,0xd2,0xda,0x01,0x02,0x00,0x06]
+0x00,0x00,0xd2,0xda,0x01,0x02,0x00,0x06
+
+# GFX90A: ds_and_rtn_b64 a[6:7], v1, a[2:3] ; encoding: [0x00,0x00,0xd2,0xda,0x01,0x02,0x00,0x06]
+0x00,0x00,0xd2,0xda,0x01,0x02,0x00,0x06
+
+# GFX90A: ds_and_rtn_b64 a[6:7], v1, a[2:3] offset:4 ; encoding: [0x04,0x00,0xd2,0xda,0x01,0x02,0x00,0x06]
+0x04,0x00,0xd2,0xda,0x01,0x02,0x00,0x06
+
+# GFX90A: ds_and_rtn_b64 a[6:7], v1, a[2:3] offset:65535 gds ; encoding: [0xff,0xff,0xd3,0xda,0x01,0x02,0x00,0x06]
+0xff,0xff,0xd3,0xda,0x01,0x02,0x00,0x06
+
+# GFX90A: ds_or_rtn_b64 a[6:7], v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xd4,0xda,0x01,0x02,0x00,0x06]
+0xff,0xff,0xd4,0xda,0x01,0x02,0x00,0x06
+
+# GFX90A: ds_or_rtn_b64 a[254:255], v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xd4,0xda,0x01,0x02,0x00,0xfe]
+0xff,0xff,0xd4,0xda,0x01,0x02,0x00,0xfe
+
+# GFX90A: ds_or_rtn_b64 a[6:7], v255, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xd4,0xda,0xff,0x02,0x00,0x06]
+0xff,0xff,0xd4,0xda,0xff,0x02,0x00,0x06
+
+# GFX90A: ds_or_rtn_b64 a[6:7], v1, a[254:255] offset:65535 ; encoding: [0xff,0xff,0xd4,0xda,0x01,0xfe,0x00,0x06]
+0xff,0xff,0xd4,0xda,0x01,0xfe,0x00,0x06
+
+# GFX90A: ds_or_rtn_b64 a[6:7], v1, a[2:3] ; encoding: [0x00,0x00,0xd4,0xda,0x01,0x02,0x00,0x06]
+0x00,0x00,0xd4,0xda,0x01,0x02,0x00,0x06
+
+# GFX90A: ds_or_rtn_b64 a[6:7], v1, a[2:3] ; encoding: [0x00,0x00,0xd4,0xda,0x01,0x02,0x00,0x06]
+0x00,0x00,0xd4,0xda,0x01,0x02,0x00,0x06
+
+# GFX90A: ds_or_rtn_b64 a[6:7], v1, a[2:3] offset:4 ; encoding: [0x04,0x00,0xd4,0xda,0x01,0x02,0x00,0x06]
+0x04,0x00,0xd4,0xda,0x01,0x02,0x00,0x06
+
+# GFX90A: ds_or_rtn_b64 a[6:7], v1, a[2:3] offset:65535 gds ; encoding: [0xff,0xff,0xd5,0xda,0x01,0x02,0x00,0x06]
+0xff,0xff,0xd5,0xda,0x01,0x02,0x00,0x06
+
+# GFX90A: ds_xor_rtn_b64 a[6:7], v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xd6,0xda,0x01,0x02,0x00,0x06]
+0xff,0xff,0xd6,0xda,0x01,0x02,0x00,0x06
+
+# GFX90A: ds_xor_rtn_b64 a[254:255], v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xd6,0xda,0x01,0x02,0x00,0xfe]
+0xff,0xff,0xd6,0xda,0x01,0x02,0x00,0xfe
+
+# GFX90A: ds_xor_rtn_b64 a[6:7], v255, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xd6,0xda,0xff,0x02,0x00,0x06]
+0xff,0xff,0xd6,0xda,0xff,0x02,0x00,0x06
+
+# GFX90A: ds_xor_rtn_b64 a[6:7], v1, a[254:255] offset:65535 ; encoding: [0xff,0xff,0xd6,0xda,0x01,0xfe,0x00,0x06]
+0xff,0xff,0xd6,0xda,0x01,0xfe,0x00,0x06
+
+# GFX90A: ds_xor_rtn_b64 a[6:7], v1, a[2:3] ; encoding: [0x00,0x00,0xd6,0xda,0x01,0x02,0x00,0x06]
+0x00,0x00,0xd6,0xda,0x01,0x02,0x00,0x06
+
+# GFX90A: ds_xor_rtn_b64 a[6:7], v1, a[2:3] ; encoding: [0x00,0x00,0xd6,0xda,0x01,0x02,0x00,0x06]
+0x00,0x00,0xd6,0xda,0x01,0x02,0x00,0x06
+
+# GFX90A: ds_xor_rtn_b64 a[6:7], v1, a[2:3] offset:4 ; encoding: [0x04,0x00,0xd6,0xda,0x01,0x02,0x00,0x06]
+0x04,0x00,0xd6,0xda,0x01,0x02,0x00,0x06
+
+# GFX90A: ds_xor_rtn_b64 a[6:7], v1, a[2:3] offset:65535 gds ; encoding: [0xff,0xff,0xd7,0xda,0x01,0x02,0x00,0x06]
+0xff,0xff,0xd7,0xda,0x01,0x02,0x00,0x06
+
+# GFX90A: ds_mskor_rtn_b64 a[6:7], v1, a[2:3], a[4:5] offset:65535 ; encoding: [0xff,0xff,0xd8,0xda,0x01,0x02,0x04,0x06]
+0xff,0xff,0xd8,0xda,0x01,0x02,0x04,0x06
+
+# GFX90A: ds_mskor_rtn_b64 a[254:255], v1, a[2:3], a[4:5] offset:65535 ; encoding: [0xff,0xff,0xd8,0xda,0x01,0x02,0x04,0xfe]
+0xff,0xff,0xd8,0xda,0x01,0x02,0x04,0xfe
+
+# GFX90A: ds_mskor_rtn_b64 a[6:7], v255, a[2:3], a[4:5] offset:65535 ; encoding: [0xff,0xff,0xd8,0xda,0xff,0x02,0x04,0x06]
+0xff,0xff,0xd8,0xda,0xff,0x02,0x04,0x06
+
+# GFX90A: ds_mskor_rtn_b64 a[6:7], v1, a[254:255], a[4:5] offset:65535 ; encoding: [0xff,0xff,0xd8,0xda,0x01,0xfe,0x04,0x06]
+0xff,0xff,0xd8,0xda,0x01,0xfe,0x04,0x06
+
+# GFX90A: ds_mskor_rtn_b64 a[6:7], v1, a[2:3], a[254:255] offset:65535 ; encoding: [0xff,0xff,0xd8,0xda,0x01,0x02,0xfe,0x06]
+0xff,0xff,0xd8,0xda,0x01,0x02,0xfe,0x06
+
+# GFX90A: ds_mskor_rtn_b64 a[6:7], v1, a[2:3], a[4:5] ; encoding: [0x00,0x00,0xd8,0xda,0x01,0x02,0x04,0x06]
+0x00,0x00,0xd8,0xda,0x01,0x02,0x04,0x06
+
+# GFX90A: ds_mskor_rtn_b64 a[6:7], v1, a[2:3], a[4:5] ; encoding: [0x00,0x00,0xd8,0xda,0x01,0x02,0x04,0x06]
+0x00,0x00,0xd8,0xda,0x01,0x02,0x04,0x06
+
+# GFX90A: ds_mskor_rtn_b64 a[6:7], v1, a[2:3], a[4:5] offset:4 ; encoding: [0x04,0x00,0xd8,0xda,0x01,0x02,0x04,0x06]
+0x04,0x00,0xd8,0xda,0x01,0x02,0x04,0x06
+
+# GFX90A: ds_mskor_rtn_b64 a[6:7], v1, a[2:3], a[4:5] offset:65535 gds ; encoding: [0xff,0xff,0xd9,0xda,0x01,0x02,0x04,0x06]
+0xff,0xff,0xd9,0xda,0x01,0x02,0x04,0x06
+
+# GFX90A: ds_wrxchg_rtn_b64 a[6:7], v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xda,0xda,0x01,0x02,0x00,0x06]
+0xff,0xff,0xda,0xda,0x01,0x02,0x00,0x06
+
+# GFX90A: ds_wrxchg_rtn_b64 a[254:255], v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xda,0xda,0x01,0x02,0x00,0xfe]
+0xff,0xff,0xda,0xda,0x01,0x02,0x00,0xfe
+
+# GFX90A: ds_wrxchg_rtn_b64 a[6:7], v255, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xda,0xda,0xff,0x02,0x00,0x06]
+0xff,0xff,0xda,0xda,0xff,0x02,0x00,0x06
+
+# GFX90A: ds_wrxchg_rtn_b64 a[6:7], v1, a[254:255] offset:65535 ; encoding: [0xff,0xff,0xda,0xda,0x01,0xfe,0x00,0x06]
+0xff,0xff,0xda,0xda,0x01,0xfe,0x00,0x06
+
+# GFX90A: ds_wrxchg_rtn_b64 a[6:7], v1, a[2:3] ; encoding: [0x00,0x00,0xda,0xda,0x01,0x02,0x00,0x06]
+0x00,0x00,0xda,0xda,0x01,0x02,0x00,0x06
+
+# GFX90A: ds_wrxchg_rtn_b64 a[6:7], v1, a[2:3] ; encoding: [0x00,0x00,0xda,0xda,0x01,0x02,0x00,0x06]
+0x00,0x00,0xda,0xda,0x01,0x02,0x00,0x06
+
+# GFX90A: ds_wrxchg_rtn_b64 a[6:7], v1, a[2:3] offset:4 ; encoding: [0x04,0x00,0xda,0xda,0x01,0x02,0x00,0x06]
+0x04,0x00,0xda,0xda,0x01,0x02,0x00,0x06
+
+# GFX90A: ds_wrxchg_rtn_b64 a[6:7], v1, a[2:3] offset:65535 gds ; encoding: [0xff,0xff,0xdb,0xda,0x01,0x02,0x00,0x06]
+0xff,0xff,0xdb,0xda,0x01,0x02,0x00,0x06
+
+# GFX90A: ds_wrxchg2_rtn_b64 a[6:9], v1, a[2:3], a[4:5] offset0:127 offset1:255 ; encoding: [0x7f,0xff,0xdc,0xda,0x01,0x02,0x04,0x06]
+0x7f,0xff,0xdc,0xda,0x01,0x02,0x04,0x06
+
+# GFX90A: ds_wrxchg2_rtn_b64 a[252:255], v1, a[2:3], a[4:5] offset0:127 offset1:255 ; encoding: [0x7f,0xff,0xdc,0xda,0x01,0x02,0x04,0xfc]
+0x7f,0xff,0xdc,0xda,0x01,0x02,0x04,0xfc
+
+# GFX90A: ds_wrxchg2_rtn_b64 a[6:9], v255, a[2:3], a[4:5] offset0:127 offset1:255 ; encoding: [0x7f,0xff,0xdc,0xda,0xff,0x02,0x04,0x06]
+0x7f,0xff,0xdc,0xda,0xff,0x02,0x04,0x06
+
+# GFX90A: ds_wrxchg2_rtn_b64 a[6:9], v1, a[254:255], a[4:5] offset0:127 offset1:255 ; encoding: [0x7f,0xff,0xdc,0xda,0x01,0xfe,0x04,0x06]
+0x7f,0xff,0xdc,0xda,0x01,0xfe,0x04,0x06
+
+# GFX90A: ds_wrxchg2_rtn_b64 a[6:9], v1, a[2:3], a[254:255] offset0:127 offset1:255 ; encoding: [0x7f,0xff,0xdc,0xda,0x01,0x02,0xfe,0x06]
+0x7f,0xff,0xdc,0xda,0x01,0x02,0xfe,0x06
+
+# GFX90A: ds_wrxchg2_rtn_b64 a[6:9], v1, a[2:3], a[4:5] offset1:255 ; encoding: [0x00,0xff,0xdc,0xda,0x01,0x02,0x04,0x06]
+0x00,0xff,0xdc,0xda,0x01,0x02,0x04,0x06
+
+# GFX90A: ds_wrxchg2_rtn_b64 a[6:9], v1, a[2:3], a[4:5] offset1:255 ; encoding: [0x00,0xff,0xdc,0xda,0x01,0x02,0x04,0x06]
+0x00,0xff,0xdc,0xda,0x01,0x02,0x04,0x06
+
+# GFX90A: ds_wrxchg2_rtn_b64 a[6:9], v1, a[2:3], a[4:5] offset0:16 offset1:255 ; encoding: [0x10,0xff,0xdc,0xda,0x01,0x02,0x04,0x06]
+0x10,0xff,0xdc,0xda,0x01,0x02,0x04,0x06
+
+# GFX90A: ds_wrxchg2_rtn_b64 a[6:9], v1, a[2:3], a[4:5] offset0:127 ; encoding: [0x7f,0x00,0xdc,0xda,0x01,0x02,0x04,0x06]
+0x7f,0x00,0xdc,0xda,0x01,0x02,0x04,0x06
+
+# GFX90A: ds_wrxchg2_rtn_b64 a[6:9], v1, a[2:3], a[4:5] offset0:127 ; encoding: [0x7f,0x00,0xdc,0xda,0x01,0x02,0x04,0x06]
+0x7f,0x00,0xdc,0xda,0x01,0x02,0x04,0x06
+
+# GFX90A: ds_wrxchg2_rtn_b64 a[6:9], v1, a[2:3], a[4:5] offset0:127 offset1:1 ; encoding: [0x7f,0x01,0xdc,0xda,0x01,0x02,0x04,0x06]
+0x7f,0x01,0xdc,0xda,0x01,0x02,0x04,0x06
+
+# GFX90A: ds_wrxchg2_rtn_b64 a[6:9], v1, a[2:3], a[4:5] offset0:127 offset1:255 gds ; encoding: [0x7f,0xff,0xdd,0xda,0x01,0x02,0x04,0x06]
+0x7f,0xff,0xdd,0xda,0x01,0x02,0x04,0x06
+
+# GFX90A: ds_wrxchg2st64_rtn_b64 a[6:9], v1, a[2:3], a[4:5] offset0:127 offset1:255 ; encoding: [0x7f,0xff,0xde,0xda,0x01,0x02,0x04,0x06]
+0x7f,0xff,0xde,0xda,0x01,0x02,0x04,0x06
+
+# GFX90A: ds_wrxchg2st64_rtn_b64 a[252:255], v1, a[2:3], a[4:5] offset0:127 offset1:255 ; encoding: [0x7f,0xff,0xde,0xda,0x01,0x02,0x04,0xfc]
+0x7f,0xff,0xde,0xda,0x01,0x02,0x04,0xfc
+
+# GFX90A: ds_wrxchg2st64_rtn_b64 a[6:9], v255, a[2:3], a[4:5] offset0:127 offset1:255 ; encoding: [0x7f,0xff,0xde,0xda,0xff,0x02,0x04,0x06]
+0x7f,0xff,0xde,0xda,0xff,0x02,0x04,0x06
+
+# GFX90A: ds_wrxchg2st64_rtn_b64 a[6:9], v1, a[254:255], a[4:5] offset0:127 offset1:255 ; encoding: [0x7f,0xff,0xde,0xda,0x01,0xfe,0x04,0x06]
+0x7f,0xff,0xde,0xda,0x01,0xfe,0x04,0x06
+
+# GFX90A: ds_wrxchg2st64_rtn_b64 a[6:9], v1, a[2:3], a[254:255] offset0:127 offset1:255 ; encoding: [0x7f,0xff,0xde,0xda,0x01,0x02,0xfe,0x06]
+0x7f,0xff,0xde,0xda,0x01,0x02,0xfe,0x06
+
+# GFX90A: ds_wrxchg2st64_rtn_b64 a[6:9], v1, a[2:3], a[4:5] offset1:255 ; encoding: [0x00,0xff,0xde,0xda,0x01,0x02,0x04,0x06]
+0x00,0xff,0xde,0xda,0x01,0x02,0x04,0x06
+
+# GFX90A: ds_wrxchg2st64_rtn_b64 a[6:9], v1, a[2:3], a[4:5] offset1:255 ; encoding: [0x00,0xff,0xde,0xda,0x01,0x02,0x04,0x06]
+0x00,0xff,0xde,0xda,0x01,0x02,0x04,0x06
+
+# GFX90A: ds_wrxchg2st64_rtn_b64 a[6:9], v1, a[2:3], a[4:5] offset0:16 offset1:255 ; encoding: [0x10,0xff,0xde,0xda,0x01,0x02,0x04,0x06]
+0x10,0xff,0xde,0xda,0x01,0x02,0x04,0x06
+
+# GFX90A: ds_wrxchg2st64_rtn_b64 a[6:9], v1, a[2:3], a[4:5] offset0:127 ; encoding: [0x7f,0x00,0xde,0xda,0x01,0x02,0x04,0x06]
+0x7f,0x00,0xde,0xda,0x01,0x02,0x04,0x06
+
+# GFX90A: ds_wrxchg2st64_rtn_b64 a[6:9], v1, a[2:3], a[4:5] offset0:127 ; encoding: [0x7f,0x00,0xde,0xda,0x01,0x02,0x04,0x06]
+0x7f,0x00,0xde,0xda,0x01,0x02,0x04,0x06
+
+# GFX90A: ds_wrxchg2st64_rtn_b64 a[6:9], v1, a[2:3], a[4:5] offset0:127 offset1:1 ; encoding: [0x7f,0x01,0xde,0xda,0x01,0x02,0x04,0x06]
+0x7f,0x01,0xde,0xda,0x01,0x02,0x04,0x06
+
+# GFX90A: ds_wrxchg2st64_rtn_b64 a[6:9], v1, a[2:3], a[4:5] offset0:127 offset1:255 gds ; encoding: [0x7f,0xff,0xdf,0xda,0x01,0x02,0x04,0x06]
+0x7f,0xff,0xdf,0xda,0x01,0x02,0x04,0x06
+
+# GFX90A: ds_cmpst_rtn_b64 a[6:7], v1, a[2:3], a[4:5] offset:65535 ; encoding: [0xff,0xff,0xe0,0xda,0x01,0x02,0x04,0x06]
+0xff,0xff,0xe0,0xda,0x01,0x02,0x04,0x06
+
+# GFX90A: ds_cmpst_rtn_b64 a[254:255], v1, a[2:3], a[4:5] offset:65535 ; encoding: [0xff,0xff,0xe0,0xda,0x01,0x02,0x04,0xfe]
+0xff,0xff,0xe0,0xda,0x01,0x02,0x04,0xfe
+
+# GFX90A: ds_cmpst_rtn_b64 a[6:7], v255, a[2:3], a[4:5] offset:65535 ; encoding: [0xff,0xff,0xe0,0xda,0xff,0x02,0x04,0x06]
+0xff,0xff,0xe0,0xda,0xff,0x02,0x04,0x06
+
+# GFX90A: ds_cmpst_rtn_b64 a[6:7], v1, a[254:255], a[4:5] offset:65535 ; encoding: [0xff,0xff,0xe0,0xda,0x01,0xfe,0x04,0x06]
+0xff,0xff,0xe0,0xda,0x01,0xfe,0x04,0x06
+
+# GFX90A: ds_cmpst_rtn_b64 a[6:7], v1, a[2:3], a[254:255] offset:65535 ; encoding: [0xff,0xff,0xe0,0xda,0x01,0x02,0xfe,0x06]
+0xff,0xff,0xe0,0xda,0x01,0x02,0xfe,0x06
+
+# GFX90A: ds_cmpst_rtn_b64 a[6:7], v1, a[2:3], a[4:5] ; encoding: [0x00,0x00,0xe0,0xda,0x01,0x02,0x04,0x06]
+0x00,0x00,0xe0,0xda,0x01,0x02,0x04,0x06
+
+# GFX90A: ds_cmpst_rtn_b64 a[6:7], v1, a[2:3], a[4:5] ; encoding: [0x00,0x00,0xe0,0xda,0x01,0x02,0x04,0x06]
+0x00,0x00,0xe0,0xda,0x01,0x02,0x04,0x06
+
+# GFX90A: ds_cmpst_rtn_b64 a[6:7], v1, a[2:3], a[4:5] offset:4 ; encoding: [0x04,0x00,0xe0,0xda,0x01,0x02,0x04,0x06]
+0x04,0x00,0xe0,0xda,0x01,0x02,0x04,0x06
+
+# GFX90A: ds_cmpst_rtn_b64 a[6:7], v1, a[2:3], a[4:5] offset:65535 gds ; encoding: [0xff,0xff,0xe1,0xda,0x01,0x02,0x04,0x06]
+0xff,0xff,0xe1,0xda,0x01,0x02,0x04,0x06
+
+# GFX90A: ds_cmpst_rtn_f64 a[6:7], v1, a[2:3], a[4:5] offset:65535 ; encoding: [0xff,0xff,0xe2,0xda,0x01,0x02,0x04,0x06]
+0xff,0xff,0xe2,0xda,0x01,0x02,0x04,0x06
+
+# GFX90A: ds_cmpst_rtn_f64 a[254:255], v1, a[2:3], a[4:5] offset:65535 ; encoding: [0xff,0xff,0xe2,0xda,0x01,0x02,0x04,0xfe]
+0xff,0xff,0xe2,0xda,0x01,0x02,0x04,0xfe
+
+# GFX90A: ds_cmpst_rtn_f64 a[6:7], v255, a[2:3], a[4:5] offset:65535 ; encoding: [0xff,0xff,0xe2,0xda,0xff,0x02,0x04,0x06]
+0xff,0xff,0xe2,0xda,0xff,0x02,0x04,0x06
+
+# GFX90A: ds_cmpst_rtn_f64 a[6:7], v1, a[254:255], a[4:5] offset:65535 ; encoding: [0xff,0xff,0xe2,0xda,0x01,0xfe,0x04,0x06]
+0xff,0xff,0xe2,0xda,0x01,0xfe,0x04,0x06
+
+# GFX90A: ds_cmpst_rtn_f64 a[6:7], v1, a[2:3], a[254:255] offset:65535 ; encoding: [0xff,0xff,0xe2,0xda,0x01,0x02,0xfe,0x06]
+0xff,0xff,0xe2,0xda,0x01,0x02,0xfe,0x06
+
+# GFX90A: ds_cmpst_rtn_f64 a[6:7], v1, a[2:3], a[4:5] ; encoding: [0x00,0x00,0xe2,0xda,0x01,0x02,0x04,0x06]
+0x00,0x00,0xe2,0xda,0x01,0x02,0x04,0x06
+
+# GFX90A: ds_cmpst_rtn_f64 a[6:7], v1, a[2:3], a[4:5] ; encoding: [0x00,0x00,0xe2,0xda,0x01,0x02,0x04,0x06]
+0x00,0x00,0xe2,0xda,0x01,0x02,0x04,0x06
+
+# GFX90A: ds_cmpst_rtn_f64 a[6:7], v1, a[2:3], a[4:5] offset:4 ; encoding: [0x04,0x00,0xe2,0xda,0x01,0x02,0x04,0x06]
+0x04,0x00,0xe2,0xda,0x01,0x02,0x04,0x06
+
+# GFX90A: ds_cmpst_rtn_f64 a[6:7], v1, a[2:3], a[4:5] offset:65535 gds ; encoding: [0xff,0xff,0xe3,0xda,0x01,0x02,0x04,0x06]
+0xff,0xff,0xe3,0xda,0x01,0x02,0x04,0x06
+
+# GFX90A: ds_min_rtn_f64 a[6:7], v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xe4,0xda,0x01,0x02,0x00,0x06]
+0xff,0xff,0xe4,0xda,0x01,0x02,0x00,0x06
+
+# GFX90A: ds_min_rtn_f64 a[254:255], v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xe4,0xda,0x01,0x02,0x00,0xfe]
+0xff,0xff,0xe4,0xda,0x01,0x02,0x00,0xfe
+
+# GFX90A: ds_min_rtn_f64 a[6:7], v255, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xe4,0xda,0xff,0x02,0x00,0x06]
+0xff,0xff,0xe4,0xda,0xff,0x02,0x00,0x06
+
+# GFX90A: ds_min_rtn_f64 a[6:7], v1, a[254:255] offset:65535 ; encoding: [0xff,0xff,0xe4,0xda,0x01,0xfe,0x00,0x06]
+0xff,0xff,0xe4,0xda,0x01,0xfe,0x00,0x06
+
+# GFX90A: ds_min_rtn_f64 a[6:7], v1, a[2:3] ; encoding: [0x00,0x00,0xe4,0xda,0x01,0x02,0x00,0x06]
+0x00,0x00,0xe4,0xda,0x01,0x02,0x00,0x06
+
+# GFX90A: ds_min_rtn_f64 a[6:7], v1, a[2:3] ; encoding: [0x00,0x00,0xe4,0xda,0x01,0x02,0x00,0x06]
+0x00,0x00,0xe4,0xda,0x01,0x02,0x00,0x06
+
+# GFX90A: ds_min_rtn_f64 a[6:7], v1, a[2:3] offset:4 ; encoding: [0x04,0x00,0xe4,0xda,0x01,0x02,0x00,0x06]
+0x04,0x00,0xe4,0xda,0x01,0x02,0x00,0x06
+
+# GFX90A: ds_min_rtn_f64 a[6:7], v1, a[2:3] offset:65535 gds ; encoding: [0xff,0xff,0xe5,0xda,0x01,0x02,0x00,0x06]
+0xff,0xff,0xe5,0xda,0x01,0x02,0x00,0x06
+
+# GFX90A: ds_max_rtn_f64 a[6:7], v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xe6,0xda,0x01,0x02,0x00,0x06]
+0xff,0xff,0xe6,0xda,0x01,0x02,0x00,0x06
+
+# GFX90A: ds_max_rtn_f64 a[254:255], v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xe6,0xda,0x01,0x02,0x00,0xfe]
+0xff,0xff,0xe6,0xda,0x01,0x02,0x00,0xfe
+
+# GFX90A: ds_max_rtn_f64 a[6:7], v255, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xe6,0xda,0xff,0x02,0x00,0x06]
+0xff,0xff,0xe6,0xda,0xff,0x02,0x00,0x06
+
+# GFX90A: ds_max_rtn_f64 a[6:7], v1, a[254:255] offset:65535 ; encoding: [0xff,0xff,0xe6,0xda,0x01,0xfe,0x00,0x06]
+0xff,0xff,0xe6,0xda,0x01,0xfe,0x00,0x06
+
+# GFX90A: ds_max_rtn_f64 a[6:7], v1, a[2:3] ; encoding: [0x00,0x00,0xe6,0xda,0x01,0x02,0x00,0x06]
+0x00,0x00,0xe6,0xda,0x01,0x02,0x00,0x06
+
+# GFX90A: ds_max_rtn_f64 a[6:7], v1, a[2:3] ; encoding: [0x00,0x00,0xe6,0xda,0x01,0x02,0x00,0x06]
+0x00,0x00,0xe6,0xda,0x01,0x02,0x00,0x06
+
+# GFX90A: ds_max_rtn_f64 a[6:7], v1, a[2:3] offset:4 ; encoding: [0x04,0x00,0xe6,0xda,0x01,0x02,0x00,0x06]
+0x04,0x00,0xe6,0xda,0x01,0x02,0x00,0x06
+
+# GFX90A: ds_max_rtn_f64 a[6:7], v1, a[2:3] offset:65535 gds ; encoding: [0xff,0xff,0xe7,0xda,0x01,0x02,0x00,0x06]
+0xff,0xff,0xe7,0xda,0x01,0x02,0x00,0x06
+
+# GFX90A: ds_read_b64 a[6:7], v1 offset:65535 ; encoding: [0xff,0xff,0xec,0xda,0x01,0x00,0x00,0x06]
+0xff,0xff,0xec,0xda,0x01,0x00,0x00,0x06
+
+# GFX90A: ds_read_b64 a[254:255], v1 offset:65535 ; encoding: [0xff,0xff,0xec,0xda,0x01,0x00,0x00,0xfe]
+0xff,0xff,0xec,0xda,0x01,0x00,0x00,0xfe
+
+# GFX90A: ds_read_b64 a[6:7], v255 offset:65535 ; encoding: [0xff,0xff,0xec,0xda,0xff,0x00,0x00,0x06]
+0xff,0xff,0xec,0xda,0xff,0x00,0x00,0x06
+
+# GFX90A: ds_read_b64 a[6:7], v1          ; encoding: [0x00,0x00,0xec,0xda,0x01,0x00,0x00,0x06]
+0x00,0x00,0xec,0xda,0x01,0x00,0x00,0x06
+
+# GFX90A: ds_read_b64 a[6:7], v1          ; encoding: [0x00,0x00,0xec,0xda,0x01,0x00,0x00,0x06]
+0x00,0x00,0xec,0xda,0x01,0x00,0x00,0x06
+
+# GFX90A: ds_read_b64 a[6:7], v1 offset:4 ; encoding: [0x04,0x00,0xec,0xda,0x01,0x00,0x00,0x06]
+0x04,0x00,0xec,0xda,0x01,0x00,0x00,0x06
+
+# GFX90A: ds_read_b64 a[6:7], v1 offset:65535 gds ; encoding: [0xff,0xff,0xed,0xda,0x01,0x00,0x00,0x06]
+0xff,0xff,0xed,0xda,0x01,0x00,0x00,0x06
+
+# GFX90A: ds_read2_b64 a[6:9], v1 offset0:127 offset1:255 ; encoding: [0x7f,0xff,0xee,0xda,0x01,0x00,0x00,0x06]
+0x7f,0xff,0xee,0xda,0x01,0x00,0x00,0x06
+
+# GFX90A: ds_read2_b64 a[252:255], v1 offset0:127 offset1:255 ; encoding: [0x7f,0xff,0xee,0xda,0x01,0x00,0x00,0xfc]
+0x7f,0xff,0xee,0xda,0x01,0x00,0x00,0xfc
+
+# GFX90A: ds_read2_b64 a[6:9], v255 offset0:127 offset1:255 ; encoding: [0x7f,0xff,0xee,0xda,0xff,0x00,0x00,0x06]
+0x7f,0xff,0xee,0xda,0xff,0x00,0x00,0x06
+
+# GFX90A: ds_read2_b64 a[6:9], v1 offset1:255 ; encoding: [0x00,0xff,0xee,0xda,0x01,0x00,0x00,0x06]
+0x00,0xff,0xee,0xda,0x01,0x00,0x00,0x06
+
+# GFX90A: ds_read2_b64 a[6:9], v1 offset1:255 ; encoding: [0x00,0xff,0xee,0xda,0x01,0x00,0x00,0x06]
+0x00,0xff,0xee,0xda,0x01,0x00,0x00,0x06
+
+# GFX90A: ds_read2_b64 a[6:9], v1 offset0:16 offset1:255 ; encoding: [0x10,0xff,0xee,0xda,0x01,0x00,0x00,0x06]
+0x10,0xff,0xee,0xda,0x01,0x00,0x00,0x06
+
+# GFX90A: ds_read2_b64 a[6:9], v1 offset0:127 ; encoding: [0x7f,0x00,0xee,0xda,0x01,0x00,0x00,0x06]
+0x7f,0x00,0xee,0xda,0x01,0x00,0x00,0x06
+
+# GFX90A: ds_read2_b64 a[6:9], v1 offset0:127 ; encoding: [0x7f,0x00,0xee,0xda,0x01,0x00,0x00,0x06]
+0x7f,0x00,0xee,0xda,0x01,0x00,0x00,0x06
+
+# GFX90A: ds_read2_b64 a[6:9], v1 offset0:127 offset1:1 ; encoding: [0x7f,0x01,0xee,0xda,0x01,0x00,0x00,0x06]
+0x7f,0x01,0xee,0xda,0x01,0x00,0x00,0x06
+
+# GFX90A: ds_read2_b64 a[6:9], v1 offset0:127 offset1:255 gds ; encoding: [0x7f,0xff,0xef,0xda,0x01,0x00,0x00,0x06]
+0x7f,0xff,0xef,0xda,0x01,0x00,0x00,0x06
+
+# GFX90A: ds_read2st64_b64 a[6:9], v1 offset0:127 offset1:255 ; encoding: [0x7f,0xff,0xf0,0xda,0x01,0x00,0x00,0x06]
+0x7f,0xff,0xf0,0xda,0x01,0x00,0x00,0x06
+
+# GFX90A: ds_read2st64_b64 a[252:255], v1 offset0:127 offset1:255 ; encoding: [0x7f,0xff,0xf0,0xda,0x01,0x00,0x00,0xfc]
+0x7f,0xff,0xf0,0xda,0x01,0x00,0x00,0xfc
+
+# GFX90A: ds_read2st64_b64 a[6:9], v255 offset0:127 offset1:255 ; encoding: [0x7f,0xff,0xf0,0xda,0xff,0x00,0x00,0x06]
+0x7f,0xff,0xf0,0xda,0xff,0x00,0x00,0x06
+
+# GFX90A: ds_read2st64_b64 a[6:9], v1 offset1:255 ; encoding: [0x00,0xff,0xf0,0xda,0x01,0x00,0x00,0x06]
+0x00,0xff,0xf0,0xda,0x01,0x00,0x00,0x06
+
+# GFX90A: ds_read2st64_b64 a[6:9], v1 offset1:255 ; encoding: [0x00,0xff,0xf0,0xda,0x01,0x00,0x00,0x06]
+0x00,0xff,0xf0,0xda,0x01,0x00,0x00,0x06
+
+# GFX90A: ds_read2st64_b64 a[6:9], v1 offset0:16 offset1:255 ; encoding: [0x10,0xff,0xf0,0xda,0x01,0x00,0x00,0x06]
+0x10,0xff,0xf0,0xda,0x01,0x00,0x00,0x06
+
+# GFX90A: ds_read2st64_b64 a[6:9], v1 offset0:127 ; encoding: [0x7f,0x00,0xf0,0xda,0x01,0x00,0x00,0x06]
+0x7f,0x00,0xf0,0xda,0x01,0x00,0x00,0x06
+
+# GFX90A: ds_read2st64_b64 a[6:9], v1 offset0:127 ; encoding: [0x7f,0x00,0xf0,0xda,0x01,0x00,0x00,0x06]
+0x7f,0x00,0xf0,0xda,0x01,0x00,0x00,0x06
+
+# GFX90A: ds_read2st64_b64 a[6:9], v1 offset0:127 offset1:1 ; encoding: [0x7f,0x01,0xf0,0xda,0x01,0x00,0x00,0x06]
+0x7f,0x01,0xf0,0xda,0x01,0x00,0x00,0x06
+
+# GFX90A: ds_read2st64_b64 a[6:9], v1 offset0:127 offset1:255 gds ; encoding: [0x7f,0xff,0xf1,0xda,0x01,0x00,0x00,0x06]
+0x7f,0xff,0xf1,0xda,0x01,0x00,0x00,0x06
+
+# GFX90A: ds_condxchg32_rtn_b64 a[6:7], v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xfc,0xda,0x01,0x02,0x00,0x06]
+0xff,0xff,0xfc,0xda,0x01,0x02,0x00,0x06
+
+# GFX90A: ds_condxchg32_rtn_b64 a[254:255], v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xfc,0xda,0x01,0x02,0x00,0xfe]
+0xff,0xff,0xfc,0xda,0x01,0x02,0x00,0xfe
+
+# GFX90A: ds_condxchg32_rtn_b64 a[6:7], v255, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xfc,0xda,0xff,0x02,0x00,0x06]
+0xff,0xff,0xfc,0xda,0xff,0x02,0x00,0x06
+
+# GFX90A: ds_condxchg32_rtn_b64 a[6:7], v1, a[254:255] offset:65535 ; encoding: [0xff,0xff,0xfc,0xda,0x01,0xfe,0x00,0x06]
+0xff,0xff,0xfc,0xda,0x01,0xfe,0x00,0x06
+
+# GFX90A: ds_condxchg32_rtn_b64 a[6:7], v1, a[2:3] ; encoding: [0x00,0x00,0xfc,0xda,0x01,0x02,0x00,0x06]
+0x00,0x00,0xfc,0xda,0x01,0x02,0x00,0x06
+
+# GFX90A: ds_condxchg32_rtn_b64 a[6:7], v1, a[2:3] ; encoding: [0x00,0x00,0xfc,0xda,0x01,0x02,0x00,0x06]
+0x00,0x00,0xfc,0xda,0x01,0x02,0x00,0x06
+
+# GFX90A: ds_condxchg32_rtn_b64 a[6:7], v1, a[2:3] offset:4 ; encoding: [0x04,0x00,0xfc,0xda,0x01,0x02,0x00,0x06]
+0x04,0x00,0xfc,0xda,0x01,0x02,0x00,0x06
+
+# GFX90A: ds_condxchg32_rtn_b64 a[6:7], v1, a[2:3] offset:65535 gds ; encoding: [0xff,0xff,0xfd,0xda,0x01,0x02,0x00,0x06]
+0xff,0xff,0xfd,0xda,0x01,0x02,0x00,0x06
+
+# GFX90A: ds_gws_init a1 offset:65535 gds ; encoding: [0xff,0xff,0x33,0xdb,0x01,0x00,0x00,0x00]
+0xff,0xff,0x33,0xdb,0x01,0x00,0x00,0x00
+
+# GFX90A: ds_gws_init a255 offset:65535 gds ; encoding: [0xff,0xff,0x33,0xdb,0xff,0x00,0x00,0x00]
+0xff,0xff,0x33,0xdb,0xff,0x00,0x00,0x00
+
+# GFX90A: ds_gws_init a1 gds              ; encoding: [0x00,0x00,0x33,0xdb,0x01,0x00,0x00,0x00]
+0x00,0x00,0x33,0xdb,0x01,0x00,0x00,0x00
+
+# GFX90A: ds_gws_init a1 gds              ; encoding: [0x00,0x00,0x33,0xdb,0x01,0x00,0x00,0x00]
+0x00,0x00,0x33,0xdb,0x01,0x00,0x00,0x00
+
+# GFX90A: ds_gws_init a1 offset:4 gds     ; encoding: [0x04,0x00,0x33,0xdb,0x01,0x00,0x00,0x00]
+0x04,0x00,0x33,0xdb,0x01,0x00,0x00,0x00
+
+# GFX90A: ds_gws_sema_br a1 offset:65535 gds ; encoding: [0xff,0xff,0x37,0xdb,0x01,0x00,0x00,0x00]
+0xff,0xff,0x37,0xdb,0x01,0x00,0x00,0x00
+
+# GFX90A: ds_gws_sema_br a255 offset:65535 gds ; encoding: [0xff,0xff,0x37,0xdb,0xff,0x00,0x00,0x00]
+0xff,0xff,0x37,0xdb,0xff,0x00,0x00,0x00
+
+# GFX90A: ds_gws_sema_br a1 gds           ; encoding: [0x00,0x00,0x37,0xdb,0x01,0x00,0x00,0x00]
+0x00,0x00,0x37,0xdb,0x01,0x00,0x00,0x00
+
+# GFX90A: ds_gws_sema_br a1 gds           ; encoding: [0x00,0x00,0x37,0xdb,0x01,0x00,0x00,0x00]
+0x00,0x00,0x37,0xdb,0x01,0x00,0x00,0x00
+
+# GFX90A: ds_gws_sema_br a1 offset:4 gds  ; encoding: [0x04,0x00,0x37,0xdb,0x01,0x00,0x00,0x00]
+0x04,0x00,0x37,0xdb,0x01,0x00,0x00,0x00
+
+# GFX90A: ds_gws_barrier a1 offset:65535 gds ; encoding: [0xff,0xff,0x3b,0xdb,0x01,0x00,0x00,0x00]
+0xff,0xff,0x3b,0xdb,0x01,0x00,0x00,0x00
+
+# GFX90A: ds_gws_barrier a255 offset:65535 gds ; encoding: [0xff,0xff,0x3b,0xdb,0xff,0x00,0x00,0x00]
+0xff,0xff,0x3b,0xdb,0xff,0x00,0x00,0x00
+
+# GFX90A: ds_gws_barrier a1 gds           ; encoding: [0x00,0x00,0x3b,0xdb,0x01,0x00,0x00,0x00]
+0x00,0x00,0x3b,0xdb,0x01,0x00,0x00,0x00
+
+# GFX90A: ds_gws_barrier a1 gds           ; encoding: [0x00,0x00,0x3b,0xdb,0x01,0x00,0x00,0x00]
+0x00,0x00,0x3b,0xdb,0x01,0x00,0x00,0x00
+
+# GFX90A: ds_gws_barrier a1 offset:4 gds  ; encoding: [0x04,0x00,0x3b,0xdb,0x01,0x00,0x00,0x00]
+0x04,0x00,0x3b,0xdb,0x01,0x00,0x00,0x00
+
+# GFX90A: ds_consume a5 offset:65535      ; encoding: [0xff,0xff,0x7a,0xdb,0x00,0x00,0x00,0x05]
+0xff,0xff,0x7a,0xdb,0x00,0x00,0x00,0x05
+
+# GFX90A: ds_consume a255 offset:65535    ; encoding: [0xff,0xff,0x7a,0xdb,0x00,0x00,0x00,0xff]
+0xff,0xff,0x7a,0xdb,0x00,0x00,0x00,0xff
+
+# GFX90A: ds_consume a5                   ; encoding: [0x00,0x00,0x7a,0xdb,0x00,0x00,0x00,0x05]
+0x00,0x00,0x7a,0xdb,0x00,0x00,0x00,0x05
+
+# GFX90A: ds_consume a5                   ; encoding: [0x00,0x00,0x7a,0xdb,0x00,0x00,0x00,0x05]
+0x00,0x00,0x7a,0xdb,0x00,0x00,0x00,0x05
+
+# GFX90A: ds_consume a5 offset:4          ; encoding: [0x04,0x00,0x7a,0xdb,0x00,0x00,0x00,0x05]
+0x04,0x00,0x7a,0xdb,0x00,0x00,0x00,0x05
+
+# GFX90A: ds_consume a5 offset:65535 gds  ; encoding: [0xff,0xff,0x7b,0xdb,0x00,0x00,0x00,0x05]
+0xff,0xff,0x7b,0xdb,0x00,0x00,0x00,0x05
+
+# GFX90A: ds_append a5 offset:65535       ; encoding: [0xff,0xff,0x7c,0xdb,0x00,0x00,0x00,0x05]
+0xff,0xff,0x7c,0xdb,0x00,0x00,0x00,0x05
+
+# GFX90A: ds_append a255 offset:65535     ; encoding: [0xff,0xff,0x7c,0xdb,0x00,0x00,0x00,0xff]
+0xff,0xff,0x7c,0xdb,0x00,0x00,0x00,0xff
+
+# GFX90A: ds_append a5                    ; encoding: [0x00,0x00,0x7c,0xdb,0x00,0x00,0x00,0x05]
+0x00,0x00,0x7c,0xdb,0x00,0x00,0x00,0x05
+
+# GFX90A: ds_append a5                    ; encoding: [0x00,0x00,0x7c,0xdb,0x00,0x00,0x00,0x05]
+0x00,0x00,0x7c,0xdb,0x00,0x00,0x00,0x05
+
+# GFX90A: ds_append a5 offset:4           ; encoding: [0x04,0x00,0x7c,0xdb,0x00,0x00,0x00,0x05]
+0x04,0x00,0x7c,0xdb,0x00,0x00,0x00,0x05
+
+# GFX90A: ds_append a5 offset:65535 gds   ; encoding: [0xff,0xff,0x7d,0xdb,0x00,0x00,0x00,0x05]
+0xff,0xff,0x7d,0xdb,0x00,0x00,0x00,0x05
+
+# GFX90A: ds_ordered_count a5, v1 offset:65535 gds ; encoding: [0xff,0xff,0x7f,0xdb,0x01,0x00,0x00,0x05]
+0xff,0xff,0x7f,0xdb,0x01,0x00,0x00,0x05
+
+# GFX90A: ds_ordered_count a255, v1 offset:65535 gds ; encoding: [0xff,0xff,0x7f,0xdb,0x01,0x00,0x00,0xff]
+0xff,0xff,0x7f,0xdb,0x01,0x00,0x00,0xff
+
+# GFX90A: ds_ordered_count a5, v255 offset:65535 gds ; encoding: [0xff,0xff,0x7f,0xdb,0xff,0x00,0x00,0x05]
+0xff,0xff,0x7f,0xdb,0xff,0x00,0x00,0x05
+
+# GFX90A: ds_ordered_count a5, v1 gds     ; encoding: [0x00,0x00,0x7f,0xdb,0x01,0x00,0x00,0x05]
+0x00,0x00,0x7f,0xdb,0x01,0x00,0x00,0x05
+
+# GFX90A: ds_ordered_count a5, v1 gds     ; encoding: [0x00,0x00,0x7f,0xdb,0x01,0x00,0x00,0x05]
+0x00,0x00,0x7f,0xdb,0x01,0x00,0x00,0x05
+
+# GFX90A: ds_ordered_count a5, v1 offset:4 gds ; encoding: [0x04,0x00,0x7f,0xdb,0x01,0x00,0x00,0x05]
+0x04,0x00,0x7f,0xdb,0x01,0x00,0x00,0x05
+
+# GFX90A: ds_write_b96 v1, a[2:4] offset:65535 ; encoding: [0xff,0xff,0xbc,0xdb,0x01,0x02,0x00,0x00]
+0xff,0xff,0xbc,0xdb,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_write_b96 v255, a[2:4] offset:65535 ; encoding: [0xff,0xff,0xbc,0xdb,0xff,0x02,0x00,0x00]
+0xff,0xff,0xbc,0xdb,0xff,0x02,0x00,0x00
+
+# GFX90A: ds_write_b96 v1, a[252:254] offset:65535 ; encoding: [0xff,0xff,0xbc,0xdb,0x01,0xfc,0x00,0x00]
+0xff,0xff,0xbc,0xdb,0x01,0xfc,0x00,0x00
+
+# GFX90A: ds_write_b96 v1, a[2:4]         ; encoding: [0x00,0x00,0xbc,0xdb,0x01,0x02,0x00,0x00]
+0x00,0x00,0xbc,0xdb,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_write_b96 v1, a[2:4]         ; encoding: [0x00,0x00,0xbc,0xdb,0x01,0x02,0x00,0x00]
+0x00,0x00,0xbc,0xdb,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_write_b96 v1, a[2:4] offset:4 ; encoding: [0x04,0x00,0xbc,0xdb,0x01,0x02,0x00,0x00]
+0x04,0x00,0xbc,0xdb,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_write_b96 v1, a[2:4] offset:65535 gds ; encoding: [0xff,0xff,0xbd,0xdb,0x01,0x02,0x00,0x00]
+0xff,0xff,0xbd,0xdb,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_write_b128 v1, a[2:5] offset:65535 ; encoding: [0xff,0xff,0xbe,0xdb,0x01,0x02,0x00,0x00]
+0xff,0xff,0xbe,0xdb,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_write_b128 v255, a[2:5] offset:65535 ; encoding: [0xff,0xff,0xbe,0xdb,0xff,0x02,0x00,0x00]
+0xff,0xff,0xbe,0xdb,0xff,0x02,0x00,0x00
+
+# GFX90A: ds_write_b128 v1, a[252:255] offset:65535 ; encoding: [0xff,0xff,0xbe,0xdb,0x01,0xfc,0x00,0x00]
+0xff,0xff,0xbe,0xdb,0x01,0xfc,0x00,0x00
+
+# GFX90A: ds_write_b128 v1, a[2:5]        ; encoding: [0x00,0x00,0xbe,0xdb,0x01,0x02,0x00,0x00]
+0x00,0x00,0xbe,0xdb,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_write_b128 v1, a[2:5]        ; encoding: [0x00,0x00,0xbe,0xdb,0x01,0x02,0x00,0x00]
+0x00,0x00,0xbe,0xdb,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_write_b128 v1, a[2:5] offset:4 ; encoding: [0x04,0x00,0xbe,0xdb,0x01,0x02,0x00,0x00]
+0x04,0x00,0xbe,0xdb,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_write_b128 v1, a[2:5] offset:65535 gds ; encoding: [0xff,0xff,0xbf,0xdb,0x01,0x02,0x00,0x00]
+0xff,0xff,0xbf,0xdb,0x01,0x02,0x00,0x00
+
+# GFX90A: ds_read_b96 a[6:8], v1 offset:65535 ; encoding: [0xff,0xff,0xfc,0xdb,0x01,0x00,0x00,0x06]
+0xff,0xff,0xfc,0xdb,0x01,0x00,0x00,0x06
+
+# GFX90A: ds_read_b96 a[252:254], v1 offset:65535 ; encoding: [0xff,0xff,0xfc,0xdb,0x01,0x00,0x00,0xfc]
+0xff,0xff,0xfc,0xdb,0x01,0x00,0x00,0xfc
+
+# GFX90A: ds_read_b96 a[6:8], v255 offset:65535 ; encoding: [0xff,0xff,0xfc,0xdb,0xff,0x00,0x00,0x06]
+0xff,0xff,0xfc,0xdb,0xff,0x00,0x00,0x06
+
+# GFX90A: ds_read_b96 a[6:8], v1          ; encoding: [0x00,0x00,0xfc,0xdb,0x01,0x00,0x00,0x06]
+0x00,0x00,0xfc,0xdb,0x01,0x00,0x00,0x06
+
+# GFX90A: ds_read_b96 a[6:8], v1          ; encoding: [0x00,0x00,0xfc,0xdb,0x01,0x00,0x00,0x06]
+0x00,0x00,0xfc,0xdb,0x01,0x00,0x00,0x06
+
+# GFX90A: ds_read_b96 a[6:8], v1 offset:4 ; encoding: [0x04,0x00,0xfc,0xdb,0x01,0x00,0x00,0x06]
+0x04,0x00,0xfc,0xdb,0x01,0x00,0x00,0x06
+
+# GFX90A: ds_read_b96 a[6:8], v1 offset:65535 gds ; encoding: [0xff,0xff,0xfd,0xdb,0x01,0x00,0x00,0x06]
+0xff,0xff,0xfd,0xdb,0x01,0x00,0x00,0x06
+
+# GFX90A: ds_read_b128 a[6:9], v1 offset:65535 ; encoding: [0xff,0xff,0xfe,0xdb,0x01,0x00,0x00,0x06]
+0xff,0xff,0xfe,0xdb,0x01,0x00,0x00,0x06
+
+# GFX90A: ds_read_b128 a[252:255], v1 offset:65535 ; encoding: [0xff,0xff,0xfe,0xdb,0x01,0x00,0x00,0xfc]
+0xff,0xff,0xfe,0xdb,0x01,0x00,0x00,0xfc
+
+# GFX90A: ds_read_b128 a[6:9], v255 offset:65535 ; encoding: [0xff,0xff,0xfe,0xdb,0xff,0x00,0x00,0x06]
+0xff,0xff,0xfe,0xdb,0xff,0x00,0x00,0x06
+
+# GFX90A: ds_read_b128 a[6:9], v1         ; encoding: [0x00,0x00,0xfe,0xdb,0x01,0x00,0x00,0x06]
+0x00,0x00,0xfe,0xdb,0x01,0x00,0x00,0x06
+
+# GFX90A: ds_read_b128 a[6:9], v1         ; encoding: [0x00,0x00,0xfe,0xdb,0x01,0x00,0x00,0x06]
+0x00,0x00,0xfe,0xdb,0x01,0x00,0x00,0x06
+
+# GFX90A: ds_read_b128 a[6:9], v1 offset:4 ; encoding: [0x04,0x00,0xfe,0xdb,0x01,0x00,0x00,0x06]
+0x04,0x00,0xfe,0xdb,0x01,0x00,0x00,0x06
+
+# GFX90A: ds_read_b128 a[6:9], v1 offset:65535 gds ; encoding: [0xff,0xff,0xff,0xdb,0x01,0x00,0x00,0x06]
+0xff,0xff,0xff,0xdb,0x01,0x00,0x00,0x06
+
+# GFX90A: image_load a5, v2, s[8:15] dmask:0x1 ; encoding: [0x00,0x01,0x01,0xf0,0x02,0x05,0x02,0x00]
+0x00,0x01,0x01,0xf0,0x02,0x05,0x02,0x00
+
+# GFX90A: image_load a252, v2, s[8:15] dmask:0x1 ; encoding: [0x00,0x01,0x01,0xf0,0x02,0xfc,0x02,0x00]
+0x00,0x01,0x01,0xf0,0x02,0xfc,0x02,0x00
+
+# GFX90A: image_load a5, v252, s[8:15] dmask:0x1 ; encoding: [0x00,0x01,0x01,0xf0,0xfc,0x05,0x02,0x00]
+0x00,0x01,0x01,0xf0,0xfc,0x05,0x02,0x00
+
+# GFX90A: image_load a5, v2, s[12:19] dmask:0x1 ; encoding: [0x00,0x01,0x01,0xf0,0x02,0x05,0x03,0x00]
+0x00,0x01,0x01,0xf0,0x02,0x05,0x03,0x00
+
+# GFX90A: image_load a5, v2, s[92:99] dmask:0x1 ; encoding: [0x00,0x01,0x01,0xf0,0x02,0x05,0x17,0x00]
+0x00,0x01,0x01,0xf0,0x02,0x05,0x17,0x00
+
+# GFX90A: image_load a5, v2, s[8:15] dmask:0x2 ; encoding: [0x00,0x02,0x01,0xf0,0x02,0x05,0x02,0x00]
+0x00,0x02,0x01,0xf0,0x02,0x05,0x02,0x00
+
+# GFX90A: image_load a6, v2, s[8:15] dmask:0x3 ; encoding: [0x00,0x03,0x01,0xf0,0x02,0x06,0x02,0x00]
+0x00,0x03,0x01,0xf0,0x02,0x06,0x02,0x00
+
+# GFX90A: image_load a5, v2, s[8:15] dmask:0x4 ; encoding: [0x00,0x04,0x01,0xf0,0x02,0x05,0x02,0x00]
+0x00,0x04,0x01,0xf0,0x02,0x05,0x02,0x00
+
+# GFX90A: image_load a6, v2, s[8:15] dmask:0x5 ; encoding: [0x00,0x05,0x01,0xf0,0x02,0x06,0x02,0x00]
+0x00,0x05,0x01,0xf0,0x02,0x06,0x02,0x00
+
+# GFX90A: image_load a6, v2, s[8:15] dmask:0x6 ; encoding: [0x00,0x06,0x01,0xf0,0x02,0x06,0x02,0x00]
+0x00,0x06,0x01,0xf0,0x02,0x06,0x02,0x00
+
+# GFX90A: image_load a6, v2, s[8:15] dmask:0x7 ; encoding: [0x00,0x07,0x01,0xf0,0x02,0x06,0x02,0x00]
+0x00,0x07,0x01,0xf0,0x02,0x06,0x02,0x00
+
+# GFX90A: image_load a5, v2, s[8:15] dmask:0x8 ; encoding: [0x00,0x08,0x01,0xf0,0x02,0x05,0x02,0x00]
+0x00,0x08,0x01,0xf0,0x02,0x05,0x02,0x00
+
+# GFX90A: image_load a6, v2, s[8:15] dmask:0x9 ; encoding: [0x00,0x09,0x01,0xf0,0x02,0x06,0x02,0x00]
+0x00,0x09,0x01,0xf0,0x02,0x06,0x02,0x00
+
+# GFX90A: image_load a6, v2, s[8:15] dmask:0xa ; encoding: [0x00,0x0a,0x01,0xf0,0x02,0x06,0x02,0x00]
+0x00,0x0a,0x01,0xf0,0x02,0x06,0x02,0x00
+
+# GFX90A: image_load a6, v2, s[8:15] dmask:0xb ; encoding: [0x00,0x0b,0x01,0xf0,0x02,0x06,0x02,0x00]
+0x00,0x0b,0x01,0xf0,0x02,0x06,0x02,0x00
+
+# GFX90A: image_load a6, v2, s[8:15] dmask:0xc ; encoding: [0x00,0x0c,0x01,0xf0,0x02,0x06,0x02,0x00]
+0x00,0x0c,0x01,0xf0,0x02,0x06,0x02,0x00
+
+# GFX90A: image_load a6, v2, s[8:15] dmask:0xd ; encoding: [0x00,0x0d,0x01,0xf0,0x02,0x06,0x02,0x00]
+0x00,0x0d,0x01,0xf0,0x02,0x06,0x02,0x00
+
+# GFX90A: image_load a6, v2, s[8:15] dmask:0xe ; encoding: [0x00,0x0e,0x01,0xf0,0x02,0x06,0x02,0x00]
+0x00,0x0e,0x01,0xf0,0x02,0x06,0x02,0x00
+
+# GFX90A: image_load a5, v2, s[8:15]  ; encoding: [0x00,0x00,0x01,0xf0,0x02,0x05,0x02,0x00]
+0x00,0x00,0x01,0xf0,0x02,0x05,0x02,0x00
+
+# GFX90A: image_load a5, v2, s[8:15] dmask:0x1 unorm ; encoding: [0x00,0x11,0x01,0xf0,0x02,0x05,0x02,0x00]
+0x00,0x11,0x01,0xf0,0x02,0x05,0x02,0x00
+
+# GFX90A: image_load a5, v2, s[8:15] dmask:0x1 glc ; encoding: [0x00,0x21,0x01,0xf0,0x02,0x05,0x02,0x00]
+0x00,0x21,0x01,0xf0,0x02,0x05,0x02,0x00
+
+# GFX90A: image_load a5, v2, s[8:15] dmask:0x1 slc ; encoding: [0x00,0x01,0x01,0xf2,0x02,0x05,0x02,0x00]
+0x00,0x01,0x01,0xf2,0x02,0x05,0x02,0x00
+
+# GFX90A: image_load a5, v2, s[8:15] dmask:0x1 lwe ; encoding: [0x00,0x01,0x03,0xf0,0x02,0x05,0x02,0x00]
+0x00,0x01,0x03,0xf0,0x02,0x05,0x02,0x00
+
+# GFX90A: image_load a5, v2, s[8:15] dmask:0x1 da ; encoding: [0x00,0x41,0x01,0xf0,0x02,0x05,0x02,0x00]
+0x00,0x41,0x01,0xf0,0x02,0x05,0x02,0x00
+
+# GFX90A: image_load a5, v2, s[8:15] dmask:0x1 d16 ; encoding: [0x00,0x01,0x01,0xf0,0x02,0x05,0x02,0x80]
+0x00,0x01,0x01,0xf0,0x02,0x05,0x02,0x80
+
+# GFX90A: image_store a1, v2, s[12:19] dmask:0x1 unorm ; encoding: [0x00,0x11,0x21,0xf0,0x02,0x01,0x03,0x00]
+0x00,0x11,0x21,0xf0,0x02,0x01,0x03,0x00
+
+# GFX90A: image_store a252, v2, s[12:19] dmask:0x1 unorm ; encoding: [0x00,0x11,0x21,0xf0,0x02,0xfc,0x03,0x00]
+0x00,0x11,0x21,0xf0,0x02,0xfc,0x03,0x00
+
+# GFX90A: image_store a1, v252, s[12:19] dmask:0x1 unorm ; encoding: [0x00,0x11,0x21,0xf0,0xfc,0x01,0x03,0x00]
+0x00,0x11,0x21,0xf0,0xfc,0x01,0x03,0x00
+
+# GFX90A: image_store a1, v2, s[16:23] dmask:0x1 unorm ; encoding: [0x00,0x11,0x21,0xf0,0x02,0x01,0x04,0x00]
+0x00,0x11,0x21,0xf0,0x02,0x01,0x04,0x00
+
+# GFX90A: image_store a1, v2, s[92:99] dmask:0x1 unorm ; encoding: [0x00,0x11,0x21,0xf0,0x02,0x01,0x17,0x00]
+0x00,0x11,0x21,0xf0,0x02,0x01,0x17,0x00
+
+# GFX90A: image_store a1, v2, s[12:19] dmask:0x2 unorm ; encoding: [0x00,0x12,0x21,0xf0,0x02,0x01,0x03,0x00]
+0x00,0x12,0x21,0xf0,0x02,0x01,0x03,0x00
+
+# GFX90A: image_store a2, v2, s[12:19] dmask:0x3 unorm ; encoding: [0x00,0x13,0x21,0xf0,0x02,0x02,0x03,0x00]
+0x00,0x13,0x21,0xf0,0x02,0x02,0x03,0x00
+
+# GFX90A: image_store a1, v2, s[12:19] dmask:0x4 unorm ; encoding: [0x00,0x14,0x21,0xf0,0x02,0x01,0x03,0x00]
+0x00,0x14,0x21,0xf0,0x02,0x01,0x03,0x00
+
+# GFX90A: image_store a2, v2, s[12:19] dmask:0x5 unorm ; encoding: [0x00,0x15,0x21,0xf0,0x02,0x02,0x03,0x00]
+0x00,0x15,0x21,0xf0,0x02,0x02,0x03,0x00
+
+# GFX90A: image_store a2, v2, s[12:19] dmask:0x6 unorm ; encoding: [0x00,0x16,0x21,0xf0,0x02,0x02,0x03,0x00]
+0x00,0x16,0x21,0xf0,0x02,0x02,0x03,0x00
+
+# GFX90A: image_store a2, v2, s[12:19] dmask:0x7 unorm ; encoding: [0x00,0x17,0x21,0xf0,0x02,0x02,0x03,0x00]
+0x00,0x17,0x21,0xf0,0x02,0x02,0x03,0x00
+
+# GFX90A: image_store a1, v2, s[12:19] dmask:0x8 unorm ; encoding: [0x00,0x18,0x21,0xf0,0x02,0x01,0x03,0x00]
+0x00,0x18,0x21,0xf0,0x02,0x01,0x03,0x00
+
+# GFX90A: image_store a2, v2, s[12:19] dmask:0x9 unorm ; encoding: [0x00,0x19,0x21,0xf0,0x02,0x02,0x03,0x00]
+0x00,0x19,0x21,0xf0,0x02,0x02,0x03,0x00
+
+# GFX90A: image_store a2, v2, s[12:19] dmask:0xa unorm ; encoding: [0x00,0x1a,0x21,0xf0,0x02,0x02,0x03,0x00]
+0x00,0x1a,0x21,0xf0,0x02,0x02,0x03,0x00
+
+# GFX90A: image_store a2, v2, s[12:19] dmask:0xb unorm ; encoding: [0x00,0x1b,0x21,0xf0,0x02,0x02,0x03,0x00]
+0x00,0x1b,0x21,0xf0,0x02,0x02,0x03,0x00
+
+# GFX90A: image_store a2, v2, s[12:19] dmask:0xc unorm ; encoding: [0x00,0x1c,0x21,0xf0,0x02,0x02,0x03,0x00]
+0x00,0x1c,0x21,0xf0,0x02,0x02,0x03,0x00
+
+# GFX90A: image_store a2, v2, s[12:19] dmask:0xd unorm ; encoding: [0x00,0x1d,0x21,0xf0,0x02,0x02,0x03,0x00]
+0x00,0x1d,0x21,0xf0,0x02,0x02,0x03,0x00
+
+# GFX90A: image_store a2, v2, s[12:19] dmask:0xe unorm ; encoding: [0x00,0x1e,0x21,0xf0,0x02,0x02,0x03,0x00]
+0x00,0x1e,0x21,0xf0,0x02,0x02,0x03,0x00
+
+# GFX90A: image_store a2, v2, s[12:19] dmask:0xf unorm ; encoding: [0x00,0x1f,0x21,0xf0,0x02,0x02,0x03,0x00]
+0x00,0x1f,0x21,0xf0,0x02,0x02,0x03,0x00
+
+# GFX90A: image_store a1, v2, s[12:19] unorm ; encoding: [0x00,0x10,0x21,0xf0,0x02,0x01,0x03,0x00]
+0x00,0x10,0x21,0xf0,0x02,0x01,0x03,0x00
+
+# GFX90A: image_store a1, v2, s[12:19] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x21,0xf0,0x02,0x01,0x03,0x00]
+0x00,0x31,0x21,0xf0,0x02,0x01,0x03,0x00
+
+# GFX90A: image_store a1, v2, s[12:19] dmask:0x1 unorm slc ; encoding: [0x00,0x11,0x21,0xf2,0x02,0x01,0x03,0x00]
+0x00,0x11,0x21,0xf2,0x02,0x01,0x03,0x00
+
+# GFX90A: image_store a1, v2, s[12:19] dmask:0x1 unorm lwe ; encoding: [0x00,0x11,0x23,0xf0,0x02,0x01,0x03,0x00]
+0x00,0x11,0x23,0xf0,0x02,0x01,0x03,0x00
+
+# GFX90A: image_store a1, v2, s[12:19] dmask:0x1 unorm da ; encoding: [0x00,0x51,0x21,0xf0,0x02,0x01,0x03,0x00]
+0x00,0x51,0x21,0xf0,0x02,0x01,0x03,0x00
+
+# GFX90A: image_store a1, v2, s[12:19] dmask:0x1 unorm d16 ; encoding: [0x00,0x11,0x21,0xf0,0x02,0x01,0x03,0x80]
+0x00,0x11,0x21,0xf0,0x02,0x01,0x03,0x80
+
+# GFX90A: image_atomic_swap a5, v2, s[8:15] dmask:0x1 unorm ; encoding: [0x00,0x11,0x41,0xf0,0x02,0x05,0x02,0x00]
+0x00,0x11,0x41,0xf0,0x02,0x05,0x02,0x00
+
+# GFX90A: image_atomic_swap a252, v2, s[8:15] dmask:0x1 unorm ; encoding: [0x00,0x11,0x41,0xf0,0x02,0xfc,0x02,0x00]
+0x00,0x11,0x41,0xf0,0x02,0xfc,0x02,0x00
+
+# GFX90A: image_atomic_swap a5, v252, s[8:15] dmask:0x1 unorm ; encoding: [0x00,0x11,0x41,0xf0,0xfc,0x05,0x02,0x00]
+0x00,0x11,0x41,0xf0,0xfc,0x05,0x02,0x00
+
+# GFX90A: image_atomic_swap a5, v2, s[12:19] dmask:0x1 unorm ; encoding: [0x00,0x11,0x41,0xf0,0x02,0x05,0x03,0x00]
+0x00,0x11,0x41,0xf0,0x02,0x05,0x03,0x00
+
+# GFX90A: image_atomic_swap a5, v2, s[92:99] dmask:0x1 unorm ; encoding: [0x00,0x11,0x41,0xf0,0x02,0x05,0x17,0x00]
+0x00,0x11,0x41,0xf0,0x02,0x05,0x17,0x00
+
+# GFX90A: image_atomic_swap a6, v2, s[8:15] dmask:0x3 unorm ; encoding: [0x00,0x13,0x41,0xf0,0x02,0x06,0x02,0x00]
+0x00,0x13,0x41,0xf0,0x02,0x06,0x02,0x00
+
+# GFX90A: image_atomic_swap a5, v2, s[8:15] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x41,0xf0,0x02,0x05,0x02,0x00]
+0x00,0x31,0x41,0xf0,0x02,0x05,0x02,0x00
+
+# GFX90A: image_atomic_swap a5, v2, s[8:15] dmask:0x1 unorm slc ; encoding: [0x00,0x11,0x41,0xf2,0x02,0x05,0x02,0x00]
+0x00,0x11,0x41,0xf2,0x02,0x05,0x02,0x00
+
+# GFX90A: image_atomic_swap a5, v2, s[8:15] dmask:0x1 unorm lwe ; encoding: [0x00,0x11,0x43,0xf0,0x02,0x05,0x02,0x00]
+0x00,0x11,0x43,0xf0,0x02,0x05,0x02,0x00
+
+# GFX90A: image_atomic_swap a5, v2, s[8:15] dmask:0x1 unorm da ; encoding: [0x00,0x51,0x41,0xf0,0x02,0x05,0x02,0x00]
+0x00,0x51,0x41,0xf0,0x02,0x05,0x02,0x00
+
+# GFX90A: image_atomic_cmpswap a[6:7], v2, s[8:15] dmask:0x3 unorm ; encoding: [0x00,0x13,0x45,0xf0,0x02,0x06,0x02,0x00]
+0x00,0x13,0x45,0xf0,0x02,0x06,0x02,0x00
+
+# GFX90A: image_atomic_cmpswap a[252:253], v2, s[8:15] dmask:0x3 unorm ; encoding: [0x00,0x13,0x45,0xf0,0x02,0xfc,0x02,0x00]
+0x00,0x13,0x45,0xf0,0x02,0xfc,0x02,0x00
+
+# GFX90A: image_atomic_cmpswap a[6:7], v252, s[8:15] dmask:0x3 unorm ; encoding: [0x00,0x13,0x45,0xf0,0xfc,0x06,0x02,0x00]
+0x00,0x13,0x45,0xf0,0xfc,0x06,0x02,0x00
+
+# GFX90A: image_atomic_cmpswap a[6:7], v2, s[12:19] dmask:0x3 unorm ; encoding: [0x00,0x13,0x45,0xf0,0x02,0x06,0x03,0x00]
+0x00,0x13,0x45,0xf0,0x02,0x06,0x03,0x00
+
+# GFX90A: image_atomic_cmpswap a[6:7], v2, s[92:99] dmask:0x3 unorm ; encoding: [0x00,0x13,0x45,0xf0,0x02,0x06,0x17,0x00]
+0x00,0x13,0x45,0xf0,0x02,0x06,0x17,0x00
+
+# GFX90A: image_atomic_cmpswap a[6:7], v2, s[8:15] dmask:0xf unorm ; encoding: [0x00,0x1f,0x45,0xf0,0x02,0x06,0x02,0x00]
+0x00,0x1f,0x45,0xf0,0x02,0x06,0x02,0x00
+
+# GFX90A: image_atomic_cmpswap a[6:7], v2, s[8:15] dmask:0x3 unorm glc ; encoding: [0x00,0x33,0x45,0xf0,0x02,0x06,0x02,0x00]
+0x00,0x33,0x45,0xf0,0x02,0x06,0x02,0x00
+
+# GFX90A: image_atomic_cmpswap a[6:7], v2, s[8:15] dmask:0x3 unorm slc ; encoding: [0x00,0x13,0x45,0xf2,0x02,0x06,0x02,0x00]
+0x00,0x13,0x45,0xf2,0x02,0x06,0x02,0x00
+
+# GFX90A: image_atomic_cmpswap a[6:7], v2, s[8:15] dmask:0x3 unorm lwe ; encoding: [0x00,0x13,0x47,0xf0,0x02,0x06,0x02,0x00]
+0x00,0x13,0x47,0xf0,0x02,0x06,0x02,0x00
+
+# GFX90A: image_atomic_cmpswap a[6:7], v2, s[8:15] dmask:0x3 unorm da ; encoding: [0x00,0x53,0x45,0xf0,0x02,0x06,0x02,0x00]
+0x00,0x53,0x45,0xf0,0x02,0x06,0x02,0x00
+
+# GFX90A: image_atomic_add a5, v2, s[8:15] dmask:0x1 unorm ; encoding: [0x00,0x11,0x49,0xf0,0x02,0x05,0x02,0x00]
+0x00,0x11,0x49,0xf0,0x02,0x05,0x02,0x00
+
+# GFX90A: image_atomic_add a252, v2, s[8:15] dmask:0x1 unorm ; encoding: [0x00,0x11,0x49,0xf0,0x02,0xfc,0x02,0x00]
+0x00,0x11,0x49,0xf0,0x02,0xfc,0x02,0x00
+
+# GFX90A: image_atomic_add a5, v252, s[8:15] dmask:0x1 unorm ; encoding: [0x00,0x11,0x49,0xf0,0xfc,0x05,0x02,0x00]
+0x00,0x11,0x49,0xf0,0xfc,0x05,0x02,0x00
+
+# GFX90A: image_atomic_add a5, v2, s[12:19] dmask:0x1 unorm ; encoding: [0x00,0x11,0x49,0xf0,0x02,0x05,0x03,0x00]
+0x00,0x11,0x49,0xf0,0x02,0x05,0x03,0x00
+
+# GFX90A: image_atomic_add a5, v2, s[92:99] dmask:0x1 unorm ; encoding: [0x00,0x11,0x49,0xf0,0x02,0x05,0x17,0x00]
+0x00,0x11,0x49,0xf0,0x02,0x05,0x17,0x00
+
+# GFX90A: image_atomic_add a6, v2, s[8:15] dmask:0x3 unorm ; encoding: [0x00,0x13,0x49,0xf0,0x02,0x06,0x02,0x00]
+0x00,0x13,0x49,0xf0,0x02,0x06,0x02,0x00
+
+# GFX90A: image_atomic_add a5, v2, s[8:15] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x49,0xf0,0x02,0x05,0x02,0x00]
+0x00,0x31,0x49,0xf0,0x02,0x05,0x02,0x00
+
+# GFX90A: image_atomic_add a5, v2, s[8:15] dmask:0x1 unorm slc ; encoding: [0x00,0x11,0x49,0xf2,0x02,0x05,0x02,0x00]
+0x00,0x11,0x49,0xf2,0x02,0x05,0x02,0x00
+
+# GFX90A: image_atomic_add a5, v2, s[8:15] dmask:0x1 unorm lwe ; encoding: [0x00,0x11,0x4b,0xf0,0x02,0x05,0x02,0x00]
+0x00,0x11,0x4b,0xf0,0x02,0x05,0x02,0x00
+
+# GFX90A: image_atomic_add a5, v2, s[8:15] dmask:0x1 unorm da ; encoding: [0x00,0x51,0x49,0xf0,0x02,0x05,0x02,0x00]
+0x00,0x51,0x49,0xf0,0x02,0x05,0x02,0x00
+
+# GFX90A: image_atomic_sub a5, v2, s[8:15] dmask:0x1 unorm ; encoding: [0x00,0x11,0x4d,0xf0,0x02,0x05,0x02,0x00]
+0x00,0x11,0x4d,0xf0,0x02,0x05,0x02,0x00
+
+# GFX90A: image_atomic_sub a252, v2, s[8:15] dmask:0x1 unorm ; encoding: [0x00,0x11,0x4d,0xf0,0x02,0xfc,0x02,0x00]
+0x00,0x11,0x4d,0xf0,0x02,0xfc,0x02,0x00
+
+# GFX90A: image_atomic_sub a5, v252, s[8:15] dmask:0x1 unorm ; encoding: [0x00,0x11,0x4d,0xf0,0xfc,0x05,0x02,0x00]
+0x00,0x11,0x4d,0xf0,0xfc,0x05,0x02,0x00
+
+# GFX90A: image_atomic_sub a5, v2, s[12:19] dmask:0x1 unorm ; encoding: [0x00,0x11,0x4d,0xf0,0x02,0x05,0x03,0x00]
+0x00,0x11,0x4d,0xf0,0x02,0x05,0x03,0x00
+
+# GFX90A: image_atomic_sub a5, v2, s[92:99] dmask:0x1 unorm ; encoding: [0x00,0x11,0x4d,0xf0,0x02,0x05,0x17,0x00]
+0x00,0x11,0x4d,0xf0,0x02,0x05,0x17,0x00
+
+# GFX90A: image_atomic_sub a6, v2, s[8:15] dmask:0x3 unorm ; encoding: [0x00,0x13,0x4d,0xf0,0x02,0x06,0x02,0x00]
+0x00,0x13,0x4d,0xf0,0x02,0x06,0x02,0x00
+
+# GFX90A: image_atomic_sub a5, v2, s[8:15] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x4d,0xf0,0x02,0x05,0x02,0x00]
+0x00,0x31,0x4d,0xf0,0x02,0x05,0x02,0x00
+
+# GFX90A: image_atomic_sub a5, v2, s[8:15] dmask:0x1 unorm slc ; encoding: [0x00,0x11,0x4d,0xf2,0x02,0x05,0x02,0x00]
+0x00,0x11,0x4d,0xf2,0x02,0x05,0x02,0x00
+
+# GFX90A: image_atomic_sub a5, v2, s[8:15] dmask:0x1 unorm lwe ; encoding: [0x00,0x11,0x4f,0xf0,0x02,0x05,0x02,0x00]
+0x00,0x11,0x4f,0xf0,0x02,0x05,0x02,0x00
+
+# GFX90A: image_atomic_sub a5, v2, s[8:15] dmask:0x1 unorm da ; encoding: [0x00,0x51,0x4d,0xf0,0x02,0x05,0x02,0x00]
+0x00,0x51,0x4d,0xf0,0x02,0x05,0x02,0x00
+
+# GFX90A: image_atomic_smin a5, v2, s[8:15] dmask:0x1 unorm ; encoding: [0x00,0x11,0x51,0xf0,0x02,0x05,0x02,0x00]
+0x00,0x11,0x51,0xf0,0x02,0x05,0x02,0x00
+
+# GFX90A: image_atomic_smin a252, v2, s[8:15] dmask:0x1 unorm ; encoding: [0x00,0x11,0x51,0xf0,0x02,0xfc,0x02,0x00]
+0x00,0x11,0x51,0xf0,0x02,0xfc,0x02,0x00
+
+# GFX90A: image_atomic_smin a5, v252, s[8:15] dmask:0x1 unorm ; encoding: [0x00,0x11,0x51,0xf0,0xfc,0x05,0x02,0x00]
+0x00,0x11,0x51,0xf0,0xfc,0x05,0x02,0x00
+
+# GFX90A: image_atomic_smin a5, v2, s[12:19] dmask:0x1 unorm ; encoding: [0x00,0x11,0x51,0xf0,0x02,0x05,0x03,0x00]
+0x00,0x11,0x51,0xf0,0x02,0x05,0x03,0x00
+
+# GFX90A: image_atomic_smin a5, v2, s[92:99] dmask:0x1 unorm ; encoding: [0x00,0x11,0x51,0xf0,0x02,0x05,0x17,0x00]
+0x00,0x11,0x51,0xf0,0x02,0x05,0x17,0x00
+
+# GFX90A: image_atomic_smin a6, v2, s[8:15] dmask:0x3 unorm ; encoding: [0x00,0x13,0x51,0xf0,0x02,0x06,0x02,0x00]
+0x00,0x13,0x51,0xf0,0x02,0x06,0x02,0x00
+
+# GFX90A: image_atomic_smin a5, v2, s[8:15] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x51,0xf0,0x02,0x05,0x02,0x00]
+0x00,0x31,0x51,0xf0,0x02,0x05,0x02,0x00
+
+# GFX90A: image_atomic_smin a5, v2, s[8:15] dmask:0x1 unorm slc ; encoding: [0x00,0x11,0x51,0xf2,0x02,0x05,0x02,0x00]
+0x00,0x11,0x51,0xf2,0x02,0x05,0x02,0x00
+
+# GFX90A: image_atomic_smin a5, v2, s[8:15] dmask:0x1 unorm lwe ; encoding: [0x00,0x11,0x53,0xf0,0x02,0x05,0x02,0x00]
+0x00,0x11,0x53,0xf0,0x02,0x05,0x02,0x00
+
+# GFX90A: image_atomic_smin a5, v2, s[8:15] dmask:0x1 unorm da ; encoding: [0x00,0x51,0x51,0xf0,0x02,0x05,0x02,0x00]
+0x00,0x51,0x51,0xf0,0x02,0x05,0x02,0x00
+
+# GFX90A: image_atomic_umin a5, v2, s[8:15] dmask:0x1 unorm ; encoding: [0x00,0x11,0x55,0xf0,0x02,0x05,0x02,0x00]
+0x00,0x11,0x55,0xf0,0x02,0x05,0x02,0x00
+
+# GFX90A: image_atomic_umin a252, v2, s[8:15] dmask:0x1 unorm ; encoding: [0x00,0x11,0x55,0xf0,0x02,0xfc,0x02,0x00]
+0x00,0x11,0x55,0xf0,0x02,0xfc,0x02,0x00
+
+# GFX90A: image_atomic_umin a5, v252, s[8:15] dmask:0x1 unorm ; encoding: [0x00,0x11,0x55,0xf0,0xfc,0x05,0x02,0x00]
+0x00,0x11,0x55,0xf0,0xfc,0x05,0x02,0x00
+
+# GFX90A: image_atomic_umin a5, v2, s[12:19] dmask:0x1 unorm ; encoding: [0x00,0x11,0x55,0xf0,0x02,0x05,0x03,0x00]
+0x00,0x11,0x55,0xf0,0x02,0x05,0x03,0x00
+
+# GFX90A: image_atomic_umin a5, v2, s[92:99] dmask:0x1 unorm ; encoding: [0x00,0x11,0x55,0xf0,0x02,0x05,0x17,0x00]
+0x00,0x11,0x55,0xf0,0x02,0x05,0x17,0x00
+
+# GFX90A: image_atomic_umin a6, v2, s[8:15] dmask:0x3 unorm ; encoding: [0x00,0x13,0x55,0xf0,0x02,0x06,0x02,0x00]
+0x00,0x13,0x55,0xf0,0x02,0x06,0x02,0x00
+
+# GFX90A: image_atomic_umin a5, v2, s[8:15] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x55,0xf0,0x02,0x05,0x02,0x00]
+0x00,0x31,0x55,0xf0,0x02,0x05,0x02,0x00
+
+# GFX90A: image_atomic_umin a5, v2, s[8:15] dmask:0x1 unorm slc ; encoding: [0x00,0x11,0x55,0xf2,0x02,0x05,0x02,0x00]
+0x00,0x11,0x55,0xf2,0x02,0x05,0x02,0x00
+
+# GFX90A: image_atomic_umin a5, v2, s[8:15] dmask:0x1 unorm lwe ; encoding: [0x00,0x11,0x57,0xf0,0x02,0x05,0x02,0x00]
+0x00,0x11,0x57,0xf0,0x02,0x05,0x02,0x00
+
+# GFX90A: image_atomic_umin a5, v2, s[8:15] dmask:0x1 unorm da ; encoding: [0x00,0x51,0x55,0xf0,0x02,0x05,0x02,0x00]
+0x00,0x51,0x55,0xf0,0x02,0x05,0x02,0x00
+
+# GFX90A: image_atomic_smax a5, v2, s[8:15] dmask:0x1 unorm ; encoding: [0x00,0x11,0x59,0xf0,0x02,0x05,0x02,0x00]
+0x00,0x11,0x59,0xf0,0x02,0x05,0x02,0x00
+
+# GFX90A: image_atomic_smax a252, v2, s[8:15] dmask:0x1 unorm ; encoding: [0x00,0x11,0x59,0xf0,0x02,0xfc,0x02,0x00]
+0x00,0x11,0x59,0xf0,0x02,0xfc,0x02,0x00
+
+# GFX90A: image_atomic_smax a5, v252, s[8:15] dmask:0x1 unorm ; encoding: [0x00,0x11,0x59,0xf0,0xfc,0x05,0x02,0x00]
+0x00,0x11,0x59,0xf0,0xfc,0x05,0x02,0x00
+
+# GFX90A: image_atomic_smax a5, v2, s[12:19] dmask:0x1 unorm ; encoding: [0x00,0x11,0x59,0xf0,0x02,0x05,0x03,0x00]
+0x00,0x11,0x59,0xf0,0x02,0x05,0x03,0x00
+
+# GFX90A: image_atomic_smax a5, v2, s[92:99] dmask:0x1 unorm ; encoding: [0x00,0x11,0x59,0xf0,0x02,0x05,0x17,0x00]
+0x00,0x11,0x59,0xf0,0x02,0x05,0x17,0x00
+
+# GFX90A: image_atomic_smax a6, v2, s[8:15] dmask:0x3 unorm ; encoding: [0x00,0x13,0x59,0xf0,0x02,0x06,0x02,0x00]
+0x00,0x13,0x59,0xf0,0x02,0x06,0x02,0x00
+
+# GFX90A: image_atomic_smax a5, v2, s[8:15] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x59,0xf0,0x02,0x05,0x02,0x00]
+0x00,0x31,0x59,0xf0,0x02,0x05,0x02,0x00
+
+# GFX90A: image_atomic_smax a5, v2, s[8:15] dmask:0x1 unorm slc ; encoding: [0x00,0x11,0x59,0xf2,0x02,0x05,0x02,0x00]
+0x00,0x11,0x59,0xf2,0x02,0x05,0x02,0x00
+
+# GFX90A: image_atomic_smax a5, v2, s[8:15] dmask:0x1 unorm lwe ; encoding: [0x00,0x11,0x5b,0xf0,0x02,0x05,0x02,0x00]
+0x00,0x11,0x5b,0xf0,0x02,0x05,0x02,0x00
+
+# GFX90A: image_atomic_smax a5, v2, s[8:15] dmask:0x1 unorm da ; encoding: [0x00,0x51,0x59,0xf0,0x02,0x05,0x02,0x00]
+0x00,0x51,0x59,0xf0,0x02,0x05,0x02,0x00
+
+# GFX90A: image_atomic_umax a5, v2, s[8:15] dmask:0x1 unorm ; encoding: [0x00,0x11,0x5d,0xf0,0x02,0x05,0x02,0x00]
+0x00,0x11,0x5d,0xf0,0x02,0x05,0x02,0x00
+
+# GFX90A: image_atomic_umax a252, v2, s[8:15] dmask:0x1 unorm ; encoding: [0x00,0x11,0x5d,0xf0,0x02,0xfc,0x02,0x00]
+0x00,0x11,0x5d,0xf0,0x02,0xfc,0x02,0x00
+
+# GFX90A: image_atomic_umax a5, v252, s[8:15] dmask:0x1 unorm ; encoding: [0x00,0x11,0x5d,0xf0,0xfc,0x05,0x02,0x00]
+0x00,0x11,0x5d,0xf0,0xfc,0x05,0x02,0x00
+
+# GFX90A: image_atomic_umax a5, v2, s[12:19] dmask:0x1 unorm ; encoding: [0x00,0x11,0x5d,0xf0,0x02,0x05,0x03,0x00]
+0x00,0x11,0x5d,0xf0,0x02,0x05,0x03,0x00
+
+# GFX90A: image_atomic_umax a5, v2, s[92:99] dmask:0x1 unorm ; encoding: [0x00,0x11,0x5d,0xf0,0x02,0x05,0x17,0x00]
+0x00,0x11,0x5d,0xf0,0x02,0x05,0x17,0x00
+
+# GFX90A: image_atomic_umax a6, v2, s[8:15] dmask:0x3 unorm ; encoding: [0x00,0x13,0x5d,0xf0,0x02,0x06,0x02,0x00]
+0x00,0x13,0x5d,0xf0,0x02,0x06,0x02,0x00
+
+# GFX90A: image_atomic_umax a5, v2, s[8:15] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x5d,0xf0,0x02,0x05,0x02,0x00]
+0x00,0x31,0x5d,0xf0,0x02,0x05,0x02,0x00
+
+# GFX90A: image_atomic_umax a5, v2, s[8:15] dmask:0x1 unorm slc ; encoding: [0x00,0x11,0x5d,0xf2,0x02,0x05,0x02,0x00]
+0x00,0x11,0x5d,0xf2,0x02,0x05,0x02,0x00
+
+# GFX90A: image_atomic_umax a5, v2, s[8:15] dmask:0x1 unorm lwe ; encoding: [0x00,0x11,0x5f,0xf0,0x02,0x05,0x02,0x00]
+0x00,0x11,0x5f,0xf0,0x02,0x05,0x02,0x00
+
+# GFX90A: image_atomic_umax a5, v2, s[8:15] dmask:0x1 unorm da ; encoding: [0x00,0x51,0x5d,0xf0,0x02,0x05,0x02,0x00]
+0x00,0x51,0x5d,0xf0,0x02,0x05,0x02,0x00
+
+# GFX90A: image_atomic_and a5, v2, s[8:15] dmask:0x1 unorm ; encoding: [0x00,0x11,0x61,0xf0,0x02,0x05,0x02,0x00]
+0x00,0x11,0x61,0xf0,0x02,0x05,0x02,0x00
+
+# GFX90A: image_atomic_and a252, v2, s[8:15] dmask:0x1 unorm ; encoding: [0x00,0x11,0x61,0xf0,0x02,0xfc,0x02,0x00]
+0x00,0x11,0x61,0xf0,0x02,0xfc,0x02,0x00
+
+# GFX90A: image_atomic_and a5, v252, s[8:15] dmask:0x1 unorm ; encoding: [0x00,0x11,0x61,0xf0,0xfc,0x05,0x02,0x00]
+0x00,0x11,0x61,0xf0,0xfc,0x05,0x02,0x00
+
+# GFX90A: image_atomic_and a5, v2, s[12:19] dmask:0x1 unorm ; encoding: [0x00,0x11,0x61,0xf0,0x02,0x05,0x03,0x00]
+0x00,0x11,0x61,0xf0,0x02,0x05,0x03,0x00
+
+# GFX90A: image_atomic_and a5, v2, s[92:99] dmask:0x1 unorm ; encoding: [0x00,0x11,0x61,0xf0,0x02,0x05,0x17,0x00]
+0x00,0x11,0x61,0xf0,0x02,0x05,0x17,0x00
+
+# GFX90A: image_atomic_and a6, v2, s[8:15] dmask:0x3 unorm ; encoding: [0x00,0x13,0x61,0xf0,0x02,0x06,0x02,0x00]
+0x00,0x13,0x61,0xf0,0x02,0x06,0x02,0x00
+
+# GFX90A: image_atomic_and a5, v2, s[8:15] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x61,0xf0,0x02,0x05,0x02,0x00]
+0x00,0x31,0x61,0xf0,0x02,0x05,0x02,0x00
+
+# GFX90A: image_atomic_and a5, v2, s[8:15] dmask:0x1 unorm slc ; encoding: [0x00,0x11,0x61,0xf2,0x02,0x05,0x02,0x00]
+0x00,0x11,0x61,0xf2,0x02,0x05,0x02,0x00
+
+# GFX90A: image_atomic_and a5, v2, s[8:15] dmask:0x1 unorm lwe ; encoding: [0x00,0x11,0x63,0xf0,0x02,0x05,0x02,0x00]
+0x00,0x11,0x63,0xf0,0x02,0x05,0x02,0x00
+
+# GFX90A: image_atomic_and a5, v2, s[8:15] dmask:0x1 unorm da ; encoding: [0x00,0x51,0x61,0xf0,0x02,0x05,0x02,0x00]
+0x00,0x51,0x61,0xf0,0x02,0x05,0x02,0x00
+
+# GFX90A: image_atomic_or a5, v2, s[8:15] dmask:0x1 unorm ; encoding: [0x00,0x11,0x65,0xf0,0x02,0x05,0x02,0x00]
+0x00,0x11,0x65,0xf0,0x02,0x05,0x02,0x00
+
+# GFX90A: image_atomic_or a252, v2, s[8:15] dmask:0x1 unorm ; encoding: [0x00,0x11,0x65,0xf0,0x02,0xfc,0x02,0x00]
+0x00,0x11,0x65,0xf0,0x02,0xfc,0x02,0x00
+
+# GFX90A: image_atomic_or a5, v252, s[8:15] dmask:0x1 unorm ; encoding: [0x00,0x11,0x65,0xf0,0xfc,0x05,0x02,0x00]
+0x00,0x11,0x65,0xf0,0xfc,0x05,0x02,0x00
+
+# GFX90A: image_atomic_or a5, v2, s[12:19] dmask:0x1 unorm ; encoding: [0x00,0x11,0x65,0xf0,0x02,0x05,0x03,0x00]
+0x00,0x11,0x65,0xf0,0x02,0x05,0x03,0x00
+
+# GFX90A: image_atomic_or a5, v2, s[92:99] dmask:0x1 unorm ; encoding: [0x00,0x11,0x65,0xf0,0x02,0x05,0x17,0x00]
+0x00,0x11,0x65,0xf0,0x02,0x05,0x17,0x00
+
+# GFX90A: image_atomic_or a6, v2, s[8:15] dmask:0x3 unorm ; encoding: [0x00,0x13,0x65,0xf0,0x02,0x06,0x02,0x00]
+0x00,0x13,0x65,0xf0,0x02,0x06,0x02,0x00
+
+# GFX90A: image_atomic_or a5, v2, s[8:15] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x65,0xf0,0x02,0x05,0x02,0x00]
+0x00,0x31,0x65,0xf0,0x02,0x05,0x02,0x00
+
+# GFX90A: image_atomic_or a5, v2, s[8:15] dmask:0x1 unorm slc ; encoding: [0x00,0x11,0x65,0xf2,0x02,0x05,0x02,0x00]
+0x00,0x11,0x65,0xf2,0x02,0x05,0x02,0x00
+
+# GFX90A: image_atomic_or a5, v2, s[8:15] dmask:0x1 unorm lwe ; encoding: [0x00,0x11,0x67,0xf0,0x02,0x05,0x02,0x00]
+0x00,0x11,0x67,0xf0,0x02,0x05,0x02,0x00
+
+# GFX90A: image_atomic_or a5, v2, s[8:15] dmask:0x1 unorm da ; encoding: [0x00,0x51,0x65,0xf0,0x02,0x05,0x02,0x00]
+0x00,0x51,0x65,0xf0,0x02,0x05,0x02,0x00
+
+# GFX90A: image_atomic_xor a5, v2, s[8:15] dmask:0x1 unorm ; encoding: [0x00,0x11,0x69,0xf0,0x02,0x05,0x02,0x00]
+0x00,0x11,0x69,0xf0,0x02,0x05,0x02,0x00
+
+# GFX90A: image_atomic_xor a252, v2, s[8:15] dmask:0x1 unorm ; encoding: [0x00,0x11,0x69,0xf0,0x02,0xfc,0x02,0x00]
+0x00,0x11,0x69,0xf0,0x02,0xfc,0x02,0x00
+
+# GFX90A: image_atomic_xor a5, v252, s[8:15] dmask:0x1 unorm ; encoding: [0x00,0x11,0x69,0xf0,0xfc,0x05,0x02,0x00]
+0x00,0x11,0x69,0xf0,0xfc,0x05,0x02,0x00
+
+# GFX90A: image_atomic_xor a5, v2, s[12:19] dmask:0x1 unorm ; encoding: [0x00,0x11,0x69,0xf0,0x02,0x05,0x03,0x00]
+0x00,0x11,0x69,0xf0,0x02,0x05,0x03,0x00
+
+# GFX90A: image_atomic_xor a5, v2, s[92:99] dmask:0x1 unorm ; encoding: [0x00,0x11,0x69,0xf0,0x02,0x05,0x17,0x00]
+0x00,0x11,0x69,0xf0,0x02,0x05,0x17,0x00
+
+# GFX90A: image_atomic_xor a6, v2, s[8:15] dmask:0x3 unorm ; encoding: [0x00,0x13,0x69,0xf0,0x02,0x06,0x02,0x00]
+0x00,0x13,0x69,0xf0,0x02,0x06,0x02,0x00
+
+# GFX90A: image_atomic_xor a5, v2, s[8:15] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x69,0xf0,0x02,0x05,0x02,0x00]
+0x00,0x31,0x69,0xf0,0x02,0x05,0x02,0x00
+
+# GFX90A: image_atomic_xor a5, v2, s[8:15] dmask:0x1 unorm slc ; encoding: [0x00,0x11,0x69,0xf2,0x02,0x05,0x02,0x00]
+0x00,0x11,0x69,0xf2,0x02,0x05,0x02,0x00
+
+# GFX90A: image_atomic_xor a5, v2, s[8:15] dmask:0x1 unorm lwe ; encoding: [0x00,0x11,0x6b,0xf0,0x02,0x05,0x02,0x00]
+0x00,0x11,0x6b,0xf0,0x02,0x05,0x02,0x00
+
+# GFX90A: image_atomic_xor a5, v2, s[8:15] dmask:0x1 unorm da ; encoding: [0x00,0x51,0x69,0xf0,0x02,0x05,0x02,0x00]
+0x00,0x51,0x69,0xf0,0x02,0x05,0x02,0x00
+
+# GFX90A: image_atomic_inc a5, v2, s[8:15] dmask:0x1 unorm ; encoding: [0x00,0x11,0x6d,0xf0,0x02,0x05,0x02,0x00]
+0x00,0x11,0x6d,0xf0,0x02,0x05,0x02,0x00
+
+# GFX90A: image_atomic_inc a252, v2, s[8:15] dmask:0x1 unorm ; encoding: [0x00,0x11,0x6d,0xf0,0x02,0xfc,0x02,0x00]
+0x00,0x11,0x6d,0xf0,0x02,0xfc,0x02,0x00
+
+# GFX90A: image_atomic_inc a5, v252, s[8:15] dmask:0x1 unorm ; encoding: [0x00,0x11,0x6d,0xf0,0xfc,0x05,0x02,0x00]
+0x00,0x11,0x6d,0xf0,0xfc,0x05,0x02,0x00
+
+# GFX90A: image_atomic_inc a5, v2, s[12:19] dmask:0x1 unorm ; encoding: [0x00,0x11,0x6d,0xf0,0x02,0x05,0x03,0x00]
+0x00,0x11,0x6d,0xf0,0x02,0x05,0x03,0x00
+
+# GFX90A: image_atomic_inc a5, v2, s[92:99] dmask:0x1 unorm ; encoding: [0x00,0x11,0x6d,0xf0,0x02,0x05,0x17,0x00]
+0x00,0x11,0x6d,0xf0,0x02,0x05,0x17,0x00
+
+# GFX90A: image_atomic_inc a6, v2, s[8:15] dmask:0x3 unorm ; encoding: [0x00,0x13,0x6d,0xf0,0x02,0x06,0x02,0x00]
+0x00,0x13,0x6d,0xf0,0x02,0x06,0x02,0x00
+
+# GFX90A: image_atomic_inc a5, v2, s[8:15] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x6d,0xf0,0x02,0x05,0x02,0x00]
+0x00,0x31,0x6d,0xf0,0x02,0x05,0x02,0x00
+
+# GFX90A: image_atomic_inc a5, v2, s[8:15] dmask:0x1 unorm slc ; encoding: [0x00,0x11,0x6d,0xf2,0x02,0x05,0x02,0x00]
+0x00,0x11,0x6d,0xf2,0x02,0x05,0x02,0x00
+
+# GFX90A: image_atomic_inc a5, v2, s[8:15] dmask:0x1 unorm lwe ; encoding: [0x00,0x11,0x6f,0xf0,0x02,0x05,0x02,0x00]
+0x00,0x11,0x6f,0xf0,0x02,0x05,0x02,0x00
+
+# GFX90A: image_atomic_inc a5, v2, s[8:15] dmask:0x1 unorm da ; encoding: [0x00,0x51,0x6d,0xf0,0x02,0x05,0x02,0x00]
+0x00,0x51,0x6d,0xf0,0x02,0x05,0x02,0x00
+
+# GFX90A: image_atomic_dec a5, v2, s[8:15] dmask:0x1 unorm ; encoding: [0x00,0x11,0x71,0xf0,0x02,0x05,0x02,0x00]
+0x00,0x11,0x71,0xf0,0x02,0x05,0x02,0x00
+
+# GFX90A: image_atomic_dec a252, v2, s[8:15] dmask:0x1 unorm ; encoding: [0x00,0x11,0x71,0xf0,0x02,0xfc,0x02,0x00]
+0x00,0x11,0x71,0xf0,0x02,0xfc,0x02,0x00
+
+# GFX90A: image_atomic_dec a5, v252, s[8:15] dmask:0x1 unorm ; encoding: [0x00,0x11,0x71,0xf0,0xfc,0x05,0x02,0x00]
+0x00,0x11,0x71,0xf0,0xfc,0x05,0x02,0x00
+
+# GFX90A: image_atomic_dec a5, v2, s[12:19] dmask:0x1 unorm ; encoding: [0x00,0x11,0x71,0xf0,0x02,0x05,0x03,0x00]
+0x00,0x11,0x71,0xf0,0x02,0x05,0x03,0x00
+
+# GFX90A: image_atomic_dec a5, v2, s[92:99] dmask:0x1 unorm ; encoding: [0x00,0x11,0x71,0xf0,0x02,0x05,0x17,0x00]
+0x00,0x11,0x71,0xf0,0x02,0x05,0x17,0x00
+
+# GFX90A: image_atomic_dec a6, v2, s[8:15] dmask:0x3 unorm ; encoding: [0x00,0x13,0x71,0xf0,0x02,0x06,0x02,0x00]
+0x00,0x13,0x71,0xf0,0x02,0x06,0x02,0x00
+
+# GFX90A: image_atomic_dec a5, v2, s[8:15] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x71,0xf0,0x02,0x05,0x02,0x00]
+0x00,0x31,0x71,0xf0,0x02,0x05,0x02,0x00
+
+# GFX90A: image_atomic_dec a5, v2, s[8:15] dmask:0x1 unorm slc ; encoding: [0x00,0x11,0x71,0xf2,0x02,0x05,0x02,0x00]
+0x00,0x11,0x71,0xf2,0x02,0x05,0x02,0x00
+
+# GFX90A: image_atomic_dec a5, v2, s[8:15] dmask:0x1 unorm lwe ; encoding: [0x00,0x11,0x73,0xf0,0x02,0x05,0x02,0x00]
+0x00,0x11,0x73,0xf0,0x02,0x05,0x02,0x00
+
+# GFX90A: image_atomic_dec a5, v2, s[8:15] dmask:0x1 unorm da ; encoding: [0x00,0x51,0x71,0xf0,0x02,0x05,0x02,0x00]
+0x00,0x51,0x71,0xf0,0x02,0x05,0x02,0x00
+
+# GFX90A: image_sample a5, v0, s[8:15], s[12:15] dmask:0x1 ; encoding: [0x00,0x01,0x81,0xf0,0x00,0x05,0x62,0x00]
+0x00,0x01,0x81,0xf0,0x00,0x05,0x62,0x00

diff  --git a/llvm/test/MC/Disassembler/AMDGPU/mai-gfx90a.txt b/llvm/test/MC/Disassembler/AMDGPU/mai-gfx90a.txt
new file mode 100644
index 000000000000..ed6a60b634ee
--- /dev/null
+++ b/llvm/test/MC/Disassembler/AMDGPU/mai-gfx90a.txt
@@ -0,0 +1,2512 @@
+# RUN: llvm-mc -arch=amdgcn -mcpu=gfx90a -show-encoding -disassemble %s | FileCheck -check-prefix=GFX90A %s
+
+# GFX90A: v_accvgpr_read_b32 v2, a0 ; encoding: [0x02,0x40,0xd8,0xd3,0x00,0x01,0x00,0x18]
+0x02,0x40,0xd8,0xd3,0x00,0x01,0x00,0x18
+
+# GFX90A: v_accvgpr_read_b32 v2, a1 ; encoding: [0x02,0x40,0xd8,0xd3,0x01,0x01,0x00,0x18]
+0x02,0x40,0xd8,0xd3,0x01,0x01,0x00,0x18
+
+# GFX90A: v_accvgpr_read_b32 v2, a255 ; encoding: [0x02,0x40,0xd8,0xd3,0xff,0x01,0x00,0x18]
+0x02,0x40,0xd8,0xd3,0xff,0x01,0x00,0x18
+
+# GFX90A: v_accvgpr_write_b32 a2, -2.0 ; encoding: [0x02,0x40,0xd9,0xd3,0xf5,0x00,0x00,0x18]
+0x02,0x40,0xd9,0xd3,0xf5,0x00,0x00,0x18
+
+# GFX90A: v_accvgpr_write_b32 a2, -2 ; encoding: [0x02,0x40,0xd9,0xd3,0xc2,0x00,0x00,0x18]
+0x02,0x40,0xd9,0xd3,0xc2,0x00,0x00,0x18
+
+# GFX90A: v_accvgpr_write_b32 a2, v1 ; encoding: [0x02,0x40,0xd9,0xd3,0x01,0x01,0x00,0x18]
+0x02,0x40,0xd9,0xd3,0x01,0x01,0x00,0x18
+
+# GFX90A: v_accvgpr_mov_b32 a1, a2 ; encoding: [0x02,0xa5,0x02,0x7e]
+0x02,0xa5,0x02,0x7e
+
+# GFX90A: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[2:33] ; encoding: [0x00,0x80,0xc0,0xd3,0x00,0x03,0x0a,0x04]
+0x00,0x80,0xc0,0xd3,0x00,0x03,0x0a,0x04
+
+# GFX90A: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc0,0xd3,0x00,0x03,0x0a,0xe4]
+0x00,0x93,0xc0,0xd3,0x00,0x03,0x0a,0xe4
+
+# GFX90A: v_mfma_f32_32x32x1f32 a[0:31], v0, a1, a[2:33] ; encoding: [0x00,0x80,0xc0,0xd3,0x00,0x03,0x0a,0x14]
+0x00,0x80,0xc0,0xd3,0x00,0x03,0x0a,0x14
+
+# GFX90A: v_mfma_f32_32x32x1f32 a[0:31], v0, a1, a[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc0,0xd3,0x00,0x03,0x0a,0xf4]
+0x00,0x93,0xc0,0xd3,0x00,0x03,0x0a,0xf4
+
+# GFX90A: v_mfma_f32_32x32x1f32 a[0:31], a0, v1, a[2:33] ; encoding: [0x00,0x80,0xc0,0xd3,0x00,0x03,0x0a,0x0c]
+0x00,0x80,0xc0,0xd3,0x00,0x03,0x0a,0x0c
+
+# GFX90A: v_mfma_f32_32x32x1f32 a[0:31], a0, v1, a[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc0,0xd3,0x00,0x03,0x0a,0xec]
+0x00,0x93,0xc0,0xd3,0x00,0x03,0x0a,0xec
+
+# GFX90A: v_mfma_f32_32x32x1f32 a[0:31], a0, a1, a[2:33] ; encoding: [0x00,0x80,0xc0,0xd3,0x00,0x03,0x0a,0x1c]
+0x00,0x80,0xc0,0xd3,0x00,0x03,0x0a,0x1c
+
+# GFX90A: v_mfma_f32_32x32x1f32 a[0:31], a0, a1, a[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc0,0xd3,0x00,0x03,0x0a,0xfc]
+0x00,0x93,0xc0,0xd3,0x00,0x03,0x0a,0xfc
+
+# GFX90A: v_mfma_f32_32x32x1f32 v[0:31], v0, v1, v[2:33] ; encoding: [0x00,0x00,0xc0,0xd3,0x00,0x03,0x0a,0x04]
+0x00,0x00,0xc0,0xd3,0x00,0x03,0x0a,0x04
+
+# GFX90A: v_mfma_f32_32x32x1f32 v[0:31], v0, v1, v[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc0,0xd3,0x00,0x03,0x0a,0xe4]
+0x00,0x13,0xc0,0xd3,0x00,0x03,0x0a,0xe4
+
+# GFX90A: v_mfma_f32_32x32x1f32 v[0:31], v0, a1, v[2:33] ; encoding: [0x00,0x00,0xc0,0xd3,0x00,0x03,0x0a,0x14]
+0x00,0x00,0xc0,0xd3,0x00,0x03,0x0a,0x14
+
+# GFX90A: v_mfma_f32_32x32x1f32 v[0:31], v0, a1, v[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc0,0xd3,0x00,0x03,0x0a,0xf4]
+0x00,0x13,0xc0,0xd3,0x00,0x03,0x0a,0xf4
+
+# GFX90A: v_mfma_f32_32x32x1f32 v[0:31], a0, v1, v[2:33] ; encoding: [0x00,0x00,0xc0,0xd3,0x00,0x03,0x0a,0x0c]
+0x00,0x00,0xc0,0xd3,0x00,0x03,0x0a,0x0c
+
+# GFX90A: v_mfma_f32_32x32x1f32 v[0:31], a0, v1, v[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc0,0xd3,0x00,0x03,0x0a,0xec]
+0x00,0x13,0xc0,0xd3,0x00,0x03,0x0a,0xec
+
+# GFX90A: v_mfma_f32_32x32x1f32 v[0:31], a0, a1, v[2:33] ; encoding: [0x00,0x00,0xc0,0xd3,0x00,0x03,0x0a,0x1c]
+0x00,0x00,0xc0,0xd3,0x00,0x03,0x0a,0x1c
+
+# GFX90A: v_mfma_f32_32x32x1f32 v[0:31], a0, a1, v[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc0,0xd3,0x00,0x03,0x0a,0xfc]
+0x00,0x13,0xc0,0xd3,0x00,0x03,0x0a,0xfc
+
+# GFX90A: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, -2.0 ; encoding: [0x00,0x80,0xc0,0xd3,0x00,0x03,0xd6,0x03]
+0x00,0x80,0xc0,0xd3,0x00,0x03,0xd6,0x03
+
+# GFX90A: v_mfma_f32_32x32x1f32 v[0:31], v0, v1, -2.0 ; encoding: [0x00,0x00,0xc0,0xd3,0x00,0x03,0xd6,0x03]
+0x00,0x00,0xc0,0xd3,0x00,0x03,0xd6,0x03
+
+# GFX90A: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc0,0xd3,0x00,0x03,0xd6,0xe3]
+0x00,0x93,0xc0,0xd3,0x00,0x03,0xd6,0xe3
+
+# GFX90A: v_mfma_f32_32x32x1f32 v[0:31], v0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc0,0xd3,0x00,0x03,0xd6,0xe3]
+0x00,0x13,0xc0,0xd3,0x00,0x03,0xd6,0xe3
+
+# GFX90A: v_mfma_f32_32x32x1f32 a[0:31], v0, a1, -2.0 ; encoding: [0x00,0x80,0xc0,0xd3,0x00,0x03,0xd6,0x13]
+0x00,0x80,0xc0,0xd3,0x00,0x03,0xd6,0x13
+
+# GFX90A: v_mfma_f32_32x32x1f32 v[0:31], v0, a1, -2.0 ; encoding: [0x00,0x00,0xc0,0xd3,0x00,0x03,0xd6,0x13]
+0x00,0x00,0xc0,0xd3,0x00,0x03,0xd6,0x13
+
+# GFX90A: v_mfma_f32_32x32x1f32 a[0:31], v0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc0,0xd3,0x00,0x03,0xd6,0xf3]
+0x00,0x93,0xc0,0xd3,0x00,0x03,0xd6,0xf3
+
+# GFX90A: v_mfma_f32_32x32x1f32 v[0:31], v0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc0,0xd3,0x00,0x03,0xd6,0xf3]
+0x00,0x13,0xc0,0xd3,0x00,0x03,0xd6,0xf3
+
+# GFX90A: v_mfma_f32_32x32x1f32 a[0:31], a0, v1, -2.0 ; encoding: [0x00,0x80,0xc0,0xd3,0x00,0x03,0xd6,0x0b]
+0x00,0x80,0xc0,0xd3,0x00,0x03,0xd6,0x0b
+
+# GFX90A: v_mfma_f32_32x32x1f32 v[0:31], a0, v1, -2.0 ; encoding: [0x00,0x00,0xc0,0xd3,0x00,0x03,0xd6,0x0b]
+0x00,0x00,0xc0,0xd3,0x00,0x03,0xd6,0x0b
+
+# GFX90A: v_mfma_f32_32x32x1f32 a[0:31], a0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc0,0xd3,0x00,0x03,0xd6,0xeb]
+0x00,0x93,0xc0,0xd3,0x00,0x03,0xd6,0xeb
+
+# GFX90A: v_mfma_f32_32x32x1f32 v[0:31], a0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc0,0xd3,0x00,0x03,0xd6,0xeb]
+0x00,0x13,0xc0,0xd3,0x00,0x03,0xd6,0xeb
+
+# GFX90A: v_mfma_f32_32x32x1f32 a[0:31], a0, a1, -2.0 ; encoding: [0x00,0x80,0xc0,0xd3,0x00,0x03,0xd6,0x1b]
+0x00,0x80,0xc0,0xd3,0x00,0x03,0xd6,0x1b
+
+# GFX90A: v_mfma_f32_32x32x1f32 v[0:31], a0, a1, -2.0 ; encoding: [0x00,0x00,0xc0,0xd3,0x00,0x03,0xd6,0x1b]
+0x00,0x00,0xc0,0xd3,0x00,0x03,0xd6,0x1b
+
+# GFX90A: v_mfma_f32_32x32x1f32 a[0:31], a0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc0,0xd3,0x00,0x03,0xd6,0xfb]
+0x00,0x93,0xc0,0xd3,0x00,0x03,0xd6,0xfb
+
+# GFX90A: v_mfma_f32_32x32x1f32 v[0:31], a0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc0,0xd3,0x00,0x03,0xd6,0xfb]
+0x00,0x13,0xc0,0xd3,0x00,0x03,0xd6,0xfb
+
+# GFX90A: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[2:17] ; encoding: [0x00,0x80,0xc1,0xd3,0x00,0x03,0x0a,0x04]
+0x00,0x80,0xc1,0xd3,0x00,0x03,0x0a,0x04
+
+# GFX90A: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc1,0xd3,0x00,0x03,0x0a,0xe4]
+0x00,0x93,0xc1,0xd3,0x00,0x03,0x0a,0xe4
+
+# GFX90A: v_mfma_f32_16x16x1f32 a[0:15], v0, a1, a[2:17] ; encoding: [0x00,0x80,0xc1,0xd3,0x00,0x03,0x0a,0x14]
+0x00,0x80,0xc1,0xd3,0x00,0x03,0x0a,0x14
+
+# GFX90A: v_mfma_f32_16x16x1f32 a[0:15], v0, a1, a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc1,0xd3,0x00,0x03,0x0a,0xf4]
+0x00,0x93,0xc1,0xd3,0x00,0x03,0x0a,0xf4
+
+# GFX90A: v_mfma_f32_16x16x1f32 a[0:15], a0, v1, a[2:17] ; encoding: [0x00,0x80,0xc1,0xd3,0x00,0x03,0x0a,0x0c]
+0x00,0x80,0xc1,0xd3,0x00,0x03,0x0a,0x0c
+
+# GFX90A: v_mfma_f32_16x16x1f32 a[0:15], a0, v1, a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc1,0xd3,0x00,0x03,0x0a,0xec]
+0x00,0x93,0xc1,0xd3,0x00,0x03,0x0a,0xec
+
+# GFX90A: v_mfma_f32_16x16x1f32 a[0:15], a0, a1, a[2:17] ; encoding: [0x00,0x80,0xc1,0xd3,0x00,0x03,0x0a,0x1c]
+0x00,0x80,0xc1,0xd3,0x00,0x03,0x0a,0x1c
+
+# GFX90A: v_mfma_f32_16x16x1f32 a[0:15], a0, a1, a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc1,0xd3,0x00,0x03,0x0a,0xfc]
+0x00,0x93,0xc1,0xd3,0x00,0x03,0x0a,0xfc
+
+# GFX90A: v_mfma_f32_16x16x1f32 v[0:15], v0, v1, v[2:17] ; encoding: [0x00,0x00,0xc1,0xd3,0x00,0x03,0x0a,0x04]
+0x00,0x00,0xc1,0xd3,0x00,0x03,0x0a,0x04
+
+# GFX90A: v_mfma_f32_16x16x1f32 v[0:15], v0, v1, v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc1,0xd3,0x00,0x03,0x0a,0xe4]
+0x00,0x13,0xc1,0xd3,0x00,0x03,0x0a,0xe4
+
+# GFX90A: v_mfma_f32_16x16x1f32 v[0:15], v0, a1, v[2:17] ; encoding: [0x00,0x00,0xc1,0xd3,0x00,0x03,0x0a,0x14]
+0x00,0x00,0xc1,0xd3,0x00,0x03,0x0a,0x14
+
+# GFX90A: v_mfma_f32_16x16x1f32 v[0:15], v0, a1, v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc1,0xd3,0x00,0x03,0x0a,0xf4]
+0x00,0x13,0xc1,0xd3,0x00,0x03,0x0a,0xf4
+
+# GFX90A: v_mfma_f32_16x16x1f32 v[0:15], a0, v1, v[2:17] ; encoding: [0x00,0x00,0xc1,0xd3,0x00,0x03,0x0a,0x0c]
+0x00,0x00,0xc1,0xd3,0x00,0x03,0x0a,0x0c
+
+# GFX90A: v_mfma_f32_16x16x1f32 v[0:15], a0, v1, v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc1,0xd3,0x00,0x03,0x0a,0xec]
+0x00,0x13,0xc1,0xd3,0x00,0x03,0x0a,0xec
+
+# GFX90A: v_mfma_f32_16x16x1f32 v[0:15], a0, a1, v[2:17] ; encoding: [0x00,0x00,0xc1,0xd3,0x00,0x03,0x0a,0x1c]
+0x00,0x00,0xc1,0xd3,0x00,0x03,0x0a,0x1c
+
+# GFX90A: v_mfma_f32_16x16x1f32 v[0:15], a0, a1, v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc1,0xd3,0x00,0x03,0x0a,0xfc]
+0x00,0x13,0xc1,0xd3,0x00,0x03,0x0a,0xfc
+
+# GFX90A: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, -2.0 ; encoding: [0x00,0x80,0xc1,0xd3,0x00,0x03,0xd6,0x03]
+0x00,0x80,0xc1,0xd3,0x00,0x03,0xd6,0x03
+
+# GFX90A: v_mfma_f32_16x16x1f32 v[0:15], v0, v1, -2.0 ; encoding: [0x00,0x00,0xc1,0xd3,0x00,0x03,0xd6,0x03]
+0x00,0x00,0xc1,0xd3,0x00,0x03,0xd6,0x03
+
+# GFX90A: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc1,0xd3,0x00,0x03,0xd6,0xe3]
+0x00,0x93,0xc1,0xd3,0x00,0x03,0xd6,0xe3
+
+# GFX90A: v_mfma_f32_16x16x1f32 v[0:15], v0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc1,0xd3,0x00,0x03,0xd6,0xe3]
+0x00,0x13,0xc1,0xd3,0x00,0x03,0xd6,0xe3
+
+# GFX90A: v_mfma_f32_16x16x1f32 a[0:15], v0, a1, -2.0 ; encoding: [0x00,0x80,0xc1,0xd3,0x00,0x03,0xd6,0x13]
+0x00,0x80,0xc1,0xd3,0x00,0x03,0xd6,0x13
+
+# GFX90A: v_mfma_f32_16x16x1f32 v[0:15], v0, a1, -2.0 ; encoding: [0x00,0x00,0xc1,0xd3,0x00,0x03,0xd6,0x13]
+0x00,0x00,0xc1,0xd3,0x00,0x03,0xd6,0x13
+
+# GFX90A: v_mfma_f32_16x16x1f32 a[0:15], v0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc1,0xd3,0x00,0x03,0xd6,0xf3]
+0x00,0x93,0xc1,0xd3,0x00,0x03,0xd6,0xf3
+
+# GFX90A: v_mfma_f32_16x16x1f32 v[0:15], v0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc1,0xd3,0x00,0x03,0xd6,0xf3]
+0x00,0x13,0xc1,0xd3,0x00,0x03,0xd6,0xf3
+
+# GFX90A: v_mfma_f32_16x16x1f32 a[0:15], a0, v1, -2.0 ; encoding: [0x00,0x80,0xc1,0xd3,0x00,0x03,0xd6,0x0b]
+0x00,0x80,0xc1,0xd3,0x00,0x03,0xd6,0x0b
+
+# GFX90A: v_mfma_f32_16x16x1f32 v[0:15], a0, v1, -2.0 ; encoding: [0x00,0x00,0xc1,0xd3,0x00,0x03,0xd6,0x0b]
+0x00,0x00,0xc1,0xd3,0x00,0x03,0xd6,0x0b
+
+# GFX90A: v_mfma_f32_16x16x1f32 a[0:15], a0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc1,0xd3,0x00,0x03,0xd6,0xeb]
+0x00,0x93,0xc1,0xd3,0x00,0x03,0xd6,0xeb
+
+# GFX90A: v_mfma_f32_16x16x1f32 v[0:15], a0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc1,0xd3,0x00,0x03,0xd6,0xeb]
+0x00,0x13,0xc1,0xd3,0x00,0x03,0xd6,0xeb
+
+# GFX90A: v_mfma_f32_16x16x1f32 a[0:15], a0, a1, -2.0 ; encoding: [0x00,0x80,0xc1,0xd3,0x00,0x03,0xd6,0x1b]
+0x00,0x80,0xc1,0xd3,0x00,0x03,0xd6,0x1b
+
+# GFX90A: v_mfma_f32_16x16x1f32 v[0:15], a0, a1, -2.0 ; encoding: [0x00,0x00,0xc1,0xd3,0x00,0x03,0xd6,0x1b]
+0x00,0x00,0xc1,0xd3,0x00,0x03,0xd6,0x1b
+
+# GFX90A: v_mfma_f32_16x16x1f32 a[0:15], a0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc1,0xd3,0x00,0x03,0xd6,0xfb]
+0x00,0x93,0xc1,0xd3,0x00,0x03,0xd6,0xfb
+
+# GFX90A: v_mfma_f32_16x16x1f32 v[0:15], a0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc1,0xd3,0x00,0x03,0xd6,0xfb]
+0x00,0x13,0xc1,0xd3,0x00,0x03,0xd6,0xfb
+
+# GFX90A: v_mfma_f32_4x4x1f32 a[0:3], v0, v1, a[2:5] ; encoding: [0x00,0x80,0xc2,0xd3,0x00,0x03,0x0a,0x04]
+0x00,0x80,0xc2,0xd3,0x00,0x03,0x0a,0x04
+
+# GFX90A: v_mfma_f32_4x4x1f32 a[0:3], v0, v1, a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc2,0xd3,0x00,0x03,0x0a,0xe4]
+0x00,0x93,0xc2,0xd3,0x00,0x03,0x0a,0xe4
+
+# GFX90A: v_mfma_f32_4x4x1f32 a[0:3], v0, a1, a[2:5] ; encoding: [0x00,0x80,0xc2,0xd3,0x00,0x03,0x0a,0x14]
+0x00,0x80,0xc2,0xd3,0x00,0x03,0x0a,0x14
+
+# GFX90A: v_mfma_f32_4x4x1f32 a[0:3], v0, a1, a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc2,0xd3,0x00,0x03,0x0a,0xf4]
+0x00,0x93,0xc2,0xd3,0x00,0x03,0x0a,0xf4
+
+# GFX90A: v_mfma_f32_4x4x1f32 a[0:3], a0, v1, a[2:5] ; encoding: [0x00,0x80,0xc2,0xd3,0x00,0x03,0x0a,0x0c]
+0x00,0x80,0xc2,0xd3,0x00,0x03,0x0a,0x0c
+
+# GFX90A: v_mfma_f32_4x4x1f32 a[0:3], a0, v1, a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc2,0xd3,0x00,0x03,0x0a,0xec]
+0x00,0x93,0xc2,0xd3,0x00,0x03,0x0a,0xec
+
+# GFX90A: v_mfma_f32_4x4x1f32 a[0:3], a0, a1, a[2:5] ; encoding: [0x00,0x80,0xc2,0xd3,0x00,0x03,0x0a,0x1c]
+0x00,0x80,0xc2,0xd3,0x00,0x03,0x0a,0x1c
+
+# GFX90A: v_mfma_f32_4x4x1f32 a[0:3], a0, a1, a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc2,0xd3,0x00,0x03,0x0a,0xfc]
+0x00,0x93,0xc2,0xd3,0x00,0x03,0x0a,0xfc
+
+# GFX90A: v_mfma_f32_4x4x1f32 v[0:3], v0, v1, v[2:5] ; encoding: [0x00,0x00,0xc2,0xd3,0x00,0x03,0x0a,0x04]
+0x00,0x00,0xc2,0xd3,0x00,0x03,0x0a,0x04
+
+# GFX90A: v_mfma_f32_4x4x1f32 v[0:3], v0, v1, v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc2,0xd3,0x00,0x03,0x0a,0xe4]
+0x00,0x13,0xc2,0xd3,0x00,0x03,0x0a,0xe4
+
+# GFX90A: v_mfma_f32_4x4x1f32 v[0:3], v0, a1, v[2:5] ; encoding: [0x00,0x00,0xc2,0xd3,0x00,0x03,0x0a,0x14]
+0x00,0x00,0xc2,0xd3,0x00,0x03,0x0a,0x14
+
+# GFX90A: v_mfma_f32_4x4x1f32 v[0:3], v0, a1, v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc2,0xd3,0x00,0x03,0x0a,0xf4]
+0x00,0x13,0xc2,0xd3,0x00,0x03,0x0a,0xf4
+
+# GFX90A: v_mfma_f32_4x4x1f32 v[0:3], a0, v1, v[2:5] ; encoding: [0x00,0x00,0xc2,0xd3,0x00,0x03,0x0a,0x0c]
+0x00,0x00,0xc2,0xd3,0x00,0x03,0x0a,0x0c
+
+# GFX90A: v_mfma_f32_4x4x1f32 v[0:3], a0, v1, v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc2,0xd3,0x00,0x03,0x0a,0xec]
+0x00,0x13,0xc2,0xd3,0x00,0x03,0x0a,0xec
+
+# GFX90A: v_mfma_f32_4x4x1f32 v[0:3], a0, a1, v[2:5] ; encoding: [0x00,0x00,0xc2,0xd3,0x00,0x03,0x0a,0x1c]
+0x00,0x00,0xc2,0xd3,0x00,0x03,0x0a,0x1c
+
+# GFX90A: v_mfma_f32_4x4x1f32 v[0:3], a0, a1, v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc2,0xd3,0x00,0x03,0x0a,0xfc]
+0x00,0x13,0xc2,0xd3,0x00,0x03,0x0a,0xfc
+
+# GFX90A: v_mfma_f32_4x4x1f32 a[0:3], v0, v1, -2.0 ; encoding: [0x00,0x80,0xc2,0xd3,0x00,0x03,0xd6,0x03]
+0x00,0x80,0xc2,0xd3,0x00,0x03,0xd6,0x03
+
+# GFX90A: v_mfma_f32_4x4x1f32 v[0:3], v0, v1, -2.0 ; encoding: [0x00,0x00,0xc2,0xd3,0x00,0x03,0xd6,0x03]
+0x00,0x00,0xc2,0xd3,0x00,0x03,0xd6,0x03
+
+# GFX90A: v_mfma_f32_4x4x1f32 a[0:3], v0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc2,0xd3,0x00,0x03,0xd6,0xe3]
+0x00,0x93,0xc2,0xd3,0x00,0x03,0xd6,0xe3
+
+# GFX90A: v_mfma_f32_4x4x1f32 v[0:3], v0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc2,0xd3,0x00,0x03,0xd6,0xe3]
+0x00,0x13,0xc2,0xd3,0x00,0x03,0xd6,0xe3
+
+# GFX90A: v_mfma_f32_4x4x1f32 a[0:3], v0, a1, -2.0 ; encoding: [0x00,0x80,0xc2,0xd3,0x00,0x03,0xd6,0x13]
+0x00,0x80,0xc2,0xd3,0x00,0x03,0xd6,0x13
+
+# GFX90A: v_mfma_f32_4x4x1f32 v[0:3], v0, a1, -2.0 ; encoding: [0x00,0x00,0xc2,0xd3,0x00,0x03,0xd6,0x13]
+0x00,0x00,0xc2,0xd3,0x00,0x03,0xd6,0x13
+
+# GFX90A: v_mfma_f32_4x4x1f32 a[0:3], v0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc2,0xd3,0x00,0x03,0xd6,0xf3]
+0x00,0x93,0xc2,0xd3,0x00,0x03,0xd6,0xf3
+
+# GFX90A: v_mfma_f32_4x4x1f32 v[0:3], v0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc2,0xd3,0x00,0x03,0xd6,0xf3]
+0x00,0x13,0xc2,0xd3,0x00,0x03,0xd6,0xf3
+
+# GFX90A: v_mfma_f32_4x4x1f32 a[0:3], a0, v1, -2.0 ; encoding: [0x00,0x80,0xc2,0xd3,0x00,0x03,0xd6,0x0b]
+0x00,0x80,0xc2,0xd3,0x00,0x03,0xd6,0x0b
+
+# GFX90A: v_mfma_f32_4x4x1f32 v[0:3], a0, v1, -2.0 ; encoding: [0x00,0x00,0xc2,0xd3,0x00,0x03,0xd6,0x0b]
+0x00,0x00,0xc2,0xd3,0x00,0x03,0xd6,0x0b
+
+# GFX90A: v_mfma_f32_4x4x1f32 a[0:3], a0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc2,0xd3,0x00,0x03,0xd6,0xeb]
+0x00,0x93,0xc2,0xd3,0x00,0x03,0xd6,0xeb
+
+# GFX90A: v_mfma_f32_4x4x1f32 v[0:3], a0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc2,0xd3,0x00,0x03,0xd6,0xeb]
+0x00,0x13,0xc2,0xd3,0x00,0x03,0xd6,0xeb
+
+# GFX90A: v_mfma_f32_4x4x1f32 a[0:3], a0, a1, -2.0 ; encoding: [0x00,0x80,0xc2,0xd3,0x00,0x03,0xd6,0x1b]
+0x00,0x80,0xc2,0xd3,0x00,0x03,0xd6,0x1b
+
+# GFX90A: v_mfma_f32_4x4x1f32 v[0:3], a0, a1, -2.0 ; encoding: [0x00,0x00,0xc2,0xd3,0x00,0x03,0xd6,0x1b]
+0x00,0x00,0xc2,0xd3,0x00,0x03,0xd6,0x1b
+
+# GFX90A: v_mfma_f32_4x4x1f32 a[0:3], a0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc2,0xd3,0x00,0x03,0xd6,0xfb]
+0x00,0x93,0xc2,0xd3,0x00,0x03,0xd6,0xfb
+
+# GFX90A: v_mfma_f32_4x4x1f32 v[0:3], a0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc2,0xd3,0x00,0x03,0xd6,0xfb]
+0x00,0x13,0xc2,0xd3,0x00,0x03,0xd6,0xfb
+
+# GFX90A: v_mfma_f32_32x32x2f32 a[0:15], v0, v1, a[2:17] ; encoding: [0x00,0x80,0xc4,0xd3,0x00,0x03,0x0a,0x04]
+0x00,0x80,0xc4,0xd3,0x00,0x03,0x0a,0x04
+
+# GFX90A: v_mfma_f32_32x32x2f32 a[0:15], v0, v1, a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc4,0xd3,0x00,0x03,0x0a,0xe4]
+0x00,0x93,0xc4,0xd3,0x00,0x03,0x0a,0xe4
+
+# GFX90A: v_mfma_f32_32x32x2f32 a[0:15], v0, a1, a[2:17] ; encoding: [0x00,0x80,0xc4,0xd3,0x00,0x03,0x0a,0x14]
+0x00,0x80,0xc4,0xd3,0x00,0x03,0x0a,0x14
+
+# GFX90A: v_mfma_f32_32x32x2f32 a[0:15], v0, a1, a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc4,0xd3,0x00,0x03,0x0a,0xf4]
+0x00,0x93,0xc4,0xd3,0x00,0x03,0x0a,0xf4
+
+# GFX90A: v_mfma_f32_32x32x2f32 a[0:15], a0, v1, a[2:17] ; encoding: [0x00,0x80,0xc4,0xd3,0x00,0x03,0x0a,0x0c]
+0x00,0x80,0xc4,0xd3,0x00,0x03,0x0a,0x0c
+
+# GFX90A: v_mfma_f32_32x32x2f32 a[0:15], a0, v1, a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc4,0xd3,0x00,0x03,0x0a,0xec]
+0x00,0x93,0xc4,0xd3,0x00,0x03,0x0a,0xec
+
+# GFX90A: v_mfma_f32_32x32x2f32 a[0:15], a0, a1, a[2:17] ; encoding: [0x00,0x80,0xc4,0xd3,0x00,0x03,0x0a,0x1c]
+0x00,0x80,0xc4,0xd3,0x00,0x03,0x0a,0x1c
+
+# GFX90A: v_mfma_f32_32x32x2f32 a[0:15], a0, a1, a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc4,0xd3,0x00,0x03,0x0a,0xfc]
+0x00,0x93,0xc4,0xd3,0x00,0x03,0x0a,0xfc
+
+# GFX90A: v_mfma_f32_32x32x2f32 v[0:15], v0, v1, v[2:17] ; encoding: [0x00,0x00,0xc4,0xd3,0x00,0x03,0x0a,0x04]
+0x00,0x00,0xc4,0xd3,0x00,0x03,0x0a,0x04
+
+# GFX90A: v_mfma_f32_32x32x2f32 v[0:15], v0, v1, v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc4,0xd3,0x00,0x03,0x0a,0xe4]
+0x00,0x13,0xc4,0xd3,0x00,0x03,0x0a,0xe4
+
+# GFX90A: v_mfma_f32_32x32x2f32 v[0:15], v0, a1, v[2:17] ; encoding: [0x00,0x00,0xc4,0xd3,0x00,0x03,0x0a,0x14]
+0x00,0x00,0xc4,0xd3,0x00,0x03,0x0a,0x14
+
+# GFX90A: v_mfma_f32_32x32x2f32 v[0:15], v0, a1, v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc4,0xd3,0x00,0x03,0x0a,0xf4]
+0x00,0x13,0xc4,0xd3,0x00,0x03,0x0a,0xf4
+
+# GFX90A: v_mfma_f32_32x32x2f32 v[0:15], a0, v1, v[2:17] ; encoding: [0x00,0x00,0xc4,0xd3,0x00,0x03,0x0a,0x0c]
+0x00,0x00,0xc4,0xd3,0x00,0x03,0x0a,0x0c
+
+# GFX90A: v_mfma_f32_32x32x2f32 v[0:15], a0, v1, v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc4,0xd3,0x00,0x03,0x0a,0xec]
+0x00,0x13,0xc4,0xd3,0x00,0x03,0x0a,0xec
+
+# GFX90A: v_mfma_f32_32x32x2f32 v[0:15], a0, a1, v[2:17] ; encoding: [0x00,0x00,0xc4,0xd3,0x00,0x03,0x0a,0x1c]
+0x00,0x00,0xc4,0xd3,0x00,0x03,0x0a,0x1c
+
+# GFX90A: v_mfma_f32_32x32x2f32 v[0:15], a0, a1, v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc4,0xd3,0x00,0x03,0x0a,0xfc]
+0x00,0x13,0xc4,0xd3,0x00,0x03,0x0a,0xfc
+
+# GFX90A: v_mfma_f32_32x32x2f32 a[0:15], v0, v1, -2.0 ; encoding: [0x00,0x80,0xc4,0xd3,0x00,0x03,0xd6,0x03]
+0x00,0x80,0xc4,0xd3,0x00,0x03,0xd6,0x03
+
+# GFX90A: v_mfma_f32_32x32x2f32 v[0:15], v0, v1, -2.0 ; encoding: [0x00,0x00,0xc4,0xd3,0x00,0x03,0xd6,0x03]
+0x00,0x00,0xc4,0xd3,0x00,0x03,0xd6,0x03
+
+# GFX90A: v_mfma_f32_32x32x2f32 a[0:15], v0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc4,0xd3,0x00,0x03,0xd6,0xe3]
+0x00,0x93,0xc4,0xd3,0x00,0x03,0xd6,0xe3
+
+# GFX90A: v_mfma_f32_32x32x2f32 v[0:15], v0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc4,0xd3,0x00,0x03,0xd6,0xe3]
+0x00,0x13,0xc4,0xd3,0x00,0x03,0xd6,0xe3
+
+# GFX90A: v_mfma_f32_32x32x2f32 a[0:15], v0, a1, -2.0 ; encoding: [0x00,0x80,0xc4,0xd3,0x00,0x03,0xd6,0x13]
+0x00,0x80,0xc4,0xd3,0x00,0x03,0xd6,0x13
+
+# GFX90A: v_mfma_f32_32x32x2f32 v[0:15], v0, a1, -2.0 ; encoding: [0x00,0x00,0xc4,0xd3,0x00,0x03,0xd6,0x13]
+0x00,0x00,0xc4,0xd3,0x00,0x03,0xd6,0x13
+
+# GFX90A: v_mfma_f32_32x32x2f32 a[0:15], v0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc4,0xd3,0x00,0x03,0xd6,0xf3]
+0x00,0x93,0xc4,0xd3,0x00,0x03,0xd6,0xf3
+
+# GFX90A: v_mfma_f32_32x32x2f32 v[0:15], v0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc4,0xd3,0x00,0x03,0xd6,0xf3]
+0x00,0x13,0xc4,0xd3,0x00,0x03,0xd6,0xf3
+
+# GFX90A: v_mfma_f32_32x32x2f32 a[0:15], a0, v1, -2.0 ; encoding: [0x00,0x80,0xc4,0xd3,0x00,0x03,0xd6,0x0b]
+0x00,0x80,0xc4,0xd3,0x00,0x03,0xd6,0x0b
+
+# GFX90A: v_mfma_f32_32x32x2f32 v[0:15], a0, v1, -2.0 ; encoding: [0x00,0x00,0xc4,0xd3,0x00,0x03,0xd6,0x0b]
+0x00,0x00,0xc4,0xd3,0x00,0x03,0xd6,0x0b
+
+# GFX90A: v_mfma_f32_32x32x2f32 a[0:15], a0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc4,0xd3,0x00,0x03,0xd6,0xeb]
+0x00,0x93,0xc4,0xd3,0x00,0x03,0xd6,0xeb
+
+# GFX90A: v_mfma_f32_32x32x2f32 v[0:15], a0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc4,0xd3,0x00,0x03,0xd6,0xeb]
+0x00,0x13,0xc4,0xd3,0x00,0x03,0xd6,0xeb
+
+# GFX90A: v_mfma_f32_32x32x2f32 a[0:15], a0, a1, -2.0 ; encoding: [0x00,0x80,0xc4,0xd3,0x00,0x03,0xd6,0x1b]
+0x00,0x80,0xc4,0xd3,0x00,0x03,0xd6,0x1b
+
+# GFX90A: v_mfma_f32_32x32x2f32 v[0:15], a0, a1, -2.0 ; encoding: [0x00,0x00,0xc4,0xd3,0x00,0x03,0xd6,0x1b]
+0x00,0x00,0xc4,0xd3,0x00,0x03,0xd6,0x1b
+
+# GFX90A: v_mfma_f32_32x32x2f32 a[0:15], a0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc4,0xd3,0x00,0x03,0xd6,0xfb]
+0x00,0x93,0xc4,0xd3,0x00,0x03,0xd6,0xfb
+
+# GFX90A: v_mfma_f32_32x32x2f32 v[0:15], a0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc4,0xd3,0x00,0x03,0xd6,0xfb]
+0x00,0x13,0xc4,0xd3,0x00,0x03,0xd6,0xfb
+
+# GFX90A: v_mfma_f32_16x16x4f32 a[0:3], v0, v1, a[2:5] ; encoding: [0x00,0x80,0xc5,0xd3,0x00,0x03,0x0a,0x04]
+0x00,0x80,0xc5,0xd3,0x00,0x03,0x0a,0x04
+
+# GFX90A: v_mfma_f32_16x16x4f32 a[0:3], v0, v1, a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc5,0xd3,0x00,0x03,0x0a,0xe4]
+0x00,0x93,0xc5,0xd3,0x00,0x03,0x0a,0xe4
+
+# GFX90A: v_mfma_f32_16x16x4f32 a[0:3], v0, a1, a[2:5] ; encoding: [0x00,0x80,0xc5,0xd3,0x00,0x03,0x0a,0x14]
+0x00,0x80,0xc5,0xd3,0x00,0x03,0x0a,0x14
+
+# GFX90A: v_mfma_f32_16x16x4f32 a[0:3], v0, a1, a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc5,0xd3,0x00,0x03,0x0a,0xf4]
+0x00,0x93,0xc5,0xd3,0x00,0x03,0x0a,0xf4
+
+# GFX90A: v_mfma_f32_16x16x4f32 a[0:3], a0, v1, a[2:5] ; encoding: [0x00,0x80,0xc5,0xd3,0x00,0x03,0x0a,0x0c]
+0x00,0x80,0xc5,0xd3,0x00,0x03,0x0a,0x0c
+
+# GFX90A: v_mfma_f32_16x16x4f32 a[0:3], a0, v1, a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc5,0xd3,0x00,0x03,0x0a,0xec]
+0x00,0x93,0xc5,0xd3,0x00,0x03,0x0a,0xec
+
+# GFX90A: v_mfma_f32_16x16x4f32 a[0:3], a0, a1, a[2:5] ; encoding: [0x00,0x80,0xc5,0xd3,0x00,0x03,0x0a,0x1c]
+0x00,0x80,0xc5,0xd3,0x00,0x03,0x0a,0x1c
+
+# GFX90A: v_mfma_f32_16x16x4f32 a[0:3], a0, a1, a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc5,0xd3,0x00,0x03,0x0a,0xfc]
+0x00,0x93,0xc5,0xd3,0x00,0x03,0x0a,0xfc
+
+# GFX90A: v_mfma_f32_16x16x4f32 v[0:3], v0, v1, v[2:5] ; encoding: [0x00,0x00,0xc5,0xd3,0x00,0x03,0x0a,0x04]
+0x00,0x00,0xc5,0xd3,0x00,0x03,0x0a,0x04
+
+# GFX90A: v_mfma_f32_16x16x4f32 v[0:3], v0, v1, v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc5,0xd3,0x00,0x03,0x0a,0xe4]
+0x00,0x13,0xc5,0xd3,0x00,0x03,0x0a,0xe4
+
+# GFX90A: v_mfma_f32_16x16x4f32 v[0:3], v0, a1, v[2:5] ; encoding: [0x00,0x00,0xc5,0xd3,0x00,0x03,0x0a,0x14]
+0x00,0x00,0xc5,0xd3,0x00,0x03,0x0a,0x14
+
+# GFX90A: v_mfma_f32_16x16x4f32 v[0:3], v0, a1, v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc5,0xd3,0x00,0x03,0x0a,0xf4]
+0x00,0x13,0xc5,0xd3,0x00,0x03,0x0a,0xf4
+
+# GFX90A: v_mfma_f32_16x16x4f32 v[0:3], a0, v1, v[2:5] ; encoding: [0x00,0x00,0xc5,0xd3,0x00,0x03,0x0a,0x0c]
+0x00,0x00,0xc5,0xd3,0x00,0x03,0x0a,0x0c
+
+# GFX90A: v_mfma_f32_16x16x4f32 v[0:3], a0, v1, v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc5,0xd3,0x00,0x03,0x0a,0xec]
+0x00,0x13,0xc5,0xd3,0x00,0x03,0x0a,0xec
+
+# GFX90A: v_mfma_f32_16x16x4f32 v[0:3], a0, a1, v[2:5] ; encoding: [0x00,0x00,0xc5,0xd3,0x00,0x03,0x0a,0x1c]
+0x00,0x00,0xc5,0xd3,0x00,0x03,0x0a,0x1c
+
+# GFX90A: v_mfma_f32_16x16x4f32 v[0:3], a0, a1, v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc5,0xd3,0x00,0x03,0x0a,0xfc]
+0x00,0x13,0xc5,0xd3,0x00,0x03,0x0a,0xfc
+
+# GFX90A: v_mfma_f32_16x16x4f32 a[0:3], v0, v1, -2.0 ; encoding: [0x00,0x80,0xc5,0xd3,0x00,0x03,0xd6,0x03]
+0x00,0x80,0xc5,0xd3,0x00,0x03,0xd6,0x03
+
+# GFX90A: v_mfma_f32_16x16x4f32 v[0:3], v0, v1, -2.0 ; encoding: [0x00,0x00,0xc5,0xd3,0x00,0x03,0xd6,0x03]
+0x00,0x00,0xc5,0xd3,0x00,0x03,0xd6,0x03
+
+# GFX90A: v_mfma_f32_16x16x4f32 a[0:3], v0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc5,0xd3,0x00,0x03,0xd6,0xe3]
+0x00,0x93,0xc5,0xd3,0x00,0x03,0xd6,0xe3
+
+# GFX90A: v_mfma_f32_16x16x4f32 v[0:3], v0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc5,0xd3,0x00,0x03,0xd6,0xe3]
+0x00,0x13,0xc5,0xd3,0x00,0x03,0xd6,0xe3
+
+# GFX90A: v_mfma_f32_16x16x4f32 a[0:3], v0, a1, -2.0 ; encoding: [0x00,0x80,0xc5,0xd3,0x00,0x03,0xd6,0x13]
+0x00,0x80,0xc5,0xd3,0x00,0x03,0xd6,0x13
+
+# GFX90A: v_mfma_f32_16x16x4f32 v[0:3], v0, a1, -2.0 ; encoding: [0x00,0x00,0xc5,0xd3,0x00,0x03,0xd6,0x13]
+0x00,0x00,0xc5,0xd3,0x00,0x03,0xd6,0x13
+
+# GFX90A: v_mfma_f32_16x16x4f32 a[0:3], v0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc5,0xd3,0x00,0x03,0xd6,0xf3]
+0x00,0x93,0xc5,0xd3,0x00,0x03,0xd6,0xf3
+
+# GFX90A: v_mfma_f32_16x16x4f32 v[0:3], v0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc5,0xd3,0x00,0x03,0xd6,0xf3]
+0x00,0x13,0xc5,0xd3,0x00,0x03,0xd6,0xf3
+
+# GFX90A: v_mfma_f32_16x16x4f32 a[0:3], a0, v1, -2.0 ; encoding: [0x00,0x80,0xc5,0xd3,0x00,0x03,0xd6,0x0b]
+0x00,0x80,0xc5,0xd3,0x00,0x03,0xd6,0x0b
+
+# GFX90A: v_mfma_f32_16x16x4f32 v[0:3], a0, v1, -2.0 ; encoding: [0x00,0x00,0xc5,0xd3,0x00,0x03,0xd6,0x0b]
+0x00,0x00,0xc5,0xd3,0x00,0x03,0xd6,0x0b
+
+# GFX90A: v_mfma_f32_16x16x4f32 a[0:3], a0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc5,0xd3,0x00,0x03,0xd6,0xeb]
+0x00,0x93,0xc5,0xd3,0x00,0x03,0xd6,0xeb
+
+# GFX90A: v_mfma_f32_16x16x4f32 v[0:3], a0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc5,0xd3,0x00,0x03,0xd6,0xeb]
+0x00,0x13,0xc5,0xd3,0x00,0x03,0xd6,0xeb
+
+# GFX90A: v_mfma_f32_16x16x4f32 a[0:3], a0, a1, -2.0 ; encoding: [0x00,0x80,0xc5,0xd3,0x00,0x03,0xd6,0x1b]
+0x00,0x80,0xc5,0xd3,0x00,0x03,0xd6,0x1b
+
+# GFX90A: v_mfma_f32_16x16x4f32 v[0:3], a0, a1, -2.0 ; encoding: [0x00,0x00,0xc5,0xd3,0x00,0x03,0xd6,0x1b]
+0x00,0x00,0xc5,0xd3,0x00,0x03,0xd6,0x1b
+
+# GFX90A: v_mfma_f32_16x16x4f32 a[0:3], a0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc5,0xd3,0x00,0x03,0xd6,0xfb]
+0x00,0x93,0xc5,0xd3,0x00,0x03,0xd6,0xfb
+
+# GFX90A: v_mfma_f32_16x16x4f32 v[0:3], a0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc5,0xd3,0x00,0x03,0xd6,0xfb]
+0x00,0x13,0xc5,0xd3,0x00,0x03,0xd6,0xfb
+
+# GFX90A: v_mfma_f32_32x32x4f16 a[0:31], v[0:1], v[2:3], a[2:33] ; encoding: [0x00,0x80,0xc8,0xd3,0x00,0x05,0x0a,0x04]
+0x00,0x80,0xc8,0xd3,0x00,0x05,0x0a,0x04
+
+# GFX90A: v_mfma_f32_32x32x4f16 a[0:31], v[0:1], v[2:3], a[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc8,0xd3,0x00,0x05,0x0a,0xe4]
+0x00,0x93,0xc8,0xd3,0x00,0x05,0x0a,0xe4
+
+# GFX90A: v_mfma_f32_32x32x4f16 a[0:31], v[0:1], a[2:3], a[2:33] ; encoding: [0x00,0x80,0xc8,0xd3,0x00,0x05,0x0a,0x14]
+0x00,0x80,0xc8,0xd3,0x00,0x05,0x0a,0x14
+
+# GFX90A: v_mfma_f32_32x32x4f16 a[0:31], v[0:1], a[2:3], a[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc8,0xd3,0x00,0x05,0x0a,0xf4]
+0x00,0x93,0xc8,0xd3,0x00,0x05,0x0a,0xf4
+
+# GFX90A: v_mfma_f32_32x32x4f16 a[0:31], a[0:1], v[2:3], a[2:33] ; encoding: [0x00,0x80,0xc8,0xd3,0x00,0x05,0x0a,0x0c]
+0x00,0x80,0xc8,0xd3,0x00,0x05,0x0a,0x0c
+
+# GFX90A: v_mfma_f32_32x32x4f16 a[0:31], a[0:1], v[2:3], a[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc8,0xd3,0x00,0x05,0x0a,0xec]
+0x00,0x93,0xc8,0xd3,0x00,0x05,0x0a,0xec
+
+# GFX90A: v_mfma_f32_32x32x4f16 a[0:31], a[0:1], a[2:3], a[2:33] ; encoding: [0x00,0x80,0xc8,0xd3,0x00,0x05,0x0a,0x1c]
+0x00,0x80,0xc8,0xd3,0x00,0x05,0x0a,0x1c
+
+# GFX90A: v_mfma_f32_32x32x4f16 a[0:31], a[0:1], a[2:3], a[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc8,0xd3,0x00,0x05,0x0a,0xfc]
+0x00,0x93,0xc8,0xd3,0x00,0x05,0x0a,0xfc
+
+# GFX90A: v_mfma_f32_32x32x4f16 v[0:31], v[0:1], v[2:3], v[2:33] ; encoding: [0x00,0x00,0xc8,0xd3,0x00,0x05,0x0a,0x04]
+0x00,0x00,0xc8,0xd3,0x00,0x05,0x0a,0x04
+
+# GFX90A: v_mfma_f32_32x32x4f16 v[0:31], v[0:1], v[2:3], v[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc8,0xd3,0x00,0x05,0x0a,0xe4]
+0x00,0x13,0xc8,0xd3,0x00,0x05,0x0a,0xe4
+
+# GFX90A: v_mfma_f32_32x32x4f16 v[0:31], v[0:1], a[2:3], v[2:33] ; encoding: [0x00,0x00,0xc8,0xd3,0x00,0x05,0x0a,0x14]
+0x00,0x00,0xc8,0xd3,0x00,0x05,0x0a,0x14
+
+# GFX90A: v_mfma_f32_32x32x4f16 v[0:31], v[0:1], a[2:3], v[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc8,0xd3,0x00,0x05,0x0a,0xf4]
+0x00,0x13,0xc8,0xd3,0x00,0x05,0x0a,0xf4
+
+# GFX90A: v_mfma_f32_32x32x4f16 v[0:31], a[0:1], v[2:3], v[2:33] ; encoding: [0x00,0x00,0xc8,0xd3,0x00,0x05,0x0a,0x0c]
+0x00,0x00,0xc8,0xd3,0x00,0x05,0x0a,0x0c
+
+# GFX90A: v_mfma_f32_32x32x4f16 v[0:31], a[0:1], v[2:3], v[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc8,0xd3,0x00,0x05,0x0a,0xec]
+0x00,0x13,0xc8,0xd3,0x00,0x05,0x0a,0xec
+
+# GFX90A: v_mfma_f32_32x32x4f16 v[0:31], a[0:1], a[2:3], v[2:33] ; encoding: [0x00,0x00,0xc8,0xd3,0x00,0x05,0x0a,0x1c]
+0x00,0x00,0xc8,0xd3,0x00,0x05,0x0a,0x1c
+
+# GFX90A: v_mfma_f32_32x32x4f16 v[0:31], a[0:1], a[2:3], v[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc8,0xd3,0x00,0x05,0x0a,0xfc]
+0x00,0x13,0xc8,0xd3,0x00,0x05,0x0a,0xfc
+
+# GFX90A: v_mfma_f32_32x32x4f16 a[0:31], v[0:1], v[2:3], -2.0 ; encoding: [0x00,0x80,0xc8,0xd3,0x00,0x05,0xd6,0x03]
+0x00,0x80,0xc8,0xd3,0x00,0x05,0xd6,0x03
+
+# GFX90A: v_mfma_f32_32x32x4f16 v[0:31], v[0:1], v[2:3], -2.0 ; encoding: [0x00,0x00,0xc8,0xd3,0x00,0x05,0xd6,0x03]
+0x00,0x00,0xc8,0xd3,0x00,0x05,0xd6,0x03
+
+# GFX90A: v_mfma_f32_32x32x4f16 a[0:31], v[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc8,0xd3,0x00,0x05,0xd6,0xe3]
+0x00,0x93,0xc8,0xd3,0x00,0x05,0xd6,0xe3
+
+# GFX90A: v_mfma_f32_32x32x4f16 v[0:31], v[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc8,0xd3,0x00,0x05,0xd6,0xe3]
+0x00,0x13,0xc8,0xd3,0x00,0x05,0xd6,0xe3
+
+# GFX90A: v_mfma_f32_32x32x4f16 a[0:31], v[0:1], a[2:3], -2.0 ; encoding: [0x00,0x80,0xc8,0xd3,0x00,0x05,0xd6,0x13]
+0x00,0x80,0xc8,0xd3,0x00,0x05,0xd6,0x13
+
+# GFX90A: v_mfma_f32_32x32x4f16 v[0:31], v[0:1], a[2:3], -2.0 ; encoding: [0x00,0x00,0xc8,0xd3,0x00,0x05,0xd6,0x13]
+0x00,0x00,0xc8,0xd3,0x00,0x05,0xd6,0x13
+
+# GFX90A: v_mfma_f32_32x32x4f16 a[0:31], v[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc8,0xd3,0x00,0x05,0xd6,0xf3]
+0x00,0x93,0xc8,0xd3,0x00,0x05,0xd6,0xf3
+
+# GFX90A: v_mfma_f32_32x32x4f16 v[0:31], v[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc8,0xd3,0x00,0x05,0xd6,0xf3]
+0x00,0x13,0xc8,0xd3,0x00,0x05,0xd6,0xf3
+
+# GFX90A: v_mfma_f32_32x32x4f16 a[0:31], a[0:1], v[2:3], -2.0 ; encoding: [0x00,0x80,0xc8,0xd3,0x00,0x05,0xd6,0x0b]
+0x00,0x80,0xc8,0xd3,0x00,0x05,0xd6,0x0b
+
+# GFX90A: v_mfma_f32_32x32x4f16 v[0:31], a[0:1], v[2:3], -2.0 ; encoding: [0x00,0x00,0xc8,0xd3,0x00,0x05,0xd6,0x0b]
+0x00,0x00,0xc8,0xd3,0x00,0x05,0xd6,0x0b
+
+# GFX90A: v_mfma_f32_32x32x4f16 a[0:31], a[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc8,0xd3,0x00,0x05,0xd6,0xeb]
+0x00,0x93,0xc8,0xd3,0x00,0x05,0xd6,0xeb
+
+# GFX90A: v_mfma_f32_32x32x4f16 v[0:31], a[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc8,0xd3,0x00,0x05,0xd6,0xeb]
+0x00,0x13,0xc8,0xd3,0x00,0x05,0xd6,0xeb
+
+# GFX90A: v_mfma_f32_32x32x4f16 a[0:31], a[0:1], a[2:3], -2.0 ; encoding: [0x00,0x80,0xc8,0xd3,0x00,0x05,0xd6,0x1b]
+0x00,0x80,0xc8,0xd3,0x00,0x05,0xd6,0x1b
+
+# GFX90A: v_mfma_f32_32x32x4f16 v[0:31], a[0:1], a[2:3], -2.0 ; encoding: [0x00,0x00,0xc8,0xd3,0x00,0x05,0xd6,0x1b]
+0x00,0x00,0xc8,0xd3,0x00,0x05,0xd6,0x1b
+
+# GFX90A: v_mfma_f32_32x32x4f16 a[0:31], a[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc8,0xd3,0x00,0x05,0xd6,0xfb]
+0x00,0x93,0xc8,0xd3,0x00,0x05,0xd6,0xfb
+
+# GFX90A: v_mfma_f32_32x32x4f16 v[0:31], a[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc8,0xd3,0x00,0x05,0xd6,0xfb]
+0x00,0x13,0xc8,0xd3,0x00,0x05,0xd6,0xfb
+
+# GFX90A: v_mfma_f32_16x16x4f16 a[0:15], v[0:1], v[2:3], a[2:17] ; encoding: [0x00,0x80,0xc9,0xd3,0x00,0x05,0x0a,0x04]
+0x00,0x80,0xc9,0xd3,0x00,0x05,0x0a,0x04
+
+# GFX90A: v_mfma_f32_16x16x4f16 a[0:15], v[0:1], v[2:3], a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc9,0xd3,0x00,0x05,0x0a,0xe4]
+0x00,0x93,0xc9,0xd3,0x00,0x05,0x0a,0xe4
+
+# GFX90A: v_mfma_f32_16x16x4f16 a[0:15], v[0:1], a[2:3], a[2:17] ; encoding: [0x00,0x80,0xc9,0xd3,0x00,0x05,0x0a,0x14]
+0x00,0x80,0xc9,0xd3,0x00,0x05,0x0a,0x14
+
+# GFX90A: v_mfma_f32_16x16x4f16 a[0:15], v[0:1], a[2:3], a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc9,0xd3,0x00,0x05,0x0a,0xf4]
+0x00,0x93,0xc9,0xd3,0x00,0x05,0x0a,0xf4
+
+# GFX90A: v_mfma_f32_16x16x4f16 a[0:15], a[0:1], v[2:3], a[2:17] ; encoding: [0x00,0x80,0xc9,0xd3,0x00,0x05,0x0a,0x0c]
+0x00,0x80,0xc9,0xd3,0x00,0x05,0x0a,0x0c
+
+# GFX90A: v_mfma_f32_16x16x4f16 a[0:15], a[0:1], v[2:3], a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc9,0xd3,0x00,0x05,0x0a,0xec]
+0x00,0x93,0xc9,0xd3,0x00,0x05,0x0a,0xec
+
+# GFX90A: v_mfma_f32_16x16x4f16 a[0:15], a[0:1], a[2:3], a[2:17] ; encoding: [0x00,0x80,0xc9,0xd3,0x00,0x05,0x0a,0x1c]
+0x00,0x80,0xc9,0xd3,0x00,0x05,0x0a,0x1c
+
+# GFX90A: v_mfma_f32_16x16x4f16 a[0:15], a[0:1], a[2:3], a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc9,0xd3,0x00,0x05,0x0a,0xfc]
+0x00,0x93,0xc9,0xd3,0x00,0x05,0x0a,0xfc
+
+# GFX90A: v_mfma_f32_16x16x4f16 v[0:15], v[0:1], v[2:3], v[2:17] ; encoding: [0x00,0x00,0xc9,0xd3,0x00,0x05,0x0a,0x04]
+0x00,0x00,0xc9,0xd3,0x00,0x05,0x0a,0x04
+
+# GFX90A: v_mfma_f32_16x16x4f16 v[0:15], v[0:1], v[2:3], v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc9,0xd3,0x00,0x05,0x0a,0xe4]
+0x00,0x13,0xc9,0xd3,0x00,0x05,0x0a,0xe4
+
+# GFX90A: v_mfma_f32_16x16x4f16 v[0:15], v[0:1], a[2:3], v[2:17] ; encoding: [0x00,0x00,0xc9,0xd3,0x00,0x05,0x0a,0x14]
+0x00,0x00,0xc9,0xd3,0x00,0x05,0x0a,0x14
+
+# GFX90A: v_mfma_f32_16x16x4f16 v[0:15], v[0:1], a[2:3], v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc9,0xd3,0x00,0x05,0x0a,0xf4]
+0x00,0x13,0xc9,0xd3,0x00,0x05,0x0a,0xf4
+
+# GFX90A: v_mfma_f32_16x16x4f16 v[0:15], a[0:1], v[2:3], v[2:17] ; encoding: [0x00,0x00,0xc9,0xd3,0x00,0x05,0x0a,0x0c]
+0x00,0x00,0xc9,0xd3,0x00,0x05,0x0a,0x0c
+
+# GFX90A: v_mfma_f32_16x16x4f16 v[0:15], a[0:1], v[2:3], v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc9,0xd3,0x00,0x05,0x0a,0xec]
+0x00,0x13,0xc9,0xd3,0x00,0x05,0x0a,0xec
+
+# GFX90A: v_mfma_f32_16x16x4f16 v[0:15], a[0:1], a[2:3], v[2:17] ; encoding: [0x00,0x00,0xc9,0xd3,0x00,0x05,0x0a,0x1c]
+0x00,0x00,0xc9,0xd3,0x00,0x05,0x0a,0x1c
+
+# GFX90A: v_mfma_f32_16x16x4f16 v[0:15], a[0:1], a[2:3], v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc9,0xd3,0x00,0x05,0x0a,0xfc]
+0x00,0x13,0xc9,0xd3,0x00,0x05,0x0a,0xfc
+
+# GFX90A: v_mfma_f32_16x16x4f16 a[0:15], v[0:1], v[2:3], -2.0 ; encoding: [0x00,0x80,0xc9,0xd3,0x00,0x05,0xd6,0x03]
+0x00,0x80,0xc9,0xd3,0x00,0x05,0xd6,0x03
+
+# GFX90A: v_mfma_f32_16x16x4f16 v[0:15], v[0:1], v[2:3], -2.0 ; encoding: [0x00,0x00,0xc9,0xd3,0x00,0x05,0xd6,0x03]
+0x00,0x00,0xc9,0xd3,0x00,0x05,0xd6,0x03
+
+# GFX90A: v_mfma_f32_16x16x4f16 a[0:15], v[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc9,0xd3,0x00,0x05,0xd6,0xe3]
+0x00,0x93,0xc9,0xd3,0x00,0x05,0xd6,0xe3
+
+# GFX90A: v_mfma_f32_16x16x4f16 v[0:15], v[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc9,0xd3,0x00,0x05,0xd6,0xe3]
+0x00,0x13,0xc9,0xd3,0x00,0x05,0xd6,0xe3
+
+# GFX90A: v_mfma_f32_16x16x4f16 a[0:15], v[0:1], a[2:3], -2.0 ; encoding: [0x00,0x80,0xc9,0xd3,0x00,0x05,0xd6,0x13]
+0x00,0x80,0xc9,0xd3,0x00,0x05,0xd6,0x13
+
+# GFX90A: v_mfma_f32_16x16x4f16 v[0:15], v[0:1], a[2:3], -2.0 ; encoding: [0x00,0x00,0xc9,0xd3,0x00,0x05,0xd6,0x13]
+0x00,0x00,0xc9,0xd3,0x00,0x05,0xd6,0x13
+
+# GFX90A: v_mfma_f32_16x16x4f16 a[0:15], v[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc9,0xd3,0x00,0x05,0xd6,0xf3]
+0x00,0x93,0xc9,0xd3,0x00,0x05,0xd6,0xf3
+
+# GFX90A: v_mfma_f32_16x16x4f16 v[0:15], v[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc9,0xd3,0x00,0x05,0xd6,0xf3]
+0x00,0x13,0xc9,0xd3,0x00,0x05,0xd6,0xf3
+
+# GFX90A: v_mfma_f32_16x16x4f16 a[0:15], a[0:1], v[2:3], -2.0 ; encoding: [0x00,0x80,0xc9,0xd3,0x00,0x05,0xd6,0x0b]
+0x00,0x80,0xc9,0xd3,0x00,0x05,0xd6,0x0b
+
+# GFX90A: v_mfma_f32_16x16x4f16 v[0:15], a[0:1], v[2:3], -2.0 ; encoding: [0x00,0x00,0xc9,0xd3,0x00,0x05,0xd6,0x0b]
+0x00,0x00,0xc9,0xd3,0x00,0x05,0xd6,0x0b
+
+# GFX90A: v_mfma_f32_16x16x4f16 a[0:15], a[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc9,0xd3,0x00,0x05,0xd6,0xeb]
+0x00,0x93,0xc9,0xd3,0x00,0x05,0xd6,0xeb
+
+# GFX90A: v_mfma_f32_16x16x4f16 v[0:15], a[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc9,0xd3,0x00,0x05,0xd6,0xeb]
+0x00,0x13,0xc9,0xd3,0x00,0x05,0xd6,0xeb
+
+# GFX90A: v_mfma_f32_16x16x4f16 a[0:15], a[0:1], a[2:3], -2.0 ; encoding: [0x00,0x80,0xc9,0xd3,0x00,0x05,0xd6,0x1b]
+0x00,0x80,0xc9,0xd3,0x00,0x05,0xd6,0x1b
+
+# GFX90A: v_mfma_f32_16x16x4f16 v[0:15], a[0:1], a[2:3], -2.0 ; encoding: [0x00,0x00,0xc9,0xd3,0x00,0x05,0xd6,0x1b]
+0x00,0x00,0xc9,0xd3,0x00,0x05,0xd6,0x1b
+
+# GFX90A: v_mfma_f32_16x16x4f16 a[0:15], a[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc9,0xd3,0x00,0x05,0xd6,0xfb]
+0x00,0x93,0xc9,0xd3,0x00,0x05,0xd6,0xfb
+
+# GFX90A: v_mfma_f32_16x16x4f16 v[0:15], a[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc9,0xd3,0x00,0x05,0xd6,0xfb]
+0x00,0x13,0xc9,0xd3,0x00,0x05,0xd6,0xfb
+
+# GFX90A: v_mfma_f32_4x4x4f16 a[0:3], v[0:1], v[2:3], a[2:5] ; encoding: [0x00,0x80,0xca,0xd3,0x00,0x05,0x0a,0x04]
+0x00,0x80,0xca,0xd3,0x00,0x05,0x0a,0x04
+
+# GFX90A: v_mfma_f32_4x4x4f16 a[0:3], v[0:1], v[2:3], a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xca,0xd3,0x00,0x05,0x0a,0xe4]
+0x00,0x93,0xca,0xd3,0x00,0x05,0x0a,0xe4
+
+# GFX90A: v_mfma_f32_4x4x4f16 a[0:3], v[0:1], a[2:3], a[2:5] ; encoding: [0x00,0x80,0xca,0xd3,0x00,0x05,0x0a,0x14]
+0x00,0x80,0xca,0xd3,0x00,0x05,0x0a,0x14
+
+# GFX90A: v_mfma_f32_4x4x4f16 a[0:3], v[0:1], a[2:3], a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xca,0xd3,0x00,0x05,0x0a,0xf4]
+0x00,0x93,0xca,0xd3,0x00,0x05,0x0a,0xf4
+
+# GFX90A: v_mfma_f32_4x4x4f16 a[0:3], a[0:1], v[2:3], a[2:5] ; encoding: [0x00,0x80,0xca,0xd3,0x00,0x05,0x0a,0x0c]
+0x00,0x80,0xca,0xd3,0x00,0x05,0x0a,0x0c
+
+# GFX90A: v_mfma_f32_4x4x4f16 a[0:3], a[0:1], v[2:3], a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xca,0xd3,0x00,0x05,0x0a,0xec]
+0x00,0x93,0xca,0xd3,0x00,0x05,0x0a,0xec
+
+# GFX90A: v_mfma_f32_4x4x4f16 a[0:3], a[0:1], a[2:3], a[2:5] ; encoding: [0x00,0x80,0xca,0xd3,0x00,0x05,0x0a,0x1c]
+0x00,0x80,0xca,0xd3,0x00,0x05,0x0a,0x1c
+
+# GFX90A: v_mfma_f32_4x4x4f16 a[0:3], a[0:1], a[2:3], a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xca,0xd3,0x00,0x05,0x0a,0xfc]
+0x00,0x93,0xca,0xd3,0x00,0x05,0x0a,0xfc
+
+# GFX90A: v_mfma_f32_4x4x4f16 v[0:3], v[0:1], v[2:3], v[2:5] ; encoding: [0x00,0x00,0xca,0xd3,0x00,0x05,0x0a,0x04]
+0x00,0x00,0xca,0xd3,0x00,0x05,0x0a,0x04
+
+# GFX90A: v_mfma_f32_4x4x4f16 v[0:3], v[0:1], v[2:3], v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xca,0xd3,0x00,0x05,0x0a,0xe4]
+0x00,0x13,0xca,0xd3,0x00,0x05,0x0a,0xe4
+
+# GFX90A: v_mfma_f32_4x4x4f16 v[0:3], v[0:1], a[2:3], v[2:5] ; encoding: [0x00,0x00,0xca,0xd3,0x00,0x05,0x0a,0x14]
+0x00,0x00,0xca,0xd3,0x00,0x05,0x0a,0x14
+
+# GFX90A: v_mfma_f32_4x4x4f16 v[0:3], v[0:1], a[2:3], v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xca,0xd3,0x00,0x05,0x0a,0xf4]
+0x00,0x13,0xca,0xd3,0x00,0x05,0x0a,0xf4
+
+# GFX90A: v_mfma_f32_4x4x4f16 v[0:3], a[0:1], v[2:3], v[2:5] ; encoding: [0x00,0x00,0xca,0xd3,0x00,0x05,0x0a,0x0c]
+0x00,0x00,0xca,0xd3,0x00,0x05,0x0a,0x0c
+
+# GFX90A: v_mfma_f32_4x4x4f16 v[0:3], a[0:1], v[2:3], v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xca,0xd3,0x00,0x05,0x0a,0xec]
+0x00,0x13,0xca,0xd3,0x00,0x05,0x0a,0xec
+
+# GFX90A: v_mfma_f32_4x4x4f16 v[0:3], a[0:1], a[2:3], v[2:5] ; encoding: [0x00,0x00,0xca,0xd3,0x00,0x05,0x0a,0x1c]
+0x00,0x00,0xca,0xd3,0x00,0x05,0x0a,0x1c
+
+# GFX90A: v_mfma_f32_4x4x4f16 v[0:3], a[0:1], a[2:3], v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xca,0xd3,0x00,0x05,0x0a,0xfc]
+0x00,0x13,0xca,0xd3,0x00,0x05,0x0a,0xfc
+
+# GFX90A: v_mfma_f32_4x4x4f16 a[0:3], v[0:1], v[2:3], -2.0 ; encoding: [0x00,0x80,0xca,0xd3,0x00,0x05,0xd6,0x03]
+0x00,0x80,0xca,0xd3,0x00,0x05,0xd6,0x03
+
+# GFX90A: v_mfma_f32_4x4x4f16 v[0:3], v[0:1], v[2:3], -2.0 ; encoding: [0x00,0x00,0xca,0xd3,0x00,0x05,0xd6,0x03]
+0x00,0x00,0xca,0xd3,0x00,0x05,0xd6,0x03
+
+# GFX90A: v_mfma_f32_4x4x4f16 a[0:3], v[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xca,0xd3,0x00,0x05,0xd6,0xe3]
+0x00,0x93,0xca,0xd3,0x00,0x05,0xd6,0xe3
+
+# GFX90A: v_mfma_f32_4x4x4f16 v[0:3], v[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xca,0xd3,0x00,0x05,0xd6,0xe3]
+0x00,0x13,0xca,0xd3,0x00,0x05,0xd6,0xe3
+
+# GFX90A: v_mfma_f32_4x4x4f16 a[0:3], v[0:1], a[2:3], -2.0 ; encoding: [0x00,0x80,0xca,0xd3,0x00,0x05,0xd6,0x13]
+0x00,0x80,0xca,0xd3,0x00,0x05,0xd6,0x13
+
+# GFX90A: v_mfma_f32_4x4x4f16 v[0:3], v[0:1], a[2:3], -2.0 ; encoding: [0x00,0x00,0xca,0xd3,0x00,0x05,0xd6,0x13]
+0x00,0x00,0xca,0xd3,0x00,0x05,0xd6,0x13
+
+# GFX90A: v_mfma_f32_4x4x4f16 a[0:3], v[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xca,0xd3,0x00,0x05,0xd6,0xf3]
+0x00,0x93,0xca,0xd3,0x00,0x05,0xd6,0xf3
+
+# GFX90A: v_mfma_f32_4x4x4f16 v[0:3], v[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xca,0xd3,0x00,0x05,0xd6,0xf3]
+0x00,0x13,0xca,0xd3,0x00,0x05,0xd6,0xf3
+
+# GFX90A: v_mfma_f32_4x4x4f16 a[0:3], a[0:1], v[2:3], -2.0 ; encoding: [0x00,0x80,0xca,0xd3,0x00,0x05,0xd6,0x0b]
+0x00,0x80,0xca,0xd3,0x00,0x05,0xd6,0x0b
+
+# GFX90A: v_mfma_f32_4x4x4f16 v[0:3], a[0:1], v[2:3], -2.0 ; encoding: [0x00,0x00,0xca,0xd3,0x00,0x05,0xd6,0x0b]
+0x00,0x00,0xca,0xd3,0x00,0x05,0xd6,0x0b
+
+# GFX90A: v_mfma_f32_4x4x4f16 a[0:3], a[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xca,0xd3,0x00,0x05,0xd6,0xeb]
+0x00,0x93,0xca,0xd3,0x00,0x05,0xd6,0xeb
+
+# GFX90A: v_mfma_f32_4x4x4f16 v[0:3], a[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xca,0xd3,0x00,0x05,0xd6,0xeb]
+0x00,0x13,0xca,0xd3,0x00,0x05,0xd6,0xeb
+
+# GFX90A: v_mfma_f32_4x4x4f16 a[0:3], a[0:1], a[2:3], -2.0 ; encoding: [0x00,0x80,0xca,0xd3,0x00,0x05,0xd6,0x1b]
+0x00,0x80,0xca,0xd3,0x00,0x05,0xd6,0x1b
+
+# GFX90A: v_mfma_f32_4x4x4f16 v[0:3], a[0:1], a[2:3], -2.0 ; encoding: [0x00,0x00,0xca,0xd3,0x00,0x05,0xd6,0x1b]
+0x00,0x00,0xca,0xd3,0x00,0x05,0xd6,0x1b
+
+# GFX90A: v_mfma_f32_4x4x4f16 a[0:3], a[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xca,0xd3,0x00,0x05,0xd6,0xfb]
+0x00,0x93,0xca,0xd3,0x00,0x05,0xd6,0xfb
+
+# GFX90A: v_mfma_f32_4x4x4f16 v[0:3], a[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xca,0xd3,0x00,0x05,0xd6,0xfb]
+0x00,0x13,0xca,0xd3,0x00,0x05,0xd6,0xfb
+
+# GFX90A: v_mfma_f32_32x32x8f16 a[0:15], v[0:1], v[2:3], a[2:17] ; encoding: [0x00,0x80,0xcc,0xd3,0x00,0x05,0x0a,0x04]
+0x00,0x80,0xcc,0xd3,0x00,0x05,0x0a,0x04
+
+# GFX90A: v_mfma_f32_32x32x8f16 a[0:15], v[0:1], v[2:3], a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xcc,0xd3,0x00,0x05,0x0a,0xe4]
+0x00,0x93,0xcc,0xd3,0x00,0x05,0x0a,0xe4
+
+# GFX90A: v_mfma_f32_32x32x8f16 a[0:15], v[0:1], a[2:3], a[2:17] ; encoding: [0x00,0x80,0xcc,0xd3,0x00,0x05,0x0a,0x14]
+0x00,0x80,0xcc,0xd3,0x00,0x05,0x0a,0x14
+
+# GFX90A: v_mfma_f32_32x32x8f16 a[0:15], v[0:1], a[2:3], a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xcc,0xd3,0x00,0x05,0x0a,0xf4]
+0x00,0x93,0xcc,0xd3,0x00,0x05,0x0a,0xf4
+
+# GFX90A: v_mfma_f32_32x32x8f16 a[0:15], a[0:1], v[2:3], a[2:17] ; encoding: [0x00,0x80,0xcc,0xd3,0x00,0x05,0x0a,0x0c]
+0x00,0x80,0xcc,0xd3,0x00,0x05,0x0a,0x0c
+
+# GFX90A: v_mfma_f32_32x32x8f16 a[0:15], a[0:1], v[2:3], a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xcc,0xd3,0x00,0x05,0x0a,0xec]
+0x00,0x93,0xcc,0xd3,0x00,0x05,0x0a,0xec
+
+# GFX90A: v_mfma_f32_32x32x8f16 a[0:15], a[0:1], a[2:3], a[2:17] ; encoding: [0x00,0x80,0xcc,0xd3,0x00,0x05,0x0a,0x1c]
+0x00,0x80,0xcc,0xd3,0x00,0x05,0x0a,0x1c
+
+# GFX90A: v_mfma_f32_32x32x8f16 a[0:15], a[0:1], a[2:3], a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xcc,0xd3,0x00,0x05,0x0a,0xfc]
+0x00,0x93,0xcc,0xd3,0x00,0x05,0x0a,0xfc
+
+# GFX90A: v_mfma_f32_32x32x8f16 v[0:15], v[0:1], v[2:3], v[2:17] ; encoding: [0x00,0x00,0xcc,0xd3,0x00,0x05,0x0a,0x04]
+0x00,0x00,0xcc,0xd3,0x00,0x05,0x0a,0x04
+
+# GFX90A: v_mfma_f32_32x32x8f16 v[0:15], v[0:1], v[2:3], v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xcc,0xd3,0x00,0x05,0x0a,0xe4]
+0x00,0x13,0xcc,0xd3,0x00,0x05,0x0a,0xe4
+
+# GFX90A: v_mfma_f32_32x32x8f16 v[0:15], v[0:1], a[2:3], v[2:17] ; encoding: [0x00,0x00,0xcc,0xd3,0x00,0x05,0x0a,0x14]
+0x00,0x00,0xcc,0xd3,0x00,0x05,0x0a,0x14
+
+# GFX90A: v_mfma_f32_32x32x8f16 v[0:15], v[0:1], a[2:3], v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xcc,0xd3,0x00,0x05,0x0a,0xf4]
+0x00,0x13,0xcc,0xd3,0x00,0x05,0x0a,0xf4
+
+# GFX90A: v_mfma_f32_32x32x8f16 v[0:15], a[0:1], v[2:3], v[2:17] ; encoding: [0x00,0x00,0xcc,0xd3,0x00,0x05,0x0a,0x0c]
+0x00,0x00,0xcc,0xd3,0x00,0x05,0x0a,0x0c
+
+# GFX90A: v_mfma_f32_32x32x8f16 v[0:15], a[0:1], v[2:3], v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xcc,0xd3,0x00,0x05,0x0a,0xec]
+0x00,0x13,0xcc,0xd3,0x00,0x05,0x0a,0xec
+
+# GFX90A: v_mfma_f32_32x32x8f16 v[0:15], a[0:1], a[2:3], v[2:17] ; encoding: [0x00,0x00,0xcc,0xd3,0x00,0x05,0x0a,0x1c]
+0x00,0x00,0xcc,0xd3,0x00,0x05,0x0a,0x1c
+
+# GFX90A: v_mfma_f32_32x32x8f16 v[0:15], a[0:1], a[2:3], v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xcc,0xd3,0x00,0x05,0x0a,0xfc]
+0x00,0x13,0xcc,0xd3,0x00,0x05,0x0a,0xfc
+
+# GFX90A: v_mfma_f32_32x32x8f16 a[0:15], v[0:1], v[2:3], -2.0 ; encoding: [0x00,0x80,0xcc,0xd3,0x00,0x05,0xd6,0x03]
+0x00,0x80,0xcc,0xd3,0x00,0x05,0xd6,0x03
+
+# GFX90A: v_mfma_f32_32x32x8f16 v[0:15], v[0:1], v[2:3], -2.0 ; encoding: [0x00,0x00,0xcc,0xd3,0x00,0x05,0xd6,0x03]
+0x00,0x00,0xcc,0xd3,0x00,0x05,0xd6,0x03
+
+# GFX90A: v_mfma_f32_32x32x8f16 a[0:15], v[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xcc,0xd3,0x00,0x05,0xd6,0xe3]
+0x00,0x93,0xcc,0xd3,0x00,0x05,0xd6,0xe3
+
+# GFX90A: v_mfma_f32_32x32x8f16 v[0:15], v[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xcc,0xd3,0x00,0x05,0xd6,0xe3]
+0x00,0x13,0xcc,0xd3,0x00,0x05,0xd6,0xe3
+
+# GFX90A: v_mfma_f32_32x32x8f16 a[0:15], v[0:1], a[2:3], -2.0 ; encoding: [0x00,0x80,0xcc,0xd3,0x00,0x05,0xd6,0x13]
+0x00,0x80,0xcc,0xd3,0x00,0x05,0xd6,0x13
+
+# GFX90A: v_mfma_f32_32x32x8f16 v[0:15], v[0:1], a[2:3], -2.0 ; encoding: [0x00,0x00,0xcc,0xd3,0x00,0x05,0xd6,0x13]
+0x00,0x00,0xcc,0xd3,0x00,0x05,0xd6,0x13
+
+# GFX90A: v_mfma_f32_32x32x8f16 a[0:15], v[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xcc,0xd3,0x00,0x05,0xd6,0xf3]
+0x00,0x93,0xcc,0xd3,0x00,0x05,0xd6,0xf3
+
+# GFX90A: v_mfma_f32_32x32x8f16 v[0:15], v[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xcc,0xd3,0x00,0x05,0xd6,0xf3]
+0x00,0x13,0xcc,0xd3,0x00,0x05,0xd6,0xf3
+
+# GFX90A: v_mfma_f32_32x32x8f16 a[0:15], a[0:1], v[2:3], -2.0 ; encoding: [0x00,0x80,0xcc,0xd3,0x00,0x05,0xd6,0x0b]
+0x00,0x80,0xcc,0xd3,0x00,0x05,0xd6,0x0b
+
+# GFX90A: v_mfma_f32_32x32x8f16 v[0:15], a[0:1], v[2:3], -2.0 ; encoding: [0x00,0x00,0xcc,0xd3,0x00,0x05,0xd6,0x0b]
+0x00,0x00,0xcc,0xd3,0x00,0x05,0xd6,0x0b
+
+# GFX90A: v_mfma_f32_32x32x8f16 a[0:15], a[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xcc,0xd3,0x00,0x05,0xd6,0xeb]
+0x00,0x93,0xcc,0xd3,0x00,0x05,0xd6,0xeb
+
+# GFX90A: v_mfma_f32_32x32x8f16 v[0:15], a[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xcc,0xd3,0x00,0x05,0xd6,0xeb]
+0x00,0x13,0xcc,0xd3,0x00,0x05,0xd6,0xeb
+
+# GFX90A: v_mfma_f32_32x32x8f16 a[0:15], a[0:1], a[2:3], -2.0 ; encoding: [0x00,0x80,0xcc,0xd3,0x00,0x05,0xd6,0x1b]
+0x00,0x80,0xcc,0xd3,0x00,0x05,0xd6,0x1b
+
+# GFX90A: v_mfma_f32_32x32x8f16 v[0:15], a[0:1], a[2:3], -2.0 ; encoding: [0x00,0x00,0xcc,0xd3,0x00,0x05,0xd6,0x1b]
+0x00,0x00,0xcc,0xd3,0x00,0x05,0xd6,0x1b
+
+# GFX90A: v_mfma_f32_32x32x8f16 a[0:15], a[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xcc,0xd3,0x00,0x05,0xd6,0xfb]
+0x00,0x93,0xcc,0xd3,0x00,0x05,0xd6,0xfb
+
+# GFX90A: v_mfma_f32_32x32x8f16 v[0:15], a[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xcc,0xd3,0x00,0x05,0xd6,0xfb]
+0x00,0x13,0xcc,0xd3,0x00,0x05,0xd6,0xfb
+
+# GFX90A: v_mfma_f32_16x16x16f16 a[0:3], v[0:1], v[2:3], a[2:5] ; encoding: [0x00,0x80,0xcd,0xd3,0x00,0x05,0x0a,0x04]
+0x00,0x80,0xcd,0xd3,0x00,0x05,0x0a,0x04
+
+# GFX90A: v_mfma_f32_16x16x16f16 a[0:3], v[0:1], v[2:3], a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xcd,0xd3,0x00,0x05,0x0a,0xe4]
+0x00,0x93,0xcd,0xd3,0x00,0x05,0x0a,0xe4
+
+# GFX90A: v_mfma_f32_16x16x16f16 a[0:3], v[0:1], a[2:3], a[2:5] ; encoding: [0x00,0x80,0xcd,0xd3,0x00,0x05,0x0a,0x14]
+0x00,0x80,0xcd,0xd3,0x00,0x05,0x0a,0x14
+
+# GFX90A: v_mfma_f32_16x16x16f16 a[0:3], v[0:1], a[2:3], a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xcd,0xd3,0x00,0x05,0x0a,0xf4]
+0x00,0x93,0xcd,0xd3,0x00,0x05,0x0a,0xf4
+
+# GFX90A: v_mfma_f32_16x16x16f16 a[0:3], a[0:1], v[2:3], a[2:5] ; encoding: [0x00,0x80,0xcd,0xd3,0x00,0x05,0x0a,0x0c]
+0x00,0x80,0xcd,0xd3,0x00,0x05,0x0a,0x0c
+
+# GFX90A: v_mfma_f32_16x16x16f16 a[0:3], a[0:1], v[2:3], a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xcd,0xd3,0x00,0x05,0x0a,0xec]
+0x00,0x93,0xcd,0xd3,0x00,0x05,0x0a,0xec
+
+# GFX90A: v_mfma_f32_16x16x16f16 a[0:3], a[0:1], a[2:3], a[2:5] ; encoding: [0x00,0x80,0xcd,0xd3,0x00,0x05,0x0a,0x1c]
+0x00,0x80,0xcd,0xd3,0x00,0x05,0x0a,0x1c
+
+# GFX90A: v_mfma_f32_16x16x16f16 a[0:3], a[0:1], a[2:3], a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xcd,0xd3,0x00,0x05,0x0a,0xfc]
+0x00,0x93,0xcd,0xd3,0x00,0x05,0x0a,0xfc
+
+# GFX90A: v_mfma_f32_16x16x16f16 v[0:3], v[0:1], v[2:3], v[2:5] ; encoding: [0x00,0x00,0xcd,0xd3,0x00,0x05,0x0a,0x04]
+0x00,0x00,0xcd,0xd3,0x00,0x05,0x0a,0x04
+
+# GFX90A: v_mfma_f32_16x16x16f16 v[0:3], v[0:1], v[2:3], v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xcd,0xd3,0x00,0x05,0x0a,0xe4]
+0x00,0x13,0xcd,0xd3,0x00,0x05,0x0a,0xe4
+
+# GFX90A: v_mfma_f32_16x16x16f16 v[0:3], v[0:1], a[2:3], v[2:5] ; encoding: [0x00,0x00,0xcd,0xd3,0x00,0x05,0x0a,0x14]
+0x00,0x00,0xcd,0xd3,0x00,0x05,0x0a,0x14
+
+# GFX90A: v_mfma_f32_16x16x16f16 v[0:3], v[0:1], a[2:3], v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xcd,0xd3,0x00,0x05,0x0a,0xf4]
+0x00,0x13,0xcd,0xd3,0x00,0x05,0x0a,0xf4
+
+# GFX90A: v_mfma_f32_16x16x16f16 v[0:3], a[0:1], v[2:3], v[2:5] ; encoding: [0x00,0x00,0xcd,0xd3,0x00,0x05,0x0a,0x0c]
+0x00,0x00,0xcd,0xd3,0x00,0x05,0x0a,0x0c
+
+# GFX90A: v_mfma_f32_16x16x16f16 v[0:3], a[0:1], v[2:3], v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xcd,0xd3,0x00,0x05,0x0a,0xec]
+0x00,0x13,0xcd,0xd3,0x00,0x05,0x0a,0xec
+
+# GFX90A: v_mfma_f32_16x16x16f16 v[0:3], a[0:1], a[2:3], v[2:5] ; encoding: [0x00,0x00,0xcd,0xd3,0x00,0x05,0x0a,0x1c]
+0x00,0x00,0xcd,0xd3,0x00,0x05,0x0a,0x1c
+
+# GFX90A: v_mfma_f32_16x16x16f16 v[0:3], a[0:1], a[2:3], v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xcd,0xd3,0x00,0x05,0x0a,0xfc]
+0x00,0x13,0xcd,0xd3,0x00,0x05,0x0a,0xfc
+
+# GFX90A: v_mfma_f32_16x16x16f16 a[0:3], v[0:1], v[2:3], -2.0 ; encoding: [0x00,0x80,0xcd,0xd3,0x00,0x05,0xd6,0x03]
+0x00,0x80,0xcd,0xd3,0x00,0x05,0xd6,0x03
+
+# GFX90A: v_mfma_f32_16x16x16f16 v[0:3], v[0:1], v[2:3], -2.0 ; encoding: [0x00,0x00,0xcd,0xd3,0x00,0x05,0xd6,0x03]
+0x00,0x00,0xcd,0xd3,0x00,0x05,0xd6,0x03
+
+# GFX90A: v_mfma_f32_16x16x16f16 a[0:3], v[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xcd,0xd3,0x00,0x05,0xd6,0xe3]
+0x00,0x93,0xcd,0xd3,0x00,0x05,0xd6,0xe3
+
+# GFX90A: v_mfma_f32_16x16x16f16 v[0:3], v[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xcd,0xd3,0x00,0x05,0xd6,0xe3]
+0x00,0x13,0xcd,0xd3,0x00,0x05,0xd6,0xe3
+
+# GFX90A: v_mfma_f32_16x16x16f16 a[0:3], v[0:1], a[2:3], -2.0 ; encoding: [0x00,0x80,0xcd,0xd3,0x00,0x05,0xd6,0x13]
+0x00,0x80,0xcd,0xd3,0x00,0x05,0xd6,0x13
+
+# GFX90A: v_mfma_f32_16x16x16f16 v[0:3], v[0:1], a[2:3], -2.0 ; encoding: [0x00,0x00,0xcd,0xd3,0x00,0x05,0xd6,0x13]
+0x00,0x00,0xcd,0xd3,0x00,0x05,0xd6,0x13
+
+# GFX90A: v_mfma_f32_16x16x16f16 a[0:3], v[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xcd,0xd3,0x00,0x05,0xd6,0xf3]
+0x00,0x93,0xcd,0xd3,0x00,0x05,0xd6,0xf3
+
+# GFX90A: v_mfma_f32_16x16x16f16 v[0:3], v[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xcd,0xd3,0x00,0x05,0xd6,0xf3]
+0x00,0x13,0xcd,0xd3,0x00,0x05,0xd6,0xf3
+
+# GFX90A: v_mfma_f32_16x16x16f16 a[0:3], a[0:1], v[2:3], -2.0 ; encoding: [0x00,0x80,0xcd,0xd3,0x00,0x05,0xd6,0x0b]
+0x00,0x80,0xcd,0xd3,0x00,0x05,0xd6,0x0b
+
+# GFX90A: v_mfma_f32_16x16x16f16 v[0:3], a[0:1], v[2:3], -2.0 ; encoding: [0x00,0x00,0xcd,0xd3,0x00,0x05,0xd6,0x0b]
+0x00,0x00,0xcd,0xd3,0x00,0x05,0xd6,0x0b
+
+# GFX90A: v_mfma_f32_16x16x16f16 a[0:3], a[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xcd,0xd3,0x00,0x05,0xd6,0xeb]
+0x00,0x93,0xcd,0xd3,0x00,0x05,0xd6,0xeb
+
+# GFX90A: v_mfma_f32_16x16x16f16 v[0:3], a[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xcd,0xd3,0x00,0x05,0xd6,0xeb]
+0x00,0x13,0xcd,0xd3,0x00,0x05,0xd6,0xeb
+
+# GFX90A: v_mfma_f32_16x16x16f16 a[0:3], a[0:1], a[2:3], -2.0 ; encoding: [0x00,0x80,0xcd,0xd3,0x00,0x05,0xd6,0x1b]
+0x00,0x80,0xcd,0xd3,0x00,0x05,0xd6,0x1b
+
+# GFX90A: v_mfma_f32_16x16x16f16 v[0:3], a[0:1], a[2:3], -2.0 ; encoding: [0x00,0x00,0xcd,0xd3,0x00,0x05,0xd6,0x1b]
+0x00,0x00,0xcd,0xd3,0x00,0x05,0xd6,0x1b
+
+# GFX90A: v_mfma_f32_16x16x16f16 a[0:3], a[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xcd,0xd3,0x00,0x05,0xd6,0xfb]
+0x00,0x93,0xcd,0xd3,0x00,0x05,0xd6,0xfb
+
+# GFX90A: v_mfma_f32_16x16x16f16 v[0:3], a[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xcd,0xd3,0x00,0x05,0xd6,0xfb]
+0x00,0x13,0xcd,0xd3,0x00,0x05,0xd6,0xfb
+
+# GFX90A: v_mfma_i32_32x32x4i8 a[0:31], v0, v1, a[2:33] ; encoding: [0x00,0x80,0xd0,0xd3,0x00,0x03,0x0a,0x04]
+0x00,0x80,0xd0,0xd3,0x00,0x03,0x0a,0x04
+
+# GFX90A: v_mfma_i32_32x32x4i8 a[0:31], v0, v1, a[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd0,0xd3,0x00,0x03,0x0a,0xe4]
+0x00,0x93,0xd0,0xd3,0x00,0x03,0x0a,0xe4
+
+# GFX90A: v_mfma_i32_32x32x4i8 a[0:31], v0, a1, a[2:33] ; encoding: [0x00,0x80,0xd0,0xd3,0x00,0x03,0x0a,0x14]
+0x00,0x80,0xd0,0xd3,0x00,0x03,0x0a,0x14
+
+# GFX90A: v_mfma_i32_32x32x4i8 a[0:31], v0, a1, a[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd0,0xd3,0x00,0x03,0x0a,0xf4]
+0x00,0x93,0xd0,0xd3,0x00,0x03,0x0a,0xf4
+
+# GFX90A: v_mfma_i32_32x32x4i8 a[0:31], a0, v1, a[2:33] ; encoding: [0x00,0x80,0xd0,0xd3,0x00,0x03,0x0a,0x0c]
+0x00,0x80,0xd0,0xd3,0x00,0x03,0x0a,0x0c
+
+# GFX90A: v_mfma_i32_32x32x4i8 a[0:31], a0, v1, a[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd0,0xd3,0x00,0x03,0x0a,0xec]
+0x00,0x93,0xd0,0xd3,0x00,0x03,0x0a,0xec
+
+# GFX90A: v_mfma_i32_32x32x4i8 a[0:31], a0, a1, a[2:33] ; encoding: [0x00,0x80,0xd0,0xd3,0x00,0x03,0x0a,0x1c]
+0x00,0x80,0xd0,0xd3,0x00,0x03,0x0a,0x1c
+
+# GFX90A: v_mfma_i32_32x32x4i8 a[0:31], a0, a1, a[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd0,0xd3,0x00,0x03,0x0a,0xfc]
+0x00,0x93,0xd0,0xd3,0x00,0x03,0x0a,0xfc
+
+# GFX90A: v_mfma_i32_32x32x4i8 v[0:31], v0, v1, v[2:33] ; encoding: [0x00,0x00,0xd0,0xd3,0x00,0x03,0x0a,0x04]
+0x00,0x00,0xd0,0xd3,0x00,0x03,0x0a,0x04
+
+# GFX90A: v_mfma_i32_32x32x4i8 v[0:31], v0, v1, v[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd0,0xd3,0x00,0x03,0x0a,0xe4]
+0x00,0x13,0xd0,0xd3,0x00,0x03,0x0a,0xe4
+
+# GFX90A: v_mfma_i32_32x32x4i8 v[0:31], v0, a1, v[2:33] ; encoding: [0x00,0x00,0xd0,0xd3,0x00,0x03,0x0a,0x14]
+0x00,0x00,0xd0,0xd3,0x00,0x03,0x0a,0x14
+
+# GFX90A: v_mfma_i32_32x32x4i8 v[0:31], v0, a1, v[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd0,0xd3,0x00,0x03,0x0a,0xf4]
+0x00,0x13,0xd0,0xd3,0x00,0x03,0x0a,0xf4
+
+# GFX90A: v_mfma_i32_32x32x4i8 v[0:31], a0, v1, v[2:33] ; encoding: [0x00,0x00,0xd0,0xd3,0x00,0x03,0x0a,0x0c]
+0x00,0x00,0xd0,0xd3,0x00,0x03,0x0a,0x0c
+
+# GFX90A: v_mfma_i32_32x32x4i8 v[0:31], a0, v1, v[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd0,0xd3,0x00,0x03,0x0a,0xec]
+0x00,0x13,0xd0,0xd3,0x00,0x03,0x0a,0xec
+
+# GFX90A: v_mfma_i32_32x32x4i8 v[0:31], a0, a1, v[2:33] ; encoding: [0x00,0x00,0xd0,0xd3,0x00,0x03,0x0a,0x1c]
+0x00,0x00,0xd0,0xd3,0x00,0x03,0x0a,0x1c
+
+# GFX90A: v_mfma_i32_32x32x4i8 v[0:31], a0, a1, v[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd0,0xd3,0x00,0x03,0x0a,0xfc]
+0x00,0x13,0xd0,0xd3,0x00,0x03,0x0a,0xfc
+
+# GFX90A: v_mfma_i32_32x32x4i8 a[0:31], v0, v1, 2 ; encoding: [0x00,0x80,0xd0,0xd3,0x00,0x03,0x0a,0x02]
+0x00,0x80,0xd0,0xd3,0x00,0x03,0x0a,0x02
+
+# GFX90A: v_mfma_i32_32x32x4i8 v[0:31], v0, v1, 2 ; encoding: [0x00,0x00,0xd0,0xd3,0x00,0x03,0x0a,0x02]
+0x00,0x00,0xd0,0xd3,0x00,0x03,0x0a,0x02
+
+# GFX90A: v_mfma_i32_32x32x4i8 a[0:31], v0, v1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd0,0xd3,0x00,0x03,0x0a,0xe2]
+0x00,0x93,0xd0,0xd3,0x00,0x03,0x0a,0xe2
+
+# GFX90A: v_mfma_i32_32x32x4i8 v[0:31], v0, v1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd0,0xd3,0x00,0x03,0x0a,0xe2]
+0x00,0x13,0xd0,0xd3,0x00,0x03,0x0a,0xe2
+
+# GFX90A: v_mfma_i32_32x32x4i8 a[0:31], v0, a1, 2 ; encoding: [0x00,0x80,0xd0,0xd3,0x00,0x03,0x0a,0x12]
+0x00,0x80,0xd0,0xd3,0x00,0x03,0x0a,0x12
+
+# GFX90A: v_mfma_i32_32x32x4i8 v[0:31], v0, a1, 2 ; encoding: [0x00,0x00,0xd0,0xd3,0x00,0x03,0x0a,0x12]
+0x00,0x00,0xd0,0xd3,0x00,0x03,0x0a,0x12
+
+# GFX90A: v_mfma_i32_32x32x4i8 a[0:31], v0, a1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd0,0xd3,0x00,0x03,0x0a,0xf2]
+0x00,0x93,0xd0,0xd3,0x00,0x03,0x0a,0xf2
+
+# GFX90A: v_mfma_i32_32x32x4i8 v[0:31], v0, a1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd0,0xd3,0x00,0x03,0x0a,0xf2]
+0x00,0x13,0xd0,0xd3,0x00,0x03,0x0a,0xf2
+
+# GFX90A: v_mfma_i32_32x32x4i8 a[0:31], a0, v1, 2 ; encoding: [0x00,0x80,0xd0,0xd3,0x00,0x03,0x0a,0x0a]
+0x00,0x80,0xd0,0xd3,0x00,0x03,0x0a,0x0a
+
+# GFX90A: v_mfma_i32_32x32x4i8 v[0:31], a0, v1, 2 ; encoding: [0x00,0x00,0xd0,0xd3,0x00,0x03,0x0a,0x0a]
+0x00,0x00,0xd0,0xd3,0x00,0x03,0x0a,0x0a
+
+# GFX90A: v_mfma_i32_32x32x4i8 a[0:31], a0, v1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd0,0xd3,0x00,0x03,0x0a,0xea]
+0x00,0x93,0xd0,0xd3,0x00,0x03,0x0a,0xea
+
+# GFX90A: v_mfma_i32_32x32x4i8 v[0:31], a0, v1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd0,0xd3,0x00,0x03,0x0a,0xea]
+0x00,0x13,0xd0,0xd3,0x00,0x03,0x0a,0xea
+
+# GFX90A: v_mfma_i32_32x32x4i8 a[0:31], a0, a1, 2 ; encoding: [0x00,0x80,0xd0,0xd3,0x00,0x03,0x0a,0x1a]
+0x00,0x80,0xd0,0xd3,0x00,0x03,0x0a,0x1a
+
+# GFX90A: v_mfma_i32_32x32x4i8 v[0:31], a0, a1, 2 ; encoding: [0x00,0x00,0xd0,0xd3,0x00,0x03,0x0a,0x1a]
+0x00,0x00,0xd0,0xd3,0x00,0x03,0x0a,0x1a
+
+# GFX90A: v_mfma_i32_32x32x4i8 a[0:31], a0, a1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd0,0xd3,0x00,0x03,0x0a,0xfa]
+0x00,0x93,0xd0,0xd3,0x00,0x03,0x0a,0xfa
+
+# GFX90A: v_mfma_i32_32x32x4i8 v[0:31], a0, a1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd0,0xd3,0x00,0x03,0x0a,0xfa]
+0x00,0x13,0xd0,0xd3,0x00,0x03,0x0a,0xfa
+
+# GFX90A: v_mfma_i32_16x16x4i8 a[0:15], v0, v1, a[2:17] ; encoding: [0x00,0x80,0xd1,0xd3,0x00,0x03,0x0a,0x04]
+0x00,0x80,0xd1,0xd3,0x00,0x03,0x0a,0x04
+
+# GFX90A: v_mfma_i32_16x16x4i8 a[0:15], v0, v1, a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd1,0xd3,0x00,0x03,0x0a,0xe4]
+0x00,0x93,0xd1,0xd3,0x00,0x03,0x0a,0xe4
+
+# GFX90A: v_mfma_i32_16x16x4i8 a[0:15], v0, a1, a[2:17] ; encoding: [0x00,0x80,0xd1,0xd3,0x00,0x03,0x0a,0x14]
+0x00,0x80,0xd1,0xd3,0x00,0x03,0x0a,0x14
+
+# GFX90A: v_mfma_i32_16x16x4i8 a[0:15], v0, a1, a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd1,0xd3,0x00,0x03,0x0a,0xf4]
+0x00,0x93,0xd1,0xd3,0x00,0x03,0x0a,0xf4
+
+# GFX90A: v_mfma_i32_16x16x4i8 a[0:15], a0, v1, a[2:17] ; encoding: [0x00,0x80,0xd1,0xd3,0x00,0x03,0x0a,0x0c]
+0x00,0x80,0xd1,0xd3,0x00,0x03,0x0a,0x0c
+
+# GFX90A: v_mfma_i32_16x16x4i8 a[0:15], a0, v1, a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd1,0xd3,0x00,0x03,0x0a,0xec]
+0x00,0x93,0xd1,0xd3,0x00,0x03,0x0a,0xec
+
+# GFX90A: v_mfma_i32_16x16x4i8 a[0:15], a0, a1, a[2:17] ; encoding: [0x00,0x80,0xd1,0xd3,0x00,0x03,0x0a,0x1c]
+0x00,0x80,0xd1,0xd3,0x00,0x03,0x0a,0x1c
+
+# GFX90A: v_mfma_i32_16x16x4i8 a[0:15], a0, a1, a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd1,0xd3,0x00,0x03,0x0a,0xfc]
+0x00,0x93,0xd1,0xd3,0x00,0x03,0x0a,0xfc
+
+# GFX90A: v_mfma_i32_16x16x4i8 v[0:15], v0, v1, v[2:17] ; encoding: [0x00,0x00,0xd1,0xd3,0x00,0x03,0x0a,0x04]
+0x00,0x00,0xd1,0xd3,0x00,0x03,0x0a,0x04
+
+# GFX90A: v_mfma_i32_16x16x4i8 v[0:15], v0, v1, v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd1,0xd3,0x00,0x03,0x0a,0xe4]
+0x00,0x13,0xd1,0xd3,0x00,0x03,0x0a,0xe4
+
+# GFX90A: v_mfma_i32_16x16x4i8 v[0:15], v0, a1, v[2:17] ; encoding: [0x00,0x00,0xd1,0xd3,0x00,0x03,0x0a,0x14]
+0x00,0x00,0xd1,0xd3,0x00,0x03,0x0a,0x14
+
+# GFX90A: v_mfma_i32_16x16x4i8 v[0:15], v0, a1, v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd1,0xd3,0x00,0x03,0x0a,0xf4]
+0x00,0x13,0xd1,0xd3,0x00,0x03,0x0a,0xf4
+
+# GFX90A: v_mfma_i32_16x16x4i8 v[0:15], a0, v1, v[2:17] ; encoding: [0x00,0x00,0xd1,0xd3,0x00,0x03,0x0a,0x0c]
+0x00,0x00,0xd1,0xd3,0x00,0x03,0x0a,0x0c
+
+# GFX90A: v_mfma_i32_16x16x4i8 v[0:15], a0, v1, v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd1,0xd3,0x00,0x03,0x0a,0xec]
+0x00,0x13,0xd1,0xd3,0x00,0x03,0x0a,0xec
+
+# GFX90A: v_mfma_i32_16x16x4i8 v[0:15], a0, a1, v[2:17] ; encoding: [0x00,0x00,0xd1,0xd3,0x00,0x03,0x0a,0x1c]
+0x00,0x00,0xd1,0xd3,0x00,0x03,0x0a,0x1c
+
+# GFX90A: v_mfma_i32_16x16x4i8 v[0:15], a0, a1, v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd1,0xd3,0x00,0x03,0x0a,0xfc]
+0x00,0x13,0xd1,0xd3,0x00,0x03,0x0a,0xfc
+
+# GFX90A: v_mfma_i32_16x16x4i8 a[0:15], v0, v1, 2 ; encoding: [0x00,0x80,0xd1,0xd3,0x00,0x03,0x0a,0x02]
+0x00,0x80,0xd1,0xd3,0x00,0x03,0x0a,0x02
+
+# GFX90A: v_mfma_i32_16x16x4i8 v[0:15], v0, v1, 2 ; encoding: [0x00,0x00,0xd1,0xd3,0x00,0x03,0x0a,0x02]
+0x00,0x00,0xd1,0xd3,0x00,0x03,0x0a,0x02
+
+# GFX90A: v_mfma_i32_16x16x4i8 a[0:15], v0, v1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd1,0xd3,0x00,0x03,0x0a,0xe2]
+0x00,0x93,0xd1,0xd3,0x00,0x03,0x0a,0xe2
+
+# GFX90A: v_mfma_i32_16x16x4i8 v[0:15], v0, v1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd1,0xd3,0x00,0x03,0x0a,0xe2]
+0x00,0x13,0xd1,0xd3,0x00,0x03,0x0a,0xe2
+
+# GFX90A: v_mfma_i32_16x16x4i8 a[0:15], v0, a1, 2 ; encoding: [0x00,0x80,0xd1,0xd3,0x00,0x03,0x0a,0x12]
+0x00,0x80,0xd1,0xd3,0x00,0x03,0x0a,0x12
+
+# GFX90A: v_mfma_i32_16x16x4i8 v[0:15], v0, a1, 2 ; encoding: [0x00,0x00,0xd1,0xd3,0x00,0x03,0x0a,0x12]
+0x00,0x00,0xd1,0xd3,0x00,0x03,0x0a,0x12
+
+# GFX90A: v_mfma_i32_16x16x4i8 a[0:15], v0, a1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd1,0xd3,0x00,0x03,0x0a,0xf2]
+0x00,0x93,0xd1,0xd3,0x00,0x03,0x0a,0xf2
+
+# GFX90A: v_mfma_i32_16x16x4i8 v[0:15], v0, a1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd1,0xd3,0x00,0x03,0x0a,0xf2]
+0x00,0x13,0xd1,0xd3,0x00,0x03,0x0a,0xf2
+
+# GFX90A: v_mfma_i32_16x16x4i8 a[0:15], a0, v1, 2 ; encoding: [0x00,0x80,0xd1,0xd3,0x00,0x03,0x0a,0x0a]
+0x00,0x80,0xd1,0xd3,0x00,0x03,0x0a,0x0a
+
+# GFX90A: v_mfma_i32_16x16x4i8 v[0:15], a0, v1, 2 ; encoding: [0x00,0x00,0xd1,0xd3,0x00,0x03,0x0a,0x0a]
+0x00,0x00,0xd1,0xd3,0x00,0x03,0x0a,0x0a
+
+# GFX90A: v_mfma_i32_16x16x4i8 a[0:15], a0, v1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd1,0xd3,0x00,0x03,0x0a,0xea]
+0x00,0x93,0xd1,0xd3,0x00,0x03,0x0a,0xea
+
+# GFX90A: v_mfma_i32_16x16x4i8 v[0:15], a0, v1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd1,0xd3,0x00,0x03,0x0a,0xea]
+0x00,0x13,0xd1,0xd3,0x00,0x03,0x0a,0xea
+
+# GFX90A: v_mfma_i32_16x16x4i8 a[0:15], a0, a1, 2 ; encoding: [0x00,0x80,0xd1,0xd3,0x00,0x03,0x0a,0x1a]
+0x00,0x80,0xd1,0xd3,0x00,0x03,0x0a,0x1a
+
+# GFX90A: v_mfma_i32_16x16x4i8 v[0:15], a0, a1, 2 ; encoding: [0x00,0x00,0xd1,0xd3,0x00,0x03,0x0a,0x1a]
+0x00,0x00,0xd1,0xd3,0x00,0x03,0x0a,0x1a
+
+# GFX90A: v_mfma_i32_16x16x4i8 a[0:15], a0, a1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd1,0xd3,0x00,0x03,0x0a,0xfa]
+0x00,0x93,0xd1,0xd3,0x00,0x03,0x0a,0xfa
+
+# GFX90A: v_mfma_i32_16x16x4i8 v[0:15], a0, a1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd1,0xd3,0x00,0x03,0x0a,0xfa]
+0x00,0x13,0xd1,0xd3,0x00,0x03,0x0a,0xfa
+
+# GFX90A: v_mfma_i32_4x4x4i8 a[0:3], v0, v1, a[2:5] ; encoding: [0x00,0x80,0xd2,0xd3,0x00,0x03,0x0a,0x04]
+0x00,0x80,0xd2,0xd3,0x00,0x03,0x0a,0x04
+
+# GFX90A: v_mfma_i32_4x4x4i8 a[0:3], v0, v1, a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd2,0xd3,0x00,0x03,0x0a,0xe4]
+0x00,0x93,0xd2,0xd3,0x00,0x03,0x0a,0xe4
+
+# GFX90A: v_mfma_i32_4x4x4i8 a[0:3], v0, a1, a[2:5] ; encoding: [0x00,0x80,0xd2,0xd3,0x00,0x03,0x0a,0x14]
+0x00,0x80,0xd2,0xd3,0x00,0x03,0x0a,0x14
+
+# GFX90A: v_mfma_i32_4x4x4i8 a[0:3], v0, a1, a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd2,0xd3,0x00,0x03,0x0a,0xf4]
+0x00,0x93,0xd2,0xd3,0x00,0x03,0x0a,0xf4
+
+# GFX90A: v_mfma_i32_4x4x4i8 a[0:3], a0, v1, a[2:5] ; encoding: [0x00,0x80,0xd2,0xd3,0x00,0x03,0x0a,0x0c]
+0x00,0x80,0xd2,0xd3,0x00,0x03,0x0a,0x0c
+
+# GFX90A: v_mfma_i32_4x4x4i8 a[0:3], a0, v1, a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd2,0xd3,0x00,0x03,0x0a,0xec]
+0x00,0x93,0xd2,0xd3,0x00,0x03,0x0a,0xec
+
+# GFX90A: v_mfma_i32_4x4x4i8 a[0:3], a0, a1, a[2:5] ; encoding: [0x00,0x80,0xd2,0xd3,0x00,0x03,0x0a,0x1c]
+0x00,0x80,0xd2,0xd3,0x00,0x03,0x0a,0x1c
+
+# GFX90A: v_mfma_i32_4x4x4i8 a[0:3], a0, a1, a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd2,0xd3,0x00,0x03,0x0a,0xfc]
+0x00,0x93,0xd2,0xd3,0x00,0x03,0x0a,0xfc
+
+# GFX90A: v_mfma_i32_4x4x4i8 v[0:3], v0, v1, v[2:5] ; encoding: [0x00,0x00,0xd2,0xd3,0x00,0x03,0x0a,0x04]
+0x00,0x00,0xd2,0xd3,0x00,0x03,0x0a,0x04
+
+# GFX90A: v_mfma_i32_4x4x4i8 v[0:3], v0, v1, v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd2,0xd3,0x00,0x03,0x0a,0xe4]
+0x00,0x13,0xd2,0xd3,0x00,0x03,0x0a,0xe4
+
+# GFX90A: v_mfma_i32_4x4x4i8 v[0:3], v0, a1, v[2:5] ; encoding: [0x00,0x00,0xd2,0xd3,0x00,0x03,0x0a,0x14]
+0x00,0x00,0xd2,0xd3,0x00,0x03,0x0a,0x14
+
+# GFX90A: v_mfma_i32_4x4x4i8 v[0:3], v0, a1, v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd2,0xd3,0x00,0x03,0x0a,0xf4]
+0x00,0x13,0xd2,0xd3,0x00,0x03,0x0a,0xf4
+
+# GFX90A: v_mfma_i32_4x4x4i8 v[0:3], a0, v1, v[2:5] ; encoding: [0x00,0x00,0xd2,0xd3,0x00,0x03,0x0a,0x0c]
+0x00,0x00,0xd2,0xd3,0x00,0x03,0x0a,0x0c
+
+# GFX90A: v_mfma_i32_4x4x4i8 v[0:3], a0, v1, v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd2,0xd3,0x00,0x03,0x0a,0xec]
+0x00,0x13,0xd2,0xd3,0x00,0x03,0x0a,0xec
+
+# GFX90A: v_mfma_i32_4x4x4i8 v[0:3], a0, a1, v[2:5] ; encoding: [0x00,0x00,0xd2,0xd3,0x00,0x03,0x0a,0x1c]
+0x00,0x00,0xd2,0xd3,0x00,0x03,0x0a,0x1c
+
+# GFX90A: v_mfma_i32_4x4x4i8 v[0:3], a0, a1, v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd2,0xd3,0x00,0x03,0x0a,0xfc]
+0x00,0x13,0xd2,0xd3,0x00,0x03,0x0a,0xfc
+
+# GFX90A: v_mfma_i32_4x4x4i8 a[0:3], v0, v1, 2 ; encoding: [0x00,0x80,0xd2,0xd3,0x00,0x03,0x0a,0x02]
+0x00,0x80,0xd2,0xd3,0x00,0x03,0x0a,0x02
+
+# GFX90A: v_mfma_i32_4x4x4i8 v[0:3], v0, v1, 2 ; encoding: [0x00,0x00,0xd2,0xd3,0x00,0x03,0x0a,0x02]
+0x00,0x00,0xd2,0xd3,0x00,0x03,0x0a,0x02
+
+# GFX90A: v_mfma_i32_4x4x4i8 a[0:3], v0, v1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd2,0xd3,0x00,0x03,0x0a,0xe2]
+0x00,0x93,0xd2,0xd3,0x00,0x03,0x0a,0xe2
+
+# GFX90A: v_mfma_i32_4x4x4i8 v[0:3], v0, v1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd2,0xd3,0x00,0x03,0x0a,0xe2]
+0x00,0x13,0xd2,0xd3,0x00,0x03,0x0a,0xe2
+
+# GFX90A: v_mfma_i32_4x4x4i8 a[0:3], v0, a1, 2 ; encoding: [0x00,0x80,0xd2,0xd3,0x00,0x03,0x0a,0x12]
+0x00,0x80,0xd2,0xd3,0x00,0x03,0x0a,0x12
+
+# GFX90A: v_mfma_i32_4x4x4i8 v[0:3], v0, a1, 2 ; encoding: [0x00,0x00,0xd2,0xd3,0x00,0x03,0x0a,0x12]
+0x00,0x00,0xd2,0xd3,0x00,0x03,0x0a,0x12
+
+# GFX90A: v_mfma_i32_4x4x4i8 a[0:3], v0, a1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd2,0xd3,0x00,0x03,0x0a,0xf2]
+0x00,0x93,0xd2,0xd3,0x00,0x03,0x0a,0xf2
+
+# GFX90A: v_mfma_i32_4x4x4i8 v[0:3], v0, a1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd2,0xd3,0x00,0x03,0x0a,0xf2]
+0x00,0x13,0xd2,0xd3,0x00,0x03,0x0a,0xf2
+
+# GFX90A: v_mfma_i32_4x4x4i8 a[0:3], a0, v1, 2 ; encoding: [0x00,0x80,0xd2,0xd3,0x00,0x03,0x0a,0x0a]
+0x00,0x80,0xd2,0xd3,0x00,0x03,0x0a,0x0a
+
+# GFX90A: v_mfma_i32_4x4x4i8 v[0:3], a0, v1, 2 ; encoding: [0x00,0x00,0xd2,0xd3,0x00,0x03,0x0a,0x0a]
+0x00,0x00,0xd2,0xd3,0x00,0x03,0x0a,0x0a
+
+# GFX90A: v_mfma_i32_4x4x4i8 a[0:3], a0, v1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd2,0xd3,0x00,0x03,0x0a,0xea]
+0x00,0x93,0xd2,0xd3,0x00,0x03,0x0a,0xea
+
+# GFX90A: v_mfma_i32_4x4x4i8 v[0:3], a0, v1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd2,0xd3,0x00,0x03,0x0a,0xea]
+0x00,0x13,0xd2,0xd3,0x00,0x03,0x0a,0xea
+
+# GFX90A: v_mfma_i32_4x4x4i8 a[0:3], a0, a1, 2 ; encoding: [0x00,0x80,0xd2,0xd3,0x00,0x03,0x0a,0x1a]
+0x00,0x80,0xd2,0xd3,0x00,0x03,0x0a,0x1a
+
+# GFX90A: v_mfma_i32_4x4x4i8 v[0:3], a0, a1, 2 ; encoding: [0x00,0x00,0xd2,0xd3,0x00,0x03,0x0a,0x1a]
+0x00,0x00,0xd2,0xd3,0x00,0x03,0x0a,0x1a
+
+# GFX90A: v_mfma_i32_4x4x4i8 a[0:3], a0, a1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd2,0xd3,0x00,0x03,0x0a,0xfa]
+0x00,0x93,0xd2,0xd3,0x00,0x03,0x0a,0xfa
+
+# GFX90A: v_mfma_i32_4x4x4i8 v[0:3], a0, a1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd2,0xd3,0x00,0x03,0x0a,0xfa]
+0x00,0x13,0xd2,0xd3,0x00,0x03,0x0a,0xfa
+
+# GFX90A: v_mfma_i32_32x32x8i8 a[0:15], v0, v1, a[2:17] ; encoding: [0x00,0x80,0xd4,0xd3,0x00,0x03,0x0a,0x04]
+0x00,0x80,0xd4,0xd3,0x00,0x03,0x0a,0x04
+
+# GFX90A: v_mfma_i32_32x32x8i8 a[0:15], v0, v1, a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd4,0xd3,0x00,0x03,0x0a,0xe4]
+0x00,0x93,0xd4,0xd3,0x00,0x03,0x0a,0xe4
+
+# GFX90A: v_mfma_i32_32x32x8i8 a[0:15], v0, a1, a[2:17] ; encoding: [0x00,0x80,0xd4,0xd3,0x00,0x03,0x0a,0x14]
+0x00,0x80,0xd4,0xd3,0x00,0x03,0x0a,0x14
+
+# GFX90A: v_mfma_i32_32x32x8i8 a[0:15], v0, a1, a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd4,0xd3,0x00,0x03,0x0a,0xf4]
+0x00,0x93,0xd4,0xd3,0x00,0x03,0x0a,0xf4
+
+# GFX90A: v_mfma_i32_32x32x8i8 a[0:15], a0, v1, a[2:17] ; encoding: [0x00,0x80,0xd4,0xd3,0x00,0x03,0x0a,0x0c]
+0x00,0x80,0xd4,0xd3,0x00,0x03,0x0a,0x0c
+
+# GFX90A: v_mfma_i32_32x32x8i8 a[0:15], a0, v1, a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd4,0xd3,0x00,0x03,0x0a,0xec]
+0x00,0x93,0xd4,0xd3,0x00,0x03,0x0a,0xec
+
+# GFX90A: v_mfma_i32_32x32x8i8 a[0:15], a0, a1, a[2:17] ; encoding: [0x00,0x80,0xd4,0xd3,0x00,0x03,0x0a,0x1c]
+0x00,0x80,0xd4,0xd3,0x00,0x03,0x0a,0x1c
+
+# GFX90A: v_mfma_i32_32x32x8i8 a[0:15], a0, a1, a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd4,0xd3,0x00,0x03,0x0a,0xfc]
+0x00,0x93,0xd4,0xd3,0x00,0x03,0x0a,0xfc
+
+# GFX90A: v_mfma_i32_32x32x8i8 v[0:15], v0, v1, v[2:17] ; encoding: [0x00,0x00,0xd4,0xd3,0x00,0x03,0x0a,0x04]
+0x00,0x00,0xd4,0xd3,0x00,0x03,0x0a,0x04
+
+# GFX90A: v_mfma_i32_32x32x8i8 v[0:15], v0, v1, v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd4,0xd3,0x00,0x03,0x0a,0xe4]
+0x00,0x13,0xd4,0xd3,0x00,0x03,0x0a,0xe4
+
+# GFX90A: v_mfma_i32_32x32x8i8 v[0:15], v0, a1, v[2:17] ; encoding: [0x00,0x00,0xd4,0xd3,0x00,0x03,0x0a,0x14]
+0x00,0x00,0xd4,0xd3,0x00,0x03,0x0a,0x14
+
+# GFX90A: v_mfma_i32_32x32x8i8 v[0:15], v0, a1, v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd4,0xd3,0x00,0x03,0x0a,0xf4]
+0x00,0x13,0xd4,0xd3,0x00,0x03,0x0a,0xf4
+
+# GFX90A: v_mfma_i32_32x32x8i8 v[0:15], a0, v1, v[2:17] ; encoding: [0x00,0x00,0xd4,0xd3,0x00,0x03,0x0a,0x0c]
+0x00,0x00,0xd4,0xd3,0x00,0x03,0x0a,0x0c
+
+# GFX90A: v_mfma_i32_32x32x8i8 v[0:15], a0, v1, v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd4,0xd3,0x00,0x03,0x0a,0xec]
+0x00,0x13,0xd4,0xd3,0x00,0x03,0x0a,0xec
+
+# GFX90A: v_mfma_i32_32x32x8i8 v[0:15], a0, a1, v[2:17] ; encoding: [0x00,0x00,0xd4,0xd3,0x00,0x03,0x0a,0x1c]
+0x00,0x00,0xd4,0xd3,0x00,0x03,0x0a,0x1c
+
+# GFX90A: v_mfma_i32_32x32x8i8 v[0:15], a0, a1, v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd4,0xd3,0x00,0x03,0x0a,0xfc]
+0x00,0x13,0xd4,0xd3,0x00,0x03,0x0a,0xfc
+
+# GFX90A: v_mfma_i32_32x32x8i8 a[0:15], v0, v1, 2 ; encoding: [0x00,0x80,0xd4,0xd3,0x00,0x03,0x0a,0x02]
+0x00,0x80,0xd4,0xd3,0x00,0x03,0x0a,0x02
+
+# GFX90A: v_mfma_i32_32x32x8i8 v[0:15], v0, v1, 2 ; encoding: [0x00,0x00,0xd4,0xd3,0x00,0x03,0x0a,0x02]
+0x00,0x00,0xd4,0xd3,0x00,0x03,0x0a,0x02
+
+# GFX90A: v_mfma_i32_32x32x8i8 a[0:15], v0, v1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd4,0xd3,0x00,0x03,0x0a,0xe2]
+0x00,0x93,0xd4,0xd3,0x00,0x03,0x0a,0xe2
+
+# GFX90A: v_mfma_i32_32x32x8i8 v[0:15], v0, v1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd4,0xd3,0x00,0x03,0x0a,0xe2]
+0x00,0x13,0xd4,0xd3,0x00,0x03,0x0a,0xe2
+
+# GFX90A: v_mfma_i32_32x32x8i8 a[0:15], v0, a1, 2 ; encoding: [0x00,0x80,0xd4,0xd3,0x00,0x03,0x0a,0x12]
+0x00,0x80,0xd4,0xd3,0x00,0x03,0x0a,0x12
+
+# GFX90A: v_mfma_i32_32x32x8i8 v[0:15], v0, a1, 2 ; encoding: [0x00,0x00,0xd4,0xd3,0x00,0x03,0x0a,0x12]
+0x00,0x00,0xd4,0xd3,0x00,0x03,0x0a,0x12
+
+# GFX90A: v_mfma_i32_32x32x8i8 a[0:15], v0, a1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd4,0xd3,0x00,0x03,0x0a,0xf2]
+0x00,0x93,0xd4,0xd3,0x00,0x03,0x0a,0xf2
+
+# GFX90A: v_mfma_i32_32x32x8i8 v[0:15], v0, a1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd4,0xd3,0x00,0x03,0x0a,0xf2]
+0x00,0x13,0xd4,0xd3,0x00,0x03,0x0a,0xf2
+
+# GFX90A: v_mfma_i32_32x32x8i8 a[0:15], a0, v1, 2 ; encoding: [0x00,0x80,0xd4,0xd3,0x00,0x03,0x0a,0x0a]
+0x00,0x80,0xd4,0xd3,0x00,0x03,0x0a,0x0a
+
+# GFX90A: v_mfma_i32_32x32x8i8 v[0:15], a0, v1, 2 ; encoding: [0x00,0x00,0xd4,0xd3,0x00,0x03,0x0a,0x0a]
+0x00,0x00,0xd4,0xd3,0x00,0x03,0x0a,0x0a
+
+# GFX90A: v_mfma_i32_32x32x8i8 a[0:15], a0, v1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd4,0xd3,0x00,0x03,0x0a,0xea]
+0x00,0x93,0xd4,0xd3,0x00,0x03,0x0a,0xea
+
+# GFX90A: v_mfma_i32_32x32x8i8 v[0:15], a0, v1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd4,0xd3,0x00,0x03,0x0a,0xea]
+0x00,0x13,0xd4,0xd3,0x00,0x03,0x0a,0xea
+
+# GFX90A: v_mfma_i32_32x32x8i8 a[0:15], a0, a1, 2 ; encoding: [0x00,0x80,0xd4,0xd3,0x00,0x03,0x0a,0x1a]
+0x00,0x80,0xd4,0xd3,0x00,0x03,0x0a,0x1a
+
+# GFX90A: v_mfma_i32_32x32x8i8 v[0:15], a0, a1, 2 ; encoding: [0x00,0x00,0xd4,0xd3,0x00,0x03,0x0a,0x1a]
+0x00,0x00,0xd4,0xd3,0x00,0x03,0x0a,0x1a
+
+# GFX90A: v_mfma_i32_32x32x8i8 a[0:15], a0, a1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd4,0xd3,0x00,0x03,0x0a,0xfa]
+0x00,0x93,0xd4,0xd3,0x00,0x03,0x0a,0xfa
+
+# GFX90A: v_mfma_i32_32x32x8i8 v[0:15], a0, a1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd4,0xd3,0x00,0x03,0x0a,0xfa]
+0x00,0x13,0xd4,0xd3,0x00,0x03,0x0a,0xfa
+
+# GFX90A: v_mfma_i32_16x16x16i8 a[0:3], v0, v1, a[2:5] ; encoding: [0x00,0x80,0xd5,0xd3,0x00,0x03,0x0a,0x04]
+0x00,0x80,0xd5,0xd3,0x00,0x03,0x0a,0x04
+
+# GFX90A: v_mfma_i32_16x16x16i8 a[0:3], v0, v1, a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd5,0xd3,0x00,0x03,0x0a,0xe4]
+0x00,0x93,0xd5,0xd3,0x00,0x03,0x0a,0xe4
+
+# GFX90A: v_mfma_i32_16x16x16i8 a[0:3], v0, a1, a[2:5] ; encoding: [0x00,0x80,0xd5,0xd3,0x00,0x03,0x0a,0x14]
+0x00,0x80,0xd5,0xd3,0x00,0x03,0x0a,0x14
+
+# GFX90A: v_mfma_i32_16x16x16i8 a[0:3], v0, a1, a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd5,0xd3,0x00,0x03,0x0a,0xf4]
+0x00,0x93,0xd5,0xd3,0x00,0x03,0x0a,0xf4
+
+# GFX90A: v_mfma_i32_16x16x16i8 a[0:3], a0, v1, a[2:5] ; encoding: [0x00,0x80,0xd5,0xd3,0x00,0x03,0x0a,0x0c]
+0x00,0x80,0xd5,0xd3,0x00,0x03,0x0a,0x0c
+
+# GFX90A: v_mfma_i32_16x16x16i8 a[0:3], a0, v1, a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd5,0xd3,0x00,0x03,0x0a,0xec]
+0x00,0x93,0xd5,0xd3,0x00,0x03,0x0a,0xec
+
+# GFX90A: v_mfma_i32_16x16x16i8 a[0:3], a0, a1, a[2:5] ; encoding: [0x00,0x80,0xd5,0xd3,0x00,0x03,0x0a,0x1c]
+0x00,0x80,0xd5,0xd3,0x00,0x03,0x0a,0x1c
+
+# GFX90A: v_mfma_i32_16x16x16i8 a[0:3], a0, a1, a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd5,0xd3,0x00,0x03,0x0a,0xfc]
+0x00,0x93,0xd5,0xd3,0x00,0x03,0x0a,0xfc
+
+# GFX90A: v_mfma_i32_16x16x16i8 v[0:3], v0, v1, v[2:5] ; encoding: [0x00,0x00,0xd5,0xd3,0x00,0x03,0x0a,0x04]
+0x00,0x00,0xd5,0xd3,0x00,0x03,0x0a,0x04
+
+# GFX90A: v_mfma_i32_16x16x16i8 v[0:3], v0, v1, v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd5,0xd3,0x00,0x03,0x0a,0xe4]
+0x00,0x13,0xd5,0xd3,0x00,0x03,0x0a,0xe4
+
+# GFX90A: v_mfma_i32_16x16x16i8 v[0:3], v0, a1, v[2:5] ; encoding: [0x00,0x00,0xd5,0xd3,0x00,0x03,0x0a,0x14]
+0x00,0x00,0xd5,0xd3,0x00,0x03,0x0a,0x14
+
+# GFX90A: v_mfma_i32_16x16x16i8 v[0:3], v0, a1, v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd5,0xd3,0x00,0x03,0x0a,0xf4]
+0x00,0x13,0xd5,0xd3,0x00,0x03,0x0a,0xf4
+
+# GFX90A: v_mfma_i32_16x16x16i8 v[0:3], a0, v1, v[2:5] ; encoding: [0x00,0x00,0xd5,0xd3,0x00,0x03,0x0a,0x0c]
+0x00,0x00,0xd5,0xd3,0x00,0x03,0x0a,0x0c
+
+# GFX90A: v_mfma_i32_16x16x16i8 v[0:3], a0, v1, v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd5,0xd3,0x00,0x03,0x0a,0xec]
+0x00,0x13,0xd5,0xd3,0x00,0x03,0x0a,0xec
+
+# GFX90A: v_mfma_i32_16x16x16i8 v[0:3], a0, a1, v[2:5] ; encoding: [0x00,0x00,0xd5,0xd3,0x00,0x03,0x0a,0x1c]
+0x00,0x00,0xd5,0xd3,0x00,0x03,0x0a,0x1c
+
+# GFX90A: v_mfma_i32_16x16x16i8 v[0:3], a0, a1, v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd5,0xd3,0x00,0x03,0x0a,0xfc]
+0x00,0x13,0xd5,0xd3,0x00,0x03,0x0a,0xfc
+
+# GFX90A: v_mfma_i32_16x16x16i8 a[0:3], v0, v1, 2 ; encoding: [0x00,0x80,0xd5,0xd3,0x00,0x03,0x0a,0x02]
+0x00,0x80,0xd5,0xd3,0x00,0x03,0x0a,0x02
+
+# GFX90A: v_mfma_i32_16x16x16i8 v[0:3], v0, v1, 2 ; encoding: [0x00,0x00,0xd5,0xd3,0x00,0x03,0x0a,0x02]
+0x00,0x00,0xd5,0xd3,0x00,0x03,0x0a,0x02
+
+# GFX90A: v_mfma_i32_16x16x16i8 a[0:3], v0, v1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd5,0xd3,0x00,0x03,0x0a,0xe2]
+0x00,0x93,0xd5,0xd3,0x00,0x03,0x0a,0xe2
+
+# GFX90A: v_mfma_i32_16x16x16i8 v[0:3], v0, v1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd5,0xd3,0x00,0x03,0x0a,0xe2]
+0x00,0x13,0xd5,0xd3,0x00,0x03,0x0a,0xe2
+
+# GFX90A: v_mfma_i32_16x16x16i8 a[0:3], v0, a1, 2 ; encoding: [0x00,0x80,0xd5,0xd3,0x00,0x03,0x0a,0x12]
+0x00,0x80,0xd5,0xd3,0x00,0x03,0x0a,0x12
+
+# GFX90A: v_mfma_i32_16x16x16i8 v[0:3], v0, a1, 2 ; encoding: [0x00,0x00,0xd5,0xd3,0x00,0x03,0x0a,0x12]
+0x00,0x00,0xd5,0xd3,0x00,0x03,0x0a,0x12
+
+# GFX90A: v_mfma_i32_16x16x16i8 a[0:3], v0, a1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd5,0xd3,0x00,0x03,0x0a,0xf2]
+0x00,0x93,0xd5,0xd3,0x00,0x03,0x0a,0xf2
+
+# GFX90A: v_mfma_i32_16x16x16i8 v[0:3], v0, a1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd5,0xd3,0x00,0x03,0x0a,0xf2]
+0x00,0x13,0xd5,0xd3,0x00,0x03,0x0a,0xf2
+
+# GFX90A: v_mfma_i32_16x16x16i8 a[0:3], a0, v1, 2 ; encoding: [0x00,0x80,0xd5,0xd3,0x00,0x03,0x0a,0x0a]
+0x00,0x80,0xd5,0xd3,0x00,0x03,0x0a,0x0a
+
+# GFX90A: v_mfma_i32_16x16x16i8 v[0:3], a0, v1, 2 ; encoding: [0x00,0x00,0xd5,0xd3,0x00,0x03,0x0a,0x0a]
+0x00,0x00,0xd5,0xd3,0x00,0x03,0x0a,0x0a
+
+# GFX90A: v_mfma_i32_16x16x16i8 a[0:3], a0, v1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd5,0xd3,0x00,0x03,0x0a,0xea]
+0x00,0x93,0xd5,0xd3,0x00,0x03,0x0a,0xea
+
+# GFX90A: v_mfma_i32_16x16x16i8 v[0:3], a0, v1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd5,0xd3,0x00,0x03,0x0a,0xea]
+0x00,0x13,0xd5,0xd3,0x00,0x03,0x0a,0xea
+
+# GFX90A: v_mfma_i32_16x16x16i8 a[0:3], a0, a1, 2 ; encoding: [0x00,0x80,0xd5,0xd3,0x00,0x03,0x0a,0x1a]
+0x00,0x80,0xd5,0xd3,0x00,0x03,0x0a,0x1a
+
+# GFX90A: v_mfma_i32_16x16x16i8 v[0:3], a0, a1, 2 ; encoding: [0x00,0x00,0xd5,0xd3,0x00,0x03,0x0a,0x1a]
+0x00,0x00,0xd5,0xd3,0x00,0x03,0x0a,0x1a
+
+# GFX90A: v_mfma_i32_16x16x16i8 a[0:3], a0, a1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd5,0xd3,0x00,0x03,0x0a,0xfa]
+0x00,0x93,0xd5,0xd3,0x00,0x03,0x0a,0xfa
+
+# GFX90A: v_mfma_i32_16x16x16i8 v[0:3], a0, a1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd5,0xd3,0x00,0x03,0x0a,0xfa]
+0x00,0x13,0xd5,0xd3,0x00,0x03,0x0a,0xfa
+
+# GFX90A: v_mfma_f32_32x32x2bf16 a[0:31], v0, v1, a[2:33] ; encoding: [0x00,0x80,0xe8,0xd3,0x00,0x03,0x0a,0x04]
+0x00,0x80,0xe8,0xd3,0x00,0x03,0x0a,0x04
+
+# GFX90A: v_mfma_f32_32x32x2bf16 a[0:31], v0, v1, a[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe8,0xd3,0x00,0x03,0x0a,0xe4]
+0x00,0x93,0xe8,0xd3,0x00,0x03,0x0a,0xe4
+
+# GFX90A: v_mfma_f32_32x32x2bf16 a[0:31], v0, a1, a[2:33] ; encoding: [0x00,0x80,0xe8,0xd3,0x00,0x03,0x0a,0x14]
+0x00,0x80,0xe8,0xd3,0x00,0x03,0x0a,0x14
+
+# GFX90A: v_mfma_f32_32x32x2bf16 a[0:31], v0, a1, a[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe8,0xd3,0x00,0x03,0x0a,0xf4]
+0x00,0x93,0xe8,0xd3,0x00,0x03,0x0a,0xf4
+
+# GFX90A: v_mfma_f32_32x32x2bf16 a[0:31], a0, v1, a[2:33] ; encoding: [0x00,0x80,0xe8,0xd3,0x00,0x03,0x0a,0x0c]
+0x00,0x80,0xe8,0xd3,0x00,0x03,0x0a,0x0c
+
+# GFX90A: v_mfma_f32_32x32x2bf16 a[0:31], a0, v1, a[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe8,0xd3,0x00,0x03,0x0a,0xec]
+0x00,0x93,0xe8,0xd3,0x00,0x03,0x0a,0xec
+
+# GFX90A: v_mfma_f32_32x32x2bf16 a[0:31], a0, a1, a[2:33] ; encoding: [0x00,0x80,0xe8,0xd3,0x00,0x03,0x0a,0x1c]
+0x00,0x80,0xe8,0xd3,0x00,0x03,0x0a,0x1c
+
+# GFX90A: v_mfma_f32_32x32x2bf16 a[0:31], a0, a1, a[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe8,0xd3,0x00,0x03,0x0a,0xfc]
+0x00,0x93,0xe8,0xd3,0x00,0x03,0x0a,0xfc
+
+# GFX90A: v_mfma_f32_32x32x2bf16 v[0:31], v0, v1, v[2:33] ; encoding: [0x00,0x00,0xe8,0xd3,0x00,0x03,0x0a,0x04]
+0x00,0x00,0xe8,0xd3,0x00,0x03,0x0a,0x04
+
+# GFX90A: v_mfma_f32_32x32x2bf16 v[0:31], v0, v1, v[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe8,0xd3,0x00,0x03,0x0a,0xe4]
+0x00,0x13,0xe8,0xd3,0x00,0x03,0x0a,0xe4
+
+# GFX90A: v_mfma_f32_32x32x2bf16 v[0:31], v0, a1, v[2:33] ; encoding: [0x00,0x00,0xe8,0xd3,0x00,0x03,0x0a,0x14]
+0x00,0x00,0xe8,0xd3,0x00,0x03,0x0a,0x14
+
+# GFX90A: v_mfma_f32_32x32x2bf16 v[0:31], v0, a1, v[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe8,0xd3,0x00,0x03,0x0a,0xf4]
+0x00,0x13,0xe8,0xd3,0x00,0x03,0x0a,0xf4
+
+# GFX90A: v_mfma_f32_32x32x2bf16 v[0:31], a0, v1, v[2:33] ; encoding: [0x00,0x00,0xe8,0xd3,0x00,0x03,0x0a,0x0c]
+0x00,0x00,0xe8,0xd3,0x00,0x03,0x0a,0x0c
+
+# GFX90A: v_mfma_f32_32x32x2bf16 v[0:31], a0, v1, v[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe8,0xd3,0x00,0x03,0x0a,0xec]
+0x00,0x13,0xe8,0xd3,0x00,0x03,0x0a,0xec
+
+# GFX90A: v_mfma_f32_32x32x2bf16 v[0:31], a0, a1, v[2:33] ; encoding: [0x00,0x00,0xe8,0xd3,0x00,0x03,0x0a,0x1c]
+0x00,0x00,0xe8,0xd3,0x00,0x03,0x0a,0x1c
+
+# GFX90A: v_mfma_f32_32x32x2bf16 v[0:31], a0, a1, v[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe8,0xd3,0x00,0x03,0x0a,0xfc]
+0x00,0x13,0xe8,0xd3,0x00,0x03,0x0a,0xfc
+
+# GFX90A: v_mfma_f32_32x32x2bf16 a[0:31], v0, v1, -2.0 ; encoding: [0x00,0x80,0xe8,0xd3,0x00,0x03,0xd6,0x03]
+0x00,0x80,0xe8,0xd3,0x00,0x03,0xd6,0x03
+
+# GFX90A: v_mfma_f32_32x32x2bf16 v[0:31], v0, v1, -2.0 ; encoding: [0x00,0x00,0xe8,0xd3,0x00,0x03,0xd6,0x03]
+0x00,0x00,0xe8,0xd3,0x00,0x03,0xd6,0x03
+
+# GFX90A: v_mfma_f32_32x32x2bf16 a[0:31], v0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe8,0xd3,0x00,0x03,0xd6,0xe3]
+0x00,0x93,0xe8,0xd3,0x00,0x03,0xd6,0xe3
+
+# GFX90A: v_mfma_f32_32x32x2bf16 v[0:31], v0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe8,0xd3,0x00,0x03,0xd6,0xe3]
+0x00,0x13,0xe8,0xd3,0x00,0x03,0xd6,0xe3
+
+# GFX90A: v_mfma_f32_32x32x2bf16 a[0:31], v0, a1, -2.0 ; encoding: [0x00,0x80,0xe8,0xd3,0x00,0x03,0xd6,0x13]
+0x00,0x80,0xe8,0xd3,0x00,0x03,0xd6,0x13
+
+# GFX90A: v_mfma_f32_32x32x2bf16 v[0:31], v0, a1, -2.0 ; encoding: [0x00,0x00,0xe8,0xd3,0x00,0x03,0xd6,0x13]
+0x00,0x00,0xe8,0xd3,0x00,0x03,0xd6,0x13
+
+# GFX90A: v_mfma_f32_32x32x2bf16 a[0:31], v0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe8,0xd3,0x00,0x03,0xd6,0xf3]
+0x00,0x93,0xe8,0xd3,0x00,0x03,0xd6,0xf3
+
+# GFX90A: v_mfma_f32_32x32x2bf16 v[0:31], v0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe8,0xd3,0x00,0x03,0xd6,0xf3]
+0x00,0x13,0xe8,0xd3,0x00,0x03,0xd6,0xf3
+
+# GFX90A: v_mfma_f32_32x32x2bf16 a[0:31], a0, v1, -2.0 ; encoding: [0x00,0x80,0xe8,0xd3,0x00,0x03,0xd6,0x0b]
+0x00,0x80,0xe8,0xd3,0x00,0x03,0xd6,0x0b
+
+# GFX90A: v_mfma_f32_32x32x2bf16 v[0:31], a0, v1, -2.0 ; encoding: [0x00,0x00,0xe8,0xd3,0x00,0x03,0xd6,0x0b]
+0x00,0x00,0xe8,0xd3,0x00,0x03,0xd6,0x0b
+
+# GFX90A: v_mfma_f32_32x32x2bf16 a[0:31], a0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe8,0xd3,0x00,0x03,0xd6,0xeb]
+0x00,0x93,0xe8,0xd3,0x00,0x03,0xd6,0xeb
+
+# GFX90A: v_mfma_f32_32x32x2bf16 v[0:31], a0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe8,0xd3,0x00,0x03,0xd6,0xeb]
+0x00,0x13,0xe8,0xd3,0x00,0x03,0xd6,0xeb
+
+# GFX90A: v_mfma_f32_32x32x2bf16 a[0:31], a0, a1, -2.0 ; encoding: [0x00,0x80,0xe8,0xd3,0x00,0x03,0xd6,0x1b]
+0x00,0x80,0xe8,0xd3,0x00,0x03,0xd6,0x1b
+
+# GFX90A: v_mfma_f32_32x32x2bf16 v[0:31], a0, a1, -2.0 ; encoding: [0x00,0x00,0xe8,0xd3,0x00,0x03,0xd6,0x1b]
+0x00,0x00,0xe8,0xd3,0x00,0x03,0xd6,0x1b
+
+# GFX90A: v_mfma_f32_32x32x2bf16 a[0:31], a0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe8,0xd3,0x00,0x03,0xd6,0xfb]
+0x00,0x93,0xe8,0xd3,0x00,0x03,0xd6,0xfb
+
+# GFX90A: v_mfma_f32_32x32x2bf16 v[0:31], a0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe8,0xd3,0x00,0x03,0xd6,0xfb]
+0x00,0x13,0xe8,0xd3,0x00,0x03,0xd6,0xfb
+
+# GFX90A: v_mfma_f32_16x16x2bf16 a[0:15], v0, v1, a[2:17] ; encoding: [0x00,0x80,0xe9,0xd3,0x00,0x03,0x0a,0x04]
+0x00,0x80,0xe9,0xd3,0x00,0x03,0x0a,0x04
+
+# GFX90A: v_mfma_f32_16x16x2bf16 a[0:15], v0, v1, a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe9,0xd3,0x00,0x03,0x0a,0xe4]
+0x00,0x93,0xe9,0xd3,0x00,0x03,0x0a,0xe4
+
+# GFX90A: v_mfma_f32_16x16x2bf16 a[0:15], v0, a1, a[2:17] ; encoding: [0x00,0x80,0xe9,0xd3,0x00,0x03,0x0a,0x14]
+0x00,0x80,0xe9,0xd3,0x00,0x03,0x0a,0x14
+
+# GFX90A: v_mfma_f32_16x16x2bf16 a[0:15], v0, a1, a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe9,0xd3,0x00,0x03,0x0a,0xf4]
+0x00,0x93,0xe9,0xd3,0x00,0x03,0x0a,0xf4
+
+# GFX90A: v_mfma_f32_16x16x2bf16 a[0:15], a0, v1, a[2:17] ; encoding: [0x00,0x80,0xe9,0xd3,0x00,0x03,0x0a,0x0c]
+0x00,0x80,0xe9,0xd3,0x00,0x03,0x0a,0x0c
+
+# GFX90A: v_mfma_f32_16x16x2bf16 a[0:15], a0, v1, a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe9,0xd3,0x00,0x03,0x0a,0xec]
+0x00,0x93,0xe9,0xd3,0x00,0x03,0x0a,0xec
+
+# GFX90A: v_mfma_f32_16x16x2bf16 a[0:15], a0, a1, a[2:17] ; encoding: [0x00,0x80,0xe9,0xd3,0x00,0x03,0x0a,0x1c]
+0x00,0x80,0xe9,0xd3,0x00,0x03,0x0a,0x1c
+
+# GFX90A: v_mfma_f32_16x16x2bf16 a[0:15], a0, a1, a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe9,0xd3,0x00,0x03,0x0a,0xfc]
+0x00,0x93,0xe9,0xd3,0x00,0x03,0x0a,0xfc
+
+# GFX90A: v_mfma_f32_16x16x2bf16 v[0:15], v0, v1, v[2:17] ; encoding: [0x00,0x00,0xe9,0xd3,0x00,0x03,0x0a,0x04]
+0x00,0x00,0xe9,0xd3,0x00,0x03,0x0a,0x04
+
+# GFX90A: v_mfma_f32_16x16x2bf16 v[0:15], v0, v1, v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe9,0xd3,0x00,0x03,0x0a,0xe4]
+0x00,0x13,0xe9,0xd3,0x00,0x03,0x0a,0xe4
+
+# GFX90A: v_mfma_f32_16x16x2bf16 v[0:15], v0, a1, v[2:17] ; encoding: [0x00,0x00,0xe9,0xd3,0x00,0x03,0x0a,0x14]
+0x00,0x00,0xe9,0xd3,0x00,0x03,0x0a,0x14
+
+# GFX90A: v_mfma_f32_16x16x2bf16 v[0:15], v0, a1, v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe9,0xd3,0x00,0x03,0x0a,0xf4]
+0x00,0x13,0xe9,0xd3,0x00,0x03,0x0a,0xf4
+
+# GFX90A: v_mfma_f32_16x16x2bf16 v[0:15], a0, v1, v[2:17] ; encoding: [0x00,0x00,0xe9,0xd3,0x00,0x03,0x0a,0x0c]
+0x00,0x00,0xe9,0xd3,0x00,0x03,0x0a,0x0c
+
+# GFX90A: v_mfma_f32_16x16x2bf16 v[0:15], a0, v1, v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe9,0xd3,0x00,0x03,0x0a,0xec]
+0x00,0x13,0xe9,0xd3,0x00,0x03,0x0a,0xec
+
+# GFX90A: v_mfma_f32_16x16x2bf16 v[0:15], a0, a1, v[2:17] ; encoding: [0x00,0x00,0xe9,0xd3,0x00,0x03,0x0a,0x1c]
+0x00,0x00,0xe9,0xd3,0x00,0x03,0x0a,0x1c
+
+# GFX90A: v_mfma_f32_16x16x2bf16 v[0:15], a0, a1, v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe9,0xd3,0x00,0x03,0x0a,0xfc]
+0x00,0x13,0xe9,0xd3,0x00,0x03,0x0a,0xfc
+
+# GFX90A: v_mfma_f32_16x16x2bf16 a[0:15], v0, v1, -2.0 ; encoding: [0x00,0x80,0xe9,0xd3,0x00,0x03,0xd6,0x03]
+0x00,0x80,0xe9,0xd3,0x00,0x03,0xd6,0x03
+
+# GFX90A: v_mfma_f32_16x16x2bf16 v[0:15], v0, v1, -2.0 ; encoding: [0x00,0x00,0xe9,0xd3,0x00,0x03,0xd6,0x03]
+0x00,0x00,0xe9,0xd3,0x00,0x03,0xd6,0x03
+
+# GFX90A: v_mfma_f32_16x16x2bf16 a[0:15], v0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe9,0xd3,0x00,0x03,0xd6,0xe3]
+0x00,0x93,0xe9,0xd3,0x00,0x03,0xd6,0xe3
+
+# GFX90A: v_mfma_f32_16x16x2bf16 v[0:15], v0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe9,0xd3,0x00,0x03,0xd6,0xe3]
+0x00,0x13,0xe9,0xd3,0x00,0x03,0xd6,0xe3
+
+# GFX90A: v_mfma_f32_16x16x2bf16 a[0:15], v0, a1, -2.0 ; encoding: [0x00,0x80,0xe9,0xd3,0x00,0x03,0xd6,0x13]
+0x00,0x80,0xe9,0xd3,0x00,0x03,0xd6,0x13
+
+# GFX90A: v_mfma_f32_16x16x2bf16 v[0:15], v0, a1, -2.0 ; encoding: [0x00,0x00,0xe9,0xd3,0x00,0x03,0xd6,0x13]
+0x00,0x00,0xe9,0xd3,0x00,0x03,0xd6,0x13
+
+# GFX90A: v_mfma_f32_16x16x2bf16 a[0:15], v0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe9,0xd3,0x00,0x03,0xd6,0xf3]
+0x00,0x93,0xe9,0xd3,0x00,0x03,0xd6,0xf3
+
+# GFX90A: v_mfma_f32_16x16x2bf16 v[0:15], v0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe9,0xd3,0x00,0x03,0xd6,0xf3]
+0x00,0x13,0xe9,0xd3,0x00,0x03,0xd6,0xf3
+
+# GFX90A: v_mfma_f32_16x16x2bf16 a[0:15], a0, v1, -2.0 ; encoding: [0x00,0x80,0xe9,0xd3,0x00,0x03,0xd6,0x0b]
+0x00,0x80,0xe9,0xd3,0x00,0x03,0xd6,0x0b
+
+# GFX90A: v_mfma_f32_16x16x2bf16 v[0:15], a0, v1, -2.0 ; encoding: [0x00,0x00,0xe9,0xd3,0x00,0x03,0xd6,0x0b]
+0x00,0x00,0xe9,0xd3,0x00,0x03,0xd6,0x0b
+
+# GFX90A: v_mfma_f32_16x16x2bf16 a[0:15], a0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe9,0xd3,0x00,0x03,0xd6,0xeb]
+0x00,0x93,0xe9,0xd3,0x00,0x03,0xd6,0xeb
+
+# GFX90A: v_mfma_f32_16x16x2bf16 v[0:15], a0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe9,0xd3,0x00,0x03,0xd6,0xeb]
+0x00,0x13,0xe9,0xd3,0x00,0x03,0xd6,0xeb
+
+# GFX90A: v_mfma_f32_16x16x2bf16 a[0:15], a0, a1, -2.0 ; encoding: [0x00,0x80,0xe9,0xd3,0x00,0x03,0xd6,0x1b]
+0x00,0x80,0xe9,0xd3,0x00,0x03,0xd6,0x1b
+
+# GFX90A: v_mfma_f32_16x16x2bf16 v[0:15], a0, a1, -2.0 ; encoding: [0x00,0x00,0xe9,0xd3,0x00,0x03,0xd6,0x1b]
+0x00,0x00,0xe9,0xd3,0x00,0x03,0xd6,0x1b
+
+# GFX90A: v_mfma_f32_16x16x2bf16 a[0:15], a0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe9,0xd3,0x00,0x03,0xd6,0xfb]
+0x00,0x93,0xe9,0xd3,0x00,0x03,0xd6,0xfb
+
+# GFX90A: v_mfma_f32_16x16x2bf16 v[0:15], a0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe9,0xd3,0x00,0x03,0xd6,0xfb]
+0x00,0x13,0xe9,0xd3,0x00,0x03,0xd6,0xfb
+
+# GFX90A: v_mfma_f32_4x4x2bf16 a[0:3], v0, v1, a[2:5] ; encoding: [0x00,0x80,0xeb,0xd3,0x00,0x03,0x0a,0x04]
+0x00,0x80,0xeb,0xd3,0x00,0x03,0x0a,0x04
+
+# GFX90A: v_mfma_f32_4x4x2bf16 a[0:3], v0, v1, a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xeb,0xd3,0x00,0x03,0x0a,0xe4]
+0x00,0x93,0xeb,0xd3,0x00,0x03,0x0a,0xe4
+
+# GFX90A: v_mfma_f32_4x4x2bf16 a[0:3], v0, a1, a[2:5] ; encoding: [0x00,0x80,0xeb,0xd3,0x00,0x03,0x0a,0x14]
+0x00,0x80,0xeb,0xd3,0x00,0x03,0x0a,0x14
+
+# GFX90A: v_mfma_f32_4x4x2bf16 a[0:3], v0, a1, a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xeb,0xd3,0x00,0x03,0x0a,0xf4]
+0x00,0x93,0xeb,0xd3,0x00,0x03,0x0a,0xf4
+
+# GFX90A: v_mfma_f32_4x4x2bf16 a[0:3], a0, v1, a[2:5] ; encoding: [0x00,0x80,0xeb,0xd3,0x00,0x03,0x0a,0x0c]
+0x00,0x80,0xeb,0xd3,0x00,0x03,0x0a,0x0c
+
+# GFX90A: v_mfma_f32_4x4x2bf16 a[0:3], a0, v1, a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xeb,0xd3,0x00,0x03,0x0a,0xec]
+0x00,0x93,0xeb,0xd3,0x00,0x03,0x0a,0xec
+
+# GFX90A: v_mfma_f32_4x4x2bf16 a[0:3], a0, a1, a[2:5] ; encoding: [0x00,0x80,0xeb,0xd3,0x00,0x03,0x0a,0x1c]
+0x00,0x80,0xeb,0xd3,0x00,0x03,0x0a,0x1c
+
+# GFX90A: v_mfma_f32_4x4x2bf16 a[0:3], a0, a1, a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xeb,0xd3,0x00,0x03,0x0a,0xfc]
+0x00,0x93,0xeb,0xd3,0x00,0x03,0x0a,0xfc
+
+# GFX90A: v_mfma_f32_4x4x2bf16 v[0:3], v0, v1, v[2:5] ; encoding: [0x00,0x00,0xeb,0xd3,0x00,0x03,0x0a,0x04]
+0x00,0x00,0xeb,0xd3,0x00,0x03,0x0a,0x04
+
+# GFX90A: v_mfma_f32_4x4x2bf16 v[0:3], v0, v1, v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xeb,0xd3,0x00,0x03,0x0a,0xe4]
+0x00,0x13,0xeb,0xd3,0x00,0x03,0x0a,0xe4
+
+# GFX90A: v_mfma_f32_4x4x2bf16 v[0:3], v0, a1, v[2:5] ; encoding: [0x00,0x00,0xeb,0xd3,0x00,0x03,0x0a,0x14]
+0x00,0x00,0xeb,0xd3,0x00,0x03,0x0a,0x14
+
+# GFX90A: v_mfma_f32_4x4x2bf16 v[0:3], v0, a1, v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xeb,0xd3,0x00,0x03,0x0a,0xf4]
+0x00,0x13,0xeb,0xd3,0x00,0x03,0x0a,0xf4
+
+# GFX90A: v_mfma_f32_4x4x2bf16 v[0:3], a0, v1, v[2:5] ; encoding: [0x00,0x00,0xeb,0xd3,0x00,0x03,0x0a,0x0c]
+0x00,0x00,0xeb,0xd3,0x00,0x03,0x0a,0x0c
+
+# GFX90A: v_mfma_f32_4x4x2bf16 v[0:3], a0, v1, v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xeb,0xd3,0x00,0x03,0x0a,0xec]
+0x00,0x13,0xeb,0xd3,0x00,0x03,0x0a,0xec
+
+# GFX90A: v_mfma_f32_4x4x2bf16 v[0:3], a0, a1, v[2:5] ; encoding: [0x00,0x00,0xeb,0xd3,0x00,0x03,0x0a,0x1c]
+0x00,0x00,0xeb,0xd3,0x00,0x03,0x0a,0x1c
+
+# GFX90A: v_mfma_f32_4x4x2bf16 v[0:3], a0, a1, v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xeb,0xd3,0x00,0x03,0x0a,0xfc]
+0x00,0x13,0xeb,0xd3,0x00,0x03,0x0a,0xfc
+
+# GFX90A: v_mfma_f32_4x4x2bf16 a[0:3], v0, v1, -2.0 ; encoding: [0x00,0x80,0xeb,0xd3,0x00,0x03,0xd6,0x03]
+0x00,0x80,0xeb,0xd3,0x00,0x03,0xd6,0x03
+
+# GFX90A: v_mfma_f32_4x4x2bf16 v[0:3], v0, v1, -2.0 ; encoding: [0x00,0x00,0xeb,0xd3,0x00,0x03,0xd6,0x03]
+0x00,0x00,0xeb,0xd3,0x00,0x03,0xd6,0x03
+
+# GFX90A: v_mfma_f32_4x4x2bf16 a[0:3], v0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xeb,0xd3,0x00,0x03,0xd6,0xe3]
+0x00,0x93,0xeb,0xd3,0x00,0x03,0xd6,0xe3
+
+# GFX90A: v_mfma_f32_4x4x2bf16 v[0:3], v0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xeb,0xd3,0x00,0x03,0xd6,0xe3]
+0x00,0x13,0xeb,0xd3,0x00,0x03,0xd6,0xe3
+
+# GFX90A: v_mfma_f32_4x4x2bf16 a[0:3], v0, a1, -2.0 ; encoding: [0x00,0x80,0xeb,0xd3,0x00,0x03,0xd6,0x13]
+0x00,0x80,0xeb,0xd3,0x00,0x03,0xd6,0x13
+
+# GFX90A: v_mfma_f32_4x4x2bf16 v[0:3], v0, a1, -2.0 ; encoding: [0x00,0x00,0xeb,0xd3,0x00,0x03,0xd6,0x13]
+0x00,0x00,0xeb,0xd3,0x00,0x03,0xd6,0x13
+
+# GFX90A: v_mfma_f32_4x4x2bf16 a[0:3], v0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xeb,0xd3,0x00,0x03,0xd6,0xf3]
+0x00,0x93,0xeb,0xd3,0x00,0x03,0xd6,0xf3
+
+# GFX90A: v_mfma_f32_4x4x2bf16 v[0:3], v0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xeb,0xd3,0x00,0x03,0xd6,0xf3]
+0x00,0x13,0xeb,0xd3,0x00,0x03,0xd6,0xf3
+
+# GFX90A: v_mfma_f32_4x4x2bf16 a[0:3], a0, v1, -2.0 ; encoding: [0x00,0x80,0xeb,0xd3,0x00,0x03,0xd6,0x0b]
+0x00,0x80,0xeb,0xd3,0x00,0x03,0xd6,0x0b
+
+# GFX90A: v_mfma_f32_4x4x2bf16 v[0:3], a0, v1, -2.0 ; encoding: [0x00,0x00,0xeb,0xd3,0x00,0x03,0xd6,0x0b]
+0x00,0x00,0xeb,0xd3,0x00,0x03,0xd6,0x0b
+
+# GFX90A: v_mfma_f32_4x4x2bf16 a[0:3], a0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xeb,0xd3,0x00,0x03,0xd6,0xeb]
+0x00,0x93,0xeb,0xd3,0x00,0x03,0xd6,0xeb
+
+# GFX90A: v_mfma_f32_4x4x2bf16 v[0:3], a0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xeb,0xd3,0x00,0x03,0xd6,0xeb]
+0x00,0x13,0xeb,0xd3,0x00,0x03,0xd6,0xeb
+
+# GFX90A: v_mfma_f32_4x4x2bf16 a[0:3], a0, a1, -2.0 ; encoding: [0x00,0x80,0xeb,0xd3,0x00,0x03,0xd6,0x1b]
+0x00,0x80,0xeb,0xd3,0x00,0x03,0xd6,0x1b
+
+# GFX90A: v_mfma_f32_4x4x2bf16 v[0:3], a0, a1, -2.0 ; encoding: [0x00,0x00,0xeb,0xd3,0x00,0x03,0xd6,0x1b]
+0x00,0x00,0xeb,0xd3,0x00,0x03,0xd6,0x1b
+
+# GFX90A: v_mfma_f32_4x4x2bf16 a[0:3], a0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xeb,0xd3,0x00,0x03,0xd6,0xfb]
+0x00,0x93,0xeb,0xd3,0x00,0x03,0xd6,0xfb
+
+# GFX90A: v_mfma_f32_4x4x2bf16 v[0:3], a0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xeb,0xd3,0x00,0x03,0xd6,0xfb]
+0x00,0x13,0xeb,0xd3,0x00,0x03,0xd6,0xfb
+
+# GFX90A: v_mfma_f32_32x32x4bf16 a[0:15], v0, v1, a[2:17] ; encoding: [0x00,0x80,0xec,0xd3,0x00,0x03,0x0a,0x04]
+0x00,0x80,0xec,0xd3,0x00,0x03,0x0a,0x04
+
+# GFX90A: v_mfma_f32_32x32x4bf16 a[0:15], v0, v1, a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xec,0xd3,0x00,0x03,0x0a,0xe4]
+0x00,0x93,0xec,0xd3,0x00,0x03,0x0a,0xe4
+
+# GFX90A: v_mfma_f32_32x32x4bf16 a[0:15], v0, a1, a[2:17] ; encoding: [0x00,0x80,0xec,0xd3,0x00,0x03,0x0a,0x14]
+0x00,0x80,0xec,0xd3,0x00,0x03,0x0a,0x14
+
+# GFX90A: v_mfma_f32_32x32x4bf16 a[0:15], v0, a1, a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xec,0xd3,0x00,0x03,0x0a,0xf4]
+0x00,0x93,0xec,0xd3,0x00,0x03,0x0a,0xf4
+
+# GFX90A: v_mfma_f32_32x32x4bf16 a[0:15], a0, v1, a[2:17] ; encoding: [0x00,0x80,0xec,0xd3,0x00,0x03,0x0a,0x0c]
+0x00,0x80,0xec,0xd3,0x00,0x03,0x0a,0x0c
+
+# GFX90A: v_mfma_f32_32x32x4bf16 a[0:15], a0, v1, a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xec,0xd3,0x00,0x03,0x0a,0xec]
+0x00,0x93,0xec,0xd3,0x00,0x03,0x0a,0xec
+
+# GFX90A: v_mfma_f32_32x32x4bf16 a[0:15], a0, a1, a[2:17] ; encoding: [0x00,0x80,0xec,0xd3,0x00,0x03,0x0a,0x1c]
+0x00,0x80,0xec,0xd3,0x00,0x03,0x0a,0x1c
+
+# GFX90A: v_mfma_f32_32x32x4bf16 a[0:15], a0, a1, a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xec,0xd3,0x00,0x03,0x0a,0xfc]
+0x00,0x93,0xec,0xd3,0x00,0x03,0x0a,0xfc
+
+# GFX90A: v_mfma_f32_32x32x4bf16 v[0:15], v0, v1, v[2:17] ; encoding: [0x00,0x00,0xec,0xd3,0x00,0x03,0x0a,0x04]
+0x00,0x00,0xec,0xd3,0x00,0x03,0x0a,0x04
+
+# GFX90A: v_mfma_f32_32x32x4bf16 v[0:15], v0, v1, v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xec,0xd3,0x00,0x03,0x0a,0xe4]
+0x00,0x13,0xec,0xd3,0x00,0x03,0x0a,0xe4
+
+# GFX90A: v_mfma_f32_32x32x4bf16 v[0:15], v0, a1, v[2:17] ; encoding: [0x00,0x00,0xec,0xd3,0x00,0x03,0x0a,0x14]
+0x00,0x00,0xec,0xd3,0x00,0x03,0x0a,0x14
+
+# GFX90A: v_mfma_f32_32x32x4bf16 v[0:15], v0, a1, v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xec,0xd3,0x00,0x03,0x0a,0xf4]
+0x00,0x13,0xec,0xd3,0x00,0x03,0x0a,0xf4
+
+# GFX90A: v_mfma_f32_32x32x4bf16 v[0:15], a0, v1, v[2:17] ; encoding: [0x00,0x00,0xec,0xd3,0x00,0x03,0x0a,0x0c]
+0x00,0x00,0xec,0xd3,0x00,0x03,0x0a,0x0c
+
+# GFX90A: v_mfma_f32_32x32x4bf16 v[0:15], a0, v1, v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xec,0xd3,0x00,0x03,0x0a,0xec]
+0x00,0x13,0xec,0xd3,0x00,0x03,0x0a,0xec
+
+# GFX90A: v_mfma_f32_32x32x4bf16 v[0:15], a0, a1, v[2:17] ; encoding: [0x00,0x00,0xec,0xd3,0x00,0x03,0x0a,0x1c]
+0x00,0x00,0xec,0xd3,0x00,0x03,0x0a,0x1c
+
+# GFX90A: v_mfma_f32_32x32x4bf16 v[0:15], a0, a1, v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xec,0xd3,0x00,0x03,0x0a,0xfc]
+0x00,0x13,0xec,0xd3,0x00,0x03,0x0a,0xfc
+
+# GFX90A: v_mfma_f32_32x32x4bf16 a[0:15], v0, v1, -2.0 ; encoding: [0x00,0x80,0xec,0xd3,0x00,0x03,0xd6,0x03]
+0x00,0x80,0xec,0xd3,0x00,0x03,0xd6,0x03
+
+# GFX90A: v_mfma_f32_32x32x4bf16 v[0:15], v0, v1, -2.0 ; encoding: [0x00,0x00,0xec,0xd3,0x00,0x03,0xd6,0x03]
+0x00,0x00,0xec,0xd3,0x00,0x03,0xd6,0x03
+
+# GFX90A: v_mfma_f32_32x32x4bf16 a[0:15], v0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xec,0xd3,0x00,0x03,0xd6,0xe3]
+0x00,0x93,0xec,0xd3,0x00,0x03,0xd6,0xe3
+
+# GFX90A: v_mfma_f32_32x32x4bf16 v[0:15], v0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xec,0xd3,0x00,0x03,0xd6,0xe3]
+0x00,0x13,0xec,0xd3,0x00,0x03,0xd6,0xe3
+
+# GFX90A: v_mfma_f32_32x32x4bf16 a[0:15], v0, a1, -2.0 ; encoding: [0x00,0x80,0xec,0xd3,0x00,0x03,0xd6,0x13]
+0x00,0x80,0xec,0xd3,0x00,0x03,0xd6,0x13
+
+# GFX90A: v_mfma_f32_32x32x4bf16 v[0:15], v0, a1, -2.0 ; encoding: [0x00,0x00,0xec,0xd3,0x00,0x03,0xd6,0x13]
+0x00,0x00,0xec,0xd3,0x00,0x03,0xd6,0x13
+
+# GFX90A: v_mfma_f32_32x32x4bf16 a[0:15], v0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xec,0xd3,0x00,0x03,0xd6,0xf3]
+0x00,0x93,0xec,0xd3,0x00,0x03,0xd6,0xf3
+
+# GFX90A: v_mfma_f32_32x32x4bf16 v[0:15], v0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xec,0xd3,0x00,0x03,0xd6,0xf3]
+0x00,0x13,0xec,0xd3,0x00,0x03,0xd6,0xf3
+
+# GFX90A: v_mfma_f32_32x32x4bf16 a[0:15], a0, v1, -2.0 ; encoding: [0x00,0x80,0xec,0xd3,0x00,0x03,0xd6,0x0b]
+0x00,0x80,0xec,0xd3,0x00,0x03,0xd6,0x0b
+
+# GFX90A: v_mfma_f32_32x32x4bf16 v[0:15], a0, v1, -2.0 ; encoding: [0x00,0x00,0xec,0xd3,0x00,0x03,0xd6,0x0b]
+0x00,0x00,0xec,0xd3,0x00,0x03,0xd6,0x0b
+
+# GFX90A: v_mfma_f32_32x32x4bf16 a[0:15], a0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xec,0xd3,0x00,0x03,0xd6,0xeb]
+0x00,0x93,0xec,0xd3,0x00,0x03,0xd6,0xeb
+
+# GFX90A: v_mfma_f32_32x32x4bf16 v[0:15], a0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xec,0xd3,0x00,0x03,0xd6,0xeb]
+0x00,0x13,0xec,0xd3,0x00,0x03,0xd6,0xeb
+
+# GFX90A: v_mfma_f32_32x32x4bf16 a[0:15], a0, a1, -2.0 ; encoding: [0x00,0x80,0xec,0xd3,0x00,0x03,0xd6,0x1b]
+0x00,0x80,0xec,0xd3,0x00,0x03,0xd6,0x1b
+
+# GFX90A: v_mfma_f32_32x32x4bf16 v[0:15], a0, a1, -2.0 ; encoding: [0x00,0x00,0xec,0xd3,0x00,0x03,0xd6,0x1b]
+0x00,0x00,0xec,0xd3,0x00,0x03,0xd6,0x1b
+
+# GFX90A: v_mfma_f32_32x32x4bf16 a[0:15], a0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xec,0xd3,0x00,0x03,0xd6,0xfb]
+0x00,0x93,0xec,0xd3,0x00,0x03,0xd6,0xfb
+
+# GFX90A: v_mfma_f32_32x32x4bf16 v[0:15], a0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xec,0xd3,0x00,0x03,0xd6,0xfb]
+0x00,0x13,0xec,0xd3,0x00,0x03,0xd6,0xfb
+
+# GFX90A: v_mfma_f32_16x16x8bf16 a[0:3], v0, v1, a[2:5] ; encoding: [0x00,0x80,0xed,0xd3,0x00,0x03,0x0a,0x04]
+0x00,0x80,0xed,0xd3,0x00,0x03,0x0a,0x04
+
+# GFX90A: v_mfma_f32_16x16x8bf16 a[0:3], v0, v1, a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xed,0xd3,0x00,0x03,0x0a,0xe4]
+0x00,0x93,0xed,0xd3,0x00,0x03,0x0a,0xe4
+
+# GFX90A: v_mfma_f32_16x16x8bf16 a[0:3], v0, a1, a[2:5] ; encoding: [0x00,0x80,0xed,0xd3,0x00,0x03,0x0a,0x14]
+0x00,0x80,0xed,0xd3,0x00,0x03,0x0a,0x14
+
+# GFX90A: v_mfma_f32_16x16x8bf16 a[0:3], v0, a1, a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xed,0xd3,0x00,0x03,0x0a,0xf4]
+0x00,0x93,0xed,0xd3,0x00,0x03,0x0a,0xf4
+
+# GFX90A: v_mfma_f32_16x16x8bf16 a[0:3], a0, v1, a[2:5] ; encoding: [0x00,0x80,0xed,0xd3,0x00,0x03,0x0a,0x0c]
+0x00,0x80,0xed,0xd3,0x00,0x03,0x0a,0x0c
+
+# GFX90A: v_mfma_f32_16x16x8bf16 a[0:3], a0, v1, a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xed,0xd3,0x00,0x03,0x0a,0xec]
+0x00,0x93,0xed,0xd3,0x00,0x03,0x0a,0xec
+
+# GFX90A: v_mfma_f32_16x16x8bf16 a[0:3], a0, a1, a[2:5] ; encoding: [0x00,0x80,0xed,0xd3,0x00,0x03,0x0a,0x1c]
+0x00,0x80,0xed,0xd3,0x00,0x03,0x0a,0x1c
+
+# GFX90A: v_mfma_f32_16x16x8bf16 a[0:3], a0, a1, a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xed,0xd3,0x00,0x03,0x0a,0xfc]
+0x00,0x93,0xed,0xd3,0x00,0x03,0x0a,0xfc
+
+# GFX90A: v_mfma_f32_16x16x8bf16 v[0:3], v0, v1, v[2:5] ; encoding: [0x00,0x00,0xed,0xd3,0x00,0x03,0x0a,0x04]
+0x00,0x00,0xed,0xd3,0x00,0x03,0x0a,0x04
+
+# GFX90A: v_mfma_f32_16x16x8bf16 v[0:3], v0, v1, v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xed,0xd3,0x00,0x03,0x0a,0xe4]
+0x00,0x13,0xed,0xd3,0x00,0x03,0x0a,0xe4
+
+# GFX90A: v_mfma_f32_16x16x8bf16 v[0:3], v0, a1, v[2:5] ; encoding: [0x00,0x00,0xed,0xd3,0x00,0x03,0x0a,0x14]
+0x00,0x00,0xed,0xd3,0x00,0x03,0x0a,0x14
+
+# GFX90A: v_mfma_f32_16x16x8bf16 v[0:3], v0, a1, v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xed,0xd3,0x00,0x03,0x0a,0xf4]
+0x00,0x13,0xed,0xd3,0x00,0x03,0x0a,0xf4
+
+# GFX90A: v_mfma_f32_16x16x8bf16 v[0:3], a0, v1, v[2:5] ; encoding: [0x00,0x00,0xed,0xd3,0x00,0x03,0x0a,0x0c]
+0x00,0x00,0xed,0xd3,0x00,0x03,0x0a,0x0c
+
+# GFX90A: v_mfma_f32_16x16x8bf16 v[0:3], a0, v1, v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xed,0xd3,0x00,0x03,0x0a,0xec]
+0x00,0x13,0xed,0xd3,0x00,0x03,0x0a,0xec
+
+# GFX90A: v_mfma_f32_16x16x8bf16 v[0:3], a0, a1, v[2:5] ; encoding: [0x00,0x00,0xed,0xd3,0x00,0x03,0x0a,0x1c]
+0x00,0x00,0xed,0xd3,0x00,0x03,0x0a,0x1c
+
+# GFX90A: v_mfma_f32_16x16x8bf16 v[0:3], a0, a1, v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xed,0xd3,0x00,0x03,0x0a,0xfc]
+0x00,0x13,0xed,0xd3,0x00,0x03,0x0a,0xfc
+
+# GFX90A: v_mfma_f32_16x16x8bf16 a[0:3], v0, v1, -2.0 ; encoding: [0x00,0x80,0xed,0xd3,0x00,0x03,0xd6,0x03]
+0x00,0x80,0xed,0xd3,0x00,0x03,0xd6,0x03
+
+# GFX90A: v_mfma_f32_16x16x8bf16 v[0:3], v0, v1, -2.0 ; encoding: [0x00,0x00,0xed,0xd3,0x00,0x03,0xd6,0x03]
+0x00,0x00,0xed,0xd3,0x00,0x03,0xd6,0x03
+
+# GFX90A: v_mfma_f32_16x16x8bf16 a[0:3], v0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xed,0xd3,0x00,0x03,0xd6,0xe3]
+0x00,0x93,0xed,0xd3,0x00,0x03,0xd6,0xe3
+
+# GFX90A: v_mfma_f32_16x16x8bf16 v[0:3], v0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xed,0xd3,0x00,0x03,0xd6,0xe3]
+0x00,0x13,0xed,0xd3,0x00,0x03,0xd6,0xe3
+
+# GFX90A: v_mfma_f32_16x16x8bf16 a[0:3], v0, a1, -2.0 ; encoding: [0x00,0x80,0xed,0xd3,0x00,0x03,0xd6,0x13]
+0x00,0x80,0xed,0xd3,0x00,0x03,0xd6,0x13
+
+# GFX90A: v_mfma_f32_16x16x8bf16 v[0:3], v0, a1, -2.0 ; encoding: [0x00,0x00,0xed,0xd3,0x00,0x03,0xd6,0x13]
+0x00,0x00,0xed,0xd3,0x00,0x03,0xd6,0x13
+
+# GFX90A: v_mfma_f32_16x16x8bf16 a[0:3], v0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xed,0xd3,0x00,0x03,0xd6,0xf3]
+0x00,0x93,0xed,0xd3,0x00,0x03,0xd6,0xf3
+
+# GFX90A: v_mfma_f32_16x16x8bf16 v[0:3], v0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xed,0xd3,0x00,0x03,0xd6,0xf3]
+0x00,0x13,0xed,0xd3,0x00,0x03,0xd6,0xf3
+
+# GFX90A: v_mfma_f32_16x16x8bf16 a[0:3], a0, v1, -2.0 ; encoding: [0x00,0x80,0xed,0xd3,0x00,0x03,0xd6,0x0b]
+0x00,0x80,0xed,0xd3,0x00,0x03,0xd6,0x0b
+
+# GFX90A: v_mfma_f32_16x16x8bf16 v[0:3], a0, v1, -2.0 ; encoding: [0x00,0x00,0xed,0xd3,0x00,0x03,0xd6,0x0b]
+0x00,0x00,0xed,0xd3,0x00,0x03,0xd6,0x0b
+
+# GFX90A: v_mfma_f32_16x16x8bf16 a[0:3], a0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xed,0xd3,0x00,0x03,0xd6,0xeb]
+0x00,0x93,0xed,0xd3,0x00,0x03,0xd6,0xeb
+
+# GFX90A: v_mfma_f32_16x16x8bf16 v[0:3], a0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xed,0xd3,0x00,0x03,0xd6,0xeb]
+0x00,0x13,0xed,0xd3,0x00,0x03,0xd6,0xeb
+
+# GFX90A: v_mfma_f32_16x16x8bf16 a[0:3], a0, a1, -2.0 ; encoding: [0x00,0x80,0xed,0xd3,0x00,0x03,0xd6,0x1b]
+0x00,0x80,0xed,0xd3,0x00,0x03,0xd6,0x1b
+
+# GFX90A: v_mfma_f32_16x16x8bf16 v[0:3], a0, a1, -2.0 ; encoding: [0x00,0x00,0xed,0xd3,0x00,0x03,0xd6,0x1b]
+0x00,0x00,0xed,0xd3,0x00,0x03,0xd6,0x1b
+
+# GFX90A: v_mfma_f32_16x16x8bf16 a[0:3], a0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xed,0xd3,0x00,0x03,0xd6,0xfb]
+0x00,0x93,0xed,0xd3,0x00,0x03,0xd6,0xfb
+
+# GFX90A: v_mfma_f32_16x16x8bf16 v[0:3], a0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xed,0xd3,0x00,0x03,0xd6,0xfb]
+0x00,0x13,0xed,0xd3,0x00,0x03,0xd6,0xfb
+
+# GFX90A: v_mfma_f32_32x32x4bf16_1k a[0:31], v[0:1], v[2:3], a[2:33] ; encoding: [0x00,0x80,0xe3,0xd3,0x00,0x05,0x0a,0x04]
+0x00,0x80,0xe3,0xd3,0x00,0x05,0x0a,0x04
+
+# GFX90A: v_mfma_f32_32x32x4bf16_1k a[0:31], v[0:1], v[2:3], a[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe3,0xd3,0x00,0x05,0x0a,0xe4]
+0x00,0x93,0xe3,0xd3,0x00,0x05,0x0a,0xe4
+
+# GFX90A: v_mfma_f32_32x32x4bf16_1k a[0:31], v[0:1], a[2:3], a[2:33] ; encoding: [0x00,0x80,0xe3,0xd3,0x00,0x05,0x0a,0x14]
+0x00,0x80,0xe3,0xd3,0x00,0x05,0x0a,0x14
+
+# GFX90A: v_mfma_f32_32x32x4bf16_1k a[0:31], v[0:1], a[2:3], a[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe3,0xd3,0x00,0x05,0x0a,0xf4]
+0x00,0x93,0xe3,0xd3,0x00,0x05,0x0a,0xf4
+
+# GFX90A: v_mfma_f32_32x32x4bf16_1k a[0:31], a[0:1], v[2:3], a[2:33] ; encoding: [0x00,0x80,0xe3,0xd3,0x00,0x05,0x0a,0x0c]
+0x00,0x80,0xe3,0xd3,0x00,0x05,0x0a,0x0c
+
+# GFX90A: v_mfma_f32_32x32x4bf16_1k a[0:31], a[0:1], v[2:3], a[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe3,0xd3,0x00,0x05,0x0a,0xec]
+0x00,0x93,0xe3,0xd3,0x00,0x05,0x0a,0xec
+
+# GFX90A: v_mfma_f32_32x32x4bf16_1k a[0:31], a[0:1], a[2:3], a[2:33] ; encoding: [0x00,0x80,0xe3,0xd3,0x00,0x05,0x0a,0x1c]
+0x00,0x80,0xe3,0xd3,0x00,0x05,0x0a,0x1c
+
+# GFX90A: v_mfma_f32_32x32x4bf16_1k a[0:31], a[0:1], a[2:3], a[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe3,0xd3,0x00,0x05,0x0a,0xfc]
+0x00,0x93,0xe3,0xd3,0x00,0x05,0x0a,0xfc
+
+# GFX90A: v_mfma_f32_32x32x4bf16_1k v[0:31], v[0:1], v[2:3], v[2:33] ; encoding: [0x00,0x00,0xe3,0xd3,0x00,0x05,0x0a,0x04]
+0x00,0x00,0xe3,0xd3,0x00,0x05,0x0a,0x04
+
+# GFX90A: v_mfma_f32_32x32x4bf16_1k v[0:31], v[0:1], v[2:3], v[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe3,0xd3,0x00,0x05,0x0a,0xe4]
+0x00,0x13,0xe3,0xd3,0x00,0x05,0x0a,0xe4
+
+# GFX90A: v_mfma_f32_32x32x4bf16_1k v[0:31], v[0:1], a[2:3], v[2:33] ; encoding: [0x00,0x00,0xe3,0xd3,0x00,0x05,0x0a,0x14]
+0x00,0x00,0xe3,0xd3,0x00,0x05,0x0a,0x14
+
+# GFX90A: v_mfma_f32_32x32x4bf16_1k v[0:31], v[0:1], a[2:3], v[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe3,0xd3,0x00,0x05,0x0a,0xf4]
+0x00,0x13,0xe3,0xd3,0x00,0x05,0x0a,0xf4
+
+# GFX90A: v_mfma_f32_32x32x4bf16_1k v[0:31], a[0:1], v[2:3], v[2:33] ; encoding: [0x00,0x00,0xe3,0xd3,0x00,0x05,0x0a,0x0c]
+0x00,0x00,0xe3,0xd3,0x00,0x05,0x0a,0x0c
+
+# GFX90A: v_mfma_f32_32x32x4bf16_1k v[0:31], a[0:1], v[2:3], v[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe3,0xd3,0x00,0x05,0x0a,0xec]
+0x00,0x13,0xe3,0xd3,0x00,0x05,0x0a,0xec
+
+# GFX90A: v_mfma_f32_32x32x4bf16_1k v[0:31], a[0:1], a[2:3], v[2:33] ; encoding: [0x00,0x00,0xe3,0xd3,0x00,0x05,0x0a,0x1c]
+0x00,0x00,0xe3,0xd3,0x00,0x05,0x0a,0x1c
+
+# GFX90A: v_mfma_f32_32x32x4bf16_1k v[0:31], a[0:1], a[2:3], v[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe3,0xd3,0x00,0x05,0x0a,0xfc]
+0x00,0x13,0xe3,0xd3,0x00,0x05,0x0a,0xfc
+
+# GFX90A: v_mfma_f32_32x32x4bf16_1k a[0:31], v[0:1], v[2:3], -2.0 ; encoding: [0x00,0x80,0xe3,0xd3,0x00,0x05,0xd6,0x03]
+0x00,0x80,0xe3,0xd3,0x00,0x05,0xd6,0x03
+
+# GFX90A: v_mfma_f32_32x32x4bf16_1k v[0:31], v[0:1], v[2:3], -2.0 ; encoding: [0x00,0x00,0xe3,0xd3,0x00,0x05,0xd6,0x03]
+0x00,0x00,0xe3,0xd3,0x00,0x05,0xd6,0x03
+
+# GFX90A: v_mfma_f32_32x32x4bf16_1k a[0:31], v[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe3,0xd3,0x00,0x05,0xd6,0xe3]
+0x00,0x93,0xe3,0xd3,0x00,0x05,0xd6,0xe3
+
+# GFX90A: v_mfma_f32_32x32x4bf16_1k v[0:31], v[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe3,0xd3,0x00,0x05,0xd6,0xe3]
+0x00,0x13,0xe3,0xd3,0x00,0x05,0xd6,0xe3
+
+# GFX90A: v_mfma_f32_32x32x4bf16_1k a[0:31], v[0:1], a[2:3], -2.0 ; encoding: [0x00,0x80,0xe3,0xd3,0x00,0x05,0xd6,0x13]
+0x00,0x80,0xe3,0xd3,0x00,0x05,0xd6,0x13
+
+# GFX90A: v_mfma_f32_32x32x4bf16_1k v[0:31], v[0:1], a[2:3], -2.0 ; encoding: [0x00,0x00,0xe3,0xd3,0x00,0x05,0xd6,0x13]
+0x00,0x00,0xe3,0xd3,0x00,0x05,0xd6,0x13
+
+# GFX90A: v_mfma_f32_32x32x4bf16_1k a[0:31], v[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe3,0xd3,0x00,0x05,0xd6,0xf3]
+0x00,0x93,0xe3,0xd3,0x00,0x05,0xd6,0xf3
+
+# GFX90A: v_mfma_f32_32x32x4bf16_1k v[0:31], v[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe3,0xd3,0x00,0x05,0xd6,0xf3]
+0x00,0x13,0xe3,0xd3,0x00,0x05,0xd6,0xf3
+
+# GFX90A: v_mfma_f32_32x32x4bf16_1k a[0:31], a[0:1], v[2:3], -2.0 ; encoding: [0x00,0x80,0xe3,0xd3,0x00,0x05,0xd6,0x0b]
+0x00,0x80,0xe3,0xd3,0x00,0x05,0xd6,0x0b
+
+# GFX90A: v_mfma_f32_32x32x4bf16_1k v[0:31], a[0:1], v[2:3], -2.0 ; encoding: [0x00,0x00,0xe3,0xd3,0x00,0x05,0xd6,0x0b]
+0x00,0x00,0xe3,0xd3,0x00,0x05,0xd6,0x0b
+
+# GFX90A: v_mfma_f32_32x32x4bf16_1k a[0:31], a[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe3,0xd3,0x00,0x05,0xd6,0xeb]
+0x00,0x93,0xe3,0xd3,0x00,0x05,0xd6,0xeb
+
+# GFX90A: v_mfma_f32_32x32x4bf16_1k v[0:31], a[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe3,0xd3,0x00,0x05,0xd6,0xeb]
+0x00,0x13,0xe3,0xd3,0x00,0x05,0xd6,0xeb
+
+# GFX90A: v_mfma_f32_32x32x4bf16_1k a[0:31], a[0:1], a[2:3], -2.0 ; encoding: [0x00,0x80,0xe3,0xd3,0x00,0x05,0xd6,0x1b]
+0x00,0x80,0xe3,0xd3,0x00,0x05,0xd6,0x1b
+
+# GFX90A: v_mfma_f32_32x32x4bf16_1k v[0:31], a[0:1], a[2:3], -2.0 ; encoding: [0x00,0x00,0xe3,0xd3,0x00,0x05,0xd6,0x1b]
+0x00,0x00,0xe3,0xd3,0x00,0x05,0xd6,0x1b
+
+# GFX90A: v_mfma_f32_32x32x4bf16_1k a[0:31], a[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe3,0xd3,0x00,0x05,0xd6,0xfb]
+0x00,0x93,0xe3,0xd3,0x00,0x05,0xd6,0xfb
+
+# GFX90A: v_mfma_f32_32x32x4bf16_1k v[0:31], a[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe3,0xd3,0x00,0x05,0xd6,0xfb]
+0x00,0x13,0xe3,0xd3,0x00,0x05,0xd6,0xfb
+
+# GFX90A: v_mfma_f32_16x16x4bf16_1k a[0:15], v[0:1], v[2:3], a[2:17] ; encoding: [0x00,0x80,0xe4,0xd3,0x00,0x05,0x0a,0x04]
+0x00,0x80,0xe4,0xd3,0x00,0x05,0x0a,0x04
+
+# GFX90A: v_mfma_f32_16x16x4bf16_1k a[0:15], v[0:1], v[2:3], a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe4,0xd3,0x00,0x05,0x0a,0xe4]
+0x00,0x93,0xe4,0xd3,0x00,0x05,0x0a,0xe4
+
+# GFX90A: v_mfma_f32_16x16x4bf16_1k a[0:15], v[0:1], a[2:3], a[2:17] ; encoding: [0x00,0x80,0xe4,0xd3,0x00,0x05,0x0a,0x14]
+0x00,0x80,0xe4,0xd3,0x00,0x05,0x0a,0x14
+
+# GFX90A: v_mfma_f32_16x16x4bf16_1k a[0:15], v[0:1], a[2:3], a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe4,0xd3,0x00,0x05,0x0a,0xf4]
+0x00,0x93,0xe4,0xd3,0x00,0x05,0x0a,0xf4
+
+# GFX90A: v_mfma_f32_16x16x4bf16_1k a[0:15], a[0:1], v[2:3], a[2:17] ; encoding: [0x00,0x80,0xe4,0xd3,0x00,0x05,0x0a,0x0c]
+0x00,0x80,0xe4,0xd3,0x00,0x05,0x0a,0x0c
+
+# GFX90A: v_mfma_f32_16x16x4bf16_1k a[0:15], a[0:1], v[2:3], a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe4,0xd3,0x00,0x05,0x0a,0xec]
+0x00,0x93,0xe4,0xd3,0x00,0x05,0x0a,0xec
+
+# GFX90A: v_mfma_f32_16x16x4bf16_1k a[0:15], a[0:1], a[2:3], a[2:17] ; encoding: [0x00,0x80,0xe4,0xd3,0x00,0x05,0x0a,0x1c]
+0x00,0x80,0xe4,0xd3,0x00,0x05,0x0a,0x1c
+
+# GFX90A: v_mfma_f32_16x16x4bf16_1k a[0:15], a[0:1], a[2:3], a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe4,0xd3,0x00,0x05,0x0a,0xfc]
+0x00,0x93,0xe4,0xd3,0x00,0x05,0x0a,0xfc
+
+# GFX90A: v_mfma_f32_16x16x4bf16_1k v[0:15], v[0:1], v[2:3], v[2:17] ; encoding: [0x00,0x00,0xe4,0xd3,0x00,0x05,0x0a,0x04]
+0x00,0x00,0xe4,0xd3,0x00,0x05,0x0a,0x04
+
+# GFX90A: v_mfma_f32_16x16x4bf16_1k v[0:15], v[0:1], v[2:3], v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe4,0xd3,0x00,0x05,0x0a,0xe4]
+0x00,0x13,0xe4,0xd3,0x00,0x05,0x0a,0xe4
+
+# GFX90A: v_mfma_f32_16x16x4bf16_1k v[0:15], v[0:1], a[2:3], v[2:17] ; encoding: [0x00,0x00,0xe4,0xd3,0x00,0x05,0x0a,0x14]
+0x00,0x00,0xe4,0xd3,0x00,0x05,0x0a,0x14
+
+# GFX90A: v_mfma_f32_16x16x4bf16_1k v[0:15], v[0:1], a[2:3], v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe4,0xd3,0x00,0x05,0x0a,0xf4]
+0x00,0x13,0xe4,0xd3,0x00,0x05,0x0a,0xf4
+
+# GFX90A: v_mfma_f32_16x16x4bf16_1k v[0:15], a[0:1], v[2:3], v[2:17] ; encoding: [0x00,0x00,0xe4,0xd3,0x00,0x05,0x0a,0x0c]
+0x00,0x00,0xe4,0xd3,0x00,0x05,0x0a,0x0c
+
+# GFX90A: v_mfma_f32_16x16x4bf16_1k v[0:15], a[0:1], v[2:3], v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe4,0xd3,0x00,0x05,0x0a,0xec]
+0x00,0x13,0xe4,0xd3,0x00,0x05,0x0a,0xec
+
+# GFX90A: v_mfma_f32_16x16x4bf16_1k v[0:15], a[0:1], a[2:3], v[2:17] ; encoding: [0x00,0x00,0xe4,0xd3,0x00,0x05,0x0a,0x1c]
+0x00,0x00,0xe4,0xd3,0x00,0x05,0x0a,0x1c
+
+# GFX90A: v_mfma_f32_16x16x4bf16_1k v[0:15], a[0:1], a[2:3], v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe4,0xd3,0x00,0x05,0x0a,0xfc]
+0x00,0x13,0xe4,0xd3,0x00,0x05,0x0a,0xfc
+
+# GFX90A: v_mfma_f32_16x16x4bf16_1k a[0:15], v[0:1], v[2:3], -2.0 ; encoding: [0x00,0x80,0xe4,0xd3,0x00,0x05,0xd6,0x03]
+0x00,0x80,0xe4,0xd3,0x00,0x05,0xd6,0x03
+
+# GFX90A: v_mfma_f32_16x16x4bf16_1k v[0:15], v[0:1], v[2:3], -2.0 ; encoding: [0x00,0x00,0xe4,0xd3,0x00,0x05,0xd6,0x03]
+0x00,0x00,0xe4,0xd3,0x00,0x05,0xd6,0x03
+
+# GFX90A: v_mfma_f32_16x16x4bf16_1k a[0:15], v[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe4,0xd3,0x00,0x05,0xd6,0xe3]
+0x00,0x93,0xe4,0xd3,0x00,0x05,0xd6,0xe3
+
+# GFX90A: v_mfma_f32_16x16x4bf16_1k v[0:15], v[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe4,0xd3,0x00,0x05,0xd6,0xe3]
+0x00,0x13,0xe4,0xd3,0x00,0x05,0xd6,0xe3
+
+# GFX90A: v_mfma_f32_16x16x4bf16_1k a[0:15], v[0:1], a[2:3], -2.0 ; encoding: [0x00,0x80,0xe4,0xd3,0x00,0x05,0xd6,0x13]
+0x00,0x80,0xe4,0xd3,0x00,0x05,0xd6,0x13
+
+# GFX90A: v_mfma_f32_16x16x4bf16_1k v[0:15], v[0:1], a[2:3], -2.0 ; encoding: [0x00,0x00,0xe4,0xd3,0x00,0x05,0xd6,0x13]
+0x00,0x00,0xe4,0xd3,0x00,0x05,0xd6,0x13
+
+# GFX90A: v_mfma_f32_16x16x4bf16_1k a[0:15], v[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe4,0xd3,0x00,0x05,0xd6,0xf3]
+0x00,0x93,0xe4,0xd3,0x00,0x05,0xd6,0xf3
+
+# GFX90A: v_mfma_f32_16x16x4bf16_1k v[0:15], v[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe4,0xd3,0x00,0x05,0xd6,0xf3]
+0x00,0x13,0xe4,0xd3,0x00,0x05,0xd6,0xf3
+
+# GFX90A: v_mfma_f32_16x16x4bf16_1k a[0:15], a[0:1], v[2:3], -2.0 ; encoding: [0x00,0x80,0xe4,0xd3,0x00,0x05,0xd6,0x0b]
+0x00,0x80,0xe4,0xd3,0x00,0x05,0xd6,0x0b
+
+# GFX90A: v_mfma_f32_16x16x4bf16_1k v[0:15], a[0:1], v[2:3], -2.0 ; encoding: [0x00,0x00,0xe4,0xd3,0x00,0x05,0xd6,0x0b]
+0x00,0x00,0xe4,0xd3,0x00,0x05,0xd6,0x0b
+
+# GFX90A: v_mfma_f32_16x16x4bf16_1k a[0:15], a[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe4,0xd3,0x00,0x05,0xd6,0xeb]
+0x00,0x93,0xe4,0xd3,0x00,0x05,0xd6,0xeb
+
+# GFX90A: v_mfma_f32_16x16x4bf16_1k v[0:15], a[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe4,0xd3,0x00,0x05,0xd6,0xeb]
+0x00,0x13,0xe4,0xd3,0x00,0x05,0xd6,0xeb
+
+# GFX90A: v_mfma_f32_16x16x4bf16_1k a[0:15], a[0:1], a[2:3], -2.0 ; encoding: [0x00,0x80,0xe4,0xd3,0x00,0x05,0xd6,0x1b]
+0x00,0x80,0xe4,0xd3,0x00,0x05,0xd6,0x1b
+
+# GFX90A: v_mfma_f32_16x16x4bf16_1k v[0:15], a[0:1], a[2:3], -2.0 ; encoding: [0x00,0x00,0xe4,0xd3,0x00,0x05,0xd6,0x1b]
+0x00,0x00,0xe4,0xd3,0x00,0x05,0xd6,0x1b
+
+# GFX90A: v_mfma_f32_16x16x4bf16_1k a[0:15], a[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe4,0xd3,0x00,0x05,0xd6,0xfb]
+0x00,0x93,0xe4,0xd3,0x00,0x05,0xd6,0xfb
+
+# GFX90A: v_mfma_f32_16x16x4bf16_1k v[0:15], a[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe4,0xd3,0x00,0x05,0xd6,0xfb]
+0x00,0x13,0xe4,0xd3,0x00,0x05,0xd6,0xfb
+
+# GFX90A: v_mfma_f32_4x4x4bf16_1k a[0:3], v[0:1], v[2:3], a[2:5] ; encoding: [0x00,0x80,0xe5,0xd3,0x00,0x05,0x0a,0x04]
+0x00,0x80,0xe5,0xd3,0x00,0x05,0x0a,0x04
+
+# GFX90A: v_mfma_f32_4x4x4bf16_1k a[0:3], v[0:1], v[2:3], a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe5,0xd3,0x00,0x05,0x0a,0xe4]
+0x00,0x93,0xe5,0xd3,0x00,0x05,0x0a,0xe4
+
+# GFX90A: v_mfma_f32_4x4x4bf16_1k a[0:3], v[0:1], a[2:3], a[2:5] ; encoding: [0x00,0x80,0xe5,0xd3,0x00,0x05,0x0a,0x14]
+0x00,0x80,0xe5,0xd3,0x00,0x05,0x0a,0x14
+
+# GFX90A: v_mfma_f32_4x4x4bf16_1k a[0:3], v[0:1], a[2:3], a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe5,0xd3,0x00,0x05,0x0a,0xf4]
+0x00,0x93,0xe5,0xd3,0x00,0x05,0x0a,0xf4
+
+# GFX90A: v_mfma_f32_4x4x4bf16_1k a[0:3], a[0:1], v[2:3], a[2:5] ; encoding: [0x00,0x80,0xe5,0xd3,0x00,0x05,0x0a,0x0c]
+0x00,0x80,0xe5,0xd3,0x00,0x05,0x0a,0x0c
+
+# GFX90A: v_mfma_f32_4x4x4bf16_1k a[0:3], a[0:1], v[2:3], a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe5,0xd3,0x00,0x05,0x0a,0xec]
+0x00,0x93,0xe5,0xd3,0x00,0x05,0x0a,0xec
+
+# GFX90A: v_mfma_f32_4x4x4bf16_1k a[0:3], a[0:1], a[2:3], a[2:5] ; encoding: [0x00,0x80,0xe5,0xd3,0x00,0x05,0x0a,0x1c]
+0x00,0x80,0xe5,0xd3,0x00,0x05,0x0a,0x1c
+
+# GFX90A: v_mfma_f32_4x4x4bf16_1k a[0:3], a[0:1], a[2:3], a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe5,0xd3,0x00,0x05,0x0a,0xfc]
+0x00,0x93,0xe5,0xd3,0x00,0x05,0x0a,0xfc
+
+# GFX90A: v_mfma_f32_4x4x4bf16_1k v[0:3], v[0:1], v[2:3], v[2:5] ; encoding: [0x00,0x00,0xe5,0xd3,0x00,0x05,0x0a,0x04]
+0x00,0x00,0xe5,0xd3,0x00,0x05,0x0a,0x04
+
+# GFX90A: v_mfma_f32_4x4x4bf16_1k v[0:3], v[0:1], v[2:3], v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe5,0xd3,0x00,0x05,0x0a,0xe4]
+0x00,0x13,0xe5,0xd3,0x00,0x05,0x0a,0xe4
+
+# GFX90A: v_mfma_f32_4x4x4bf16_1k v[0:3], v[0:1], a[2:3], v[2:5] ; encoding: [0x00,0x00,0xe5,0xd3,0x00,0x05,0x0a,0x14]
+0x00,0x00,0xe5,0xd3,0x00,0x05,0x0a,0x14
+
+# GFX90A: v_mfma_f32_4x4x4bf16_1k v[0:3], v[0:1], a[2:3], v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe5,0xd3,0x00,0x05,0x0a,0xf4]
+0x00,0x13,0xe5,0xd3,0x00,0x05,0x0a,0xf4
+
+# GFX90A: v_mfma_f32_4x4x4bf16_1k v[0:3], a[0:1], v[2:3], v[2:5] ; encoding: [0x00,0x00,0xe5,0xd3,0x00,0x05,0x0a,0x0c]
+0x00,0x00,0xe5,0xd3,0x00,0x05,0x0a,0x0c
+
+# GFX90A: v_mfma_f32_4x4x4bf16_1k v[0:3], a[0:1], v[2:3], v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe5,0xd3,0x00,0x05,0x0a,0xec]
+0x00,0x13,0xe5,0xd3,0x00,0x05,0x0a,0xec
+
+# GFX90A: v_mfma_f32_4x4x4bf16_1k v[0:3], a[0:1], a[2:3], v[2:5] ; encoding: [0x00,0x00,0xe5,0xd3,0x00,0x05,0x0a,0x1c]
+0x00,0x00,0xe5,0xd3,0x00,0x05,0x0a,0x1c
+
+# GFX90A: v_mfma_f32_4x4x4bf16_1k v[0:3], a[0:1], a[2:3], v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe5,0xd3,0x00,0x05,0x0a,0xfc]
+0x00,0x13,0xe5,0xd3,0x00,0x05,0x0a,0xfc
+
+# GFX90A: v_mfma_f32_4x4x4bf16_1k a[0:3], v[0:1], v[2:3], -2.0 ; encoding: [0x00,0x80,0xe5,0xd3,0x00,0x05,0xd6,0x03]
+0x00,0x80,0xe5,0xd3,0x00,0x05,0xd6,0x03
+
+# GFX90A: v_mfma_f32_4x4x4bf16_1k v[0:3], v[0:1], v[2:3], -2.0 ; encoding: [0x00,0x00,0xe5,0xd3,0x00,0x05,0xd6,0x03]
+0x00,0x00,0xe5,0xd3,0x00,0x05,0xd6,0x03
+
+# GFX90A: v_mfma_f32_4x4x4bf16_1k a[0:3], v[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe5,0xd3,0x00,0x05,0xd6,0xe3]
+0x00,0x93,0xe5,0xd3,0x00,0x05,0xd6,0xe3
+
+# GFX90A: v_mfma_f32_4x4x4bf16_1k v[0:3], v[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe5,0xd3,0x00,0x05,0xd6,0xe3]
+0x00,0x13,0xe5,0xd3,0x00,0x05,0xd6,0xe3
+
+# GFX90A: v_mfma_f32_4x4x4bf16_1k a[0:3], v[0:1], a[2:3], -2.0 ; encoding: [0x00,0x80,0xe5,0xd3,0x00,0x05,0xd6,0x13]
+0x00,0x80,0xe5,0xd3,0x00,0x05,0xd6,0x13
+
+# GFX90A: v_mfma_f32_4x4x4bf16_1k v[0:3], v[0:1], a[2:3], -2.0 ; encoding: [0x00,0x00,0xe5,0xd3,0x00,0x05,0xd6,0x13]
+0x00,0x00,0xe5,0xd3,0x00,0x05,0xd6,0x13
+
+# GFX90A: v_mfma_f32_4x4x4bf16_1k a[0:3], v[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe5,0xd3,0x00,0x05,0xd6,0xf3]
+0x00,0x93,0xe5,0xd3,0x00,0x05,0xd6,0xf3
+
+# GFX90A: v_mfma_f32_4x4x4bf16_1k v[0:3], v[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe5,0xd3,0x00,0x05,0xd6,0xf3]
+0x00,0x13,0xe5,0xd3,0x00,0x05,0xd6,0xf3
+
+# GFX90A: v_mfma_f32_4x4x4bf16_1k a[0:3], a[0:1], v[2:3], -2.0 ; encoding: [0x00,0x80,0xe5,0xd3,0x00,0x05,0xd6,0x0b]
+0x00,0x80,0xe5,0xd3,0x00,0x05,0xd6,0x0b
+
+# GFX90A: v_mfma_f32_4x4x4bf16_1k v[0:3], a[0:1], v[2:3], -2.0 ; encoding: [0x00,0x00,0xe5,0xd3,0x00,0x05,0xd6,0x0b]
+0x00,0x00,0xe5,0xd3,0x00,0x05,0xd6,0x0b
+
+# GFX90A: v_mfma_f32_4x4x4bf16_1k a[0:3], a[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe5,0xd3,0x00,0x05,0xd6,0xeb]
+0x00,0x93,0xe5,0xd3,0x00,0x05,0xd6,0xeb
+
+# GFX90A: v_mfma_f32_4x4x4bf16_1k v[0:3], a[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe5,0xd3,0x00,0x05,0xd6,0xeb]
+0x00,0x13,0xe5,0xd3,0x00,0x05,0xd6,0xeb
+
+# GFX90A: v_mfma_f32_4x4x4bf16_1k a[0:3], a[0:1], a[2:3], -2.0 ; encoding: [0x00,0x80,0xe5,0xd3,0x00,0x05,0xd6,0x1b]
+0x00,0x80,0xe5,0xd3,0x00,0x05,0xd6,0x1b
+
+# GFX90A: v_mfma_f32_4x4x4bf16_1k v[0:3], a[0:1], a[2:3], -2.0 ; encoding: [0x00,0x00,0xe5,0xd3,0x00,0x05,0xd6,0x1b]
+0x00,0x00,0xe5,0xd3,0x00,0x05,0xd6,0x1b
+
+# GFX90A: v_mfma_f32_4x4x4bf16_1k a[0:3], a[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe5,0xd3,0x00,0x05,0xd6,0xfb]
+0x00,0x93,0xe5,0xd3,0x00,0x05,0xd6,0xfb
+
+# GFX90A: v_mfma_f32_4x4x4bf16_1k v[0:3], a[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe5,0xd3,0x00,0x05,0xd6,0xfb]
+0x00,0x13,0xe5,0xd3,0x00,0x05,0xd6,0xfb
+
+# GFX90A: v_mfma_f32_32x32x8bf16_1k a[0:15], v[0:1], v[2:3], a[2:17] ; encoding: [0x00,0x80,0xe6,0xd3,0x00,0x05,0x0a,0x04]
+0x00,0x80,0xe6,0xd3,0x00,0x05,0x0a,0x04
+
+# GFX90A: v_mfma_f32_32x32x8bf16_1k a[0:15], v[0:1], v[2:3], a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe6,0xd3,0x00,0x05,0x0a,0xe4]
+0x00,0x93,0xe6,0xd3,0x00,0x05,0x0a,0xe4
+
+# GFX90A: v_mfma_f32_32x32x8bf16_1k a[0:15], v[0:1], a[2:3], a[2:17] ; encoding: [0x00,0x80,0xe6,0xd3,0x00,0x05,0x0a,0x14]
+0x00,0x80,0xe6,0xd3,0x00,0x05,0x0a,0x14
+
+# GFX90A: v_mfma_f32_32x32x8bf16_1k a[0:15], v[0:1], a[2:3], a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe6,0xd3,0x00,0x05,0x0a,0xf4]
+0x00,0x93,0xe6,0xd3,0x00,0x05,0x0a,0xf4
+
+# GFX90A: v_mfma_f32_32x32x8bf16_1k a[0:15], a[0:1], v[2:3], a[2:17] ; encoding: [0x00,0x80,0xe6,0xd3,0x00,0x05,0x0a,0x0c]
+0x00,0x80,0xe6,0xd3,0x00,0x05,0x0a,0x0c
+
+# GFX90A: v_mfma_f32_32x32x8bf16_1k a[0:15], a[0:1], v[2:3], a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe6,0xd3,0x00,0x05,0x0a,0xec]
+0x00,0x93,0xe6,0xd3,0x00,0x05,0x0a,0xec
+
+# GFX90A: v_mfma_f32_32x32x8bf16_1k a[0:15], a[0:1], a[2:3], a[2:17] ; encoding: [0x00,0x80,0xe6,0xd3,0x00,0x05,0x0a,0x1c]
+0x00,0x80,0xe6,0xd3,0x00,0x05,0x0a,0x1c
+
+# GFX90A: v_mfma_f32_32x32x8bf16_1k a[0:15], a[0:1], a[2:3], a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe6,0xd3,0x00,0x05,0x0a,0xfc]
+0x00,0x93,0xe6,0xd3,0x00,0x05,0x0a,0xfc
+
+# GFX90A: v_mfma_f32_32x32x8bf16_1k v[0:15], v[0:1], v[2:3], v[2:17] ; encoding: [0x00,0x00,0xe6,0xd3,0x00,0x05,0x0a,0x04]
+0x00,0x00,0xe6,0xd3,0x00,0x05,0x0a,0x04
+
+# GFX90A: v_mfma_f32_32x32x8bf16_1k v[0:15], v[0:1], v[2:3], v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe6,0xd3,0x00,0x05,0x0a,0xe4]
+0x00,0x13,0xe6,0xd3,0x00,0x05,0x0a,0xe4
+
+# GFX90A: v_mfma_f32_32x32x8bf16_1k v[0:15], v[0:1], a[2:3], v[2:17] ; encoding: [0x00,0x00,0xe6,0xd3,0x00,0x05,0x0a,0x14]
+0x00,0x00,0xe6,0xd3,0x00,0x05,0x0a,0x14
+
+# GFX90A: v_mfma_f32_32x32x8bf16_1k v[0:15], v[0:1], a[2:3], v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe6,0xd3,0x00,0x05,0x0a,0xf4]
+0x00,0x13,0xe6,0xd3,0x00,0x05,0x0a,0xf4
+
+# GFX90A: v_mfma_f32_32x32x8bf16_1k v[0:15], a[0:1], v[2:3], v[2:17] ; encoding: [0x00,0x00,0xe6,0xd3,0x00,0x05,0x0a,0x0c]
+0x00,0x00,0xe6,0xd3,0x00,0x05,0x0a,0x0c
+
+# GFX90A: v_mfma_f32_32x32x8bf16_1k v[0:15], a[0:1], v[2:3], v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe6,0xd3,0x00,0x05,0x0a,0xec]
+0x00,0x13,0xe6,0xd3,0x00,0x05,0x0a,0xec
+
+# GFX90A: v_mfma_f32_32x32x8bf16_1k v[0:15], a[0:1], a[2:3], v[2:17] ; encoding: [0x00,0x00,0xe6,0xd3,0x00,0x05,0x0a,0x1c]
+0x00,0x00,0xe6,0xd3,0x00,0x05,0x0a,0x1c
+
+# GFX90A: v_mfma_f32_32x32x8bf16_1k v[0:15], a[0:1], a[2:3], v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe6,0xd3,0x00,0x05,0x0a,0xfc]
+0x00,0x13,0xe6,0xd3,0x00,0x05,0x0a,0xfc
+
+# GFX90A: v_mfma_f32_32x32x8bf16_1k a[0:15], v[0:1], v[2:3], -2.0 ; encoding: [0x00,0x80,0xe6,0xd3,0x00,0x05,0xd6,0x03]
+0x00,0x80,0xe6,0xd3,0x00,0x05,0xd6,0x03
+
+# GFX90A: v_mfma_f32_32x32x8bf16_1k v[0:15], v[0:1], v[2:3], -2.0 ; encoding: [0x00,0x00,0xe6,0xd3,0x00,0x05,0xd6,0x03]
+0x00,0x00,0xe6,0xd3,0x00,0x05,0xd6,0x03
+
+# GFX90A: v_mfma_f32_32x32x8bf16_1k a[0:15], v[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe6,0xd3,0x00,0x05,0xd6,0xe3]
+0x00,0x93,0xe6,0xd3,0x00,0x05,0xd6,0xe3
+
+# GFX90A: v_mfma_f32_32x32x8bf16_1k v[0:15], v[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe6,0xd3,0x00,0x05,0xd6,0xe3]
+0x00,0x13,0xe6,0xd3,0x00,0x05,0xd6,0xe3
+
+# GFX90A: v_mfma_f32_32x32x8bf16_1k a[0:15], v[0:1], a[2:3], -2.0 ; encoding: [0x00,0x80,0xe6,0xd3,0x00,0x05,0xd6,0x13]
+0x00,0x80,0xe6,0xd3,0x00,0x05,0xd6,0x13
+
+# GFX90A: v_mfma_f32_32x32x8bf16_1k v[0:15], v[0:1], a[2:3], -2.0 ; encoding: [0x00,0x00,0xe6,0xd3,0x00,0x05,0xd6,0x13]
+0x00,0x00,0xe6,0xd3,0x00,0x05,0xd6,0x13
+
+# GFX90A: v_mfma_f32_32x32x8bf16_1k a[0:15], v[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe6,0xd3,0x00,0x05,0xd6,0xf3]
+0x00,0x93,0xe6,0xd3,0x00,0x05,0xd6,0xf3
+
+# GFX90A: v_mfma_f32_32x32x8bf16_1k v[0:15], v[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe6,0xd3,0x00,0x05,0xd6,0xf3]
+0x00,0x13,0xe6,0xd3,0x00,0x05,0xd6,0xf3
+
+# GFX90A: v_mfma_f32_32x32x8bf16_1k a[0:15], a[0:1], v[2:3], -2.0 ; encoding: [0x00,0x80,0xe6,0xd3,0x00,0x05,0xd6,0x0b]
+0x00,0x80,0xe6,0xd3,0x00,0x05,0xd6,0x0b
+
+# GFX90A: v_mfma_f32_32x32x8bf16_1k v[0:15], a[0:1], v[2:3], -2.0 ; encoding: [0x00,0x00,0xe6,0xd3,0x00,0x05,0xd6,0x0b]
+0x00,0x00,0xe6,0xd3,0x00,0x05,0xd6,0x0b
+
+# GFX90A: v_mfma_f32_32x32x8bf16_1k a[0:15], a[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe6,0xd3,0x00,0x05,0xd6,0xeb]
+0x00,0x93,0xe6,0xd3,0x00,0x05,0xd6,0xeb
+
+# GFX90A: v_mfma_f32_32x32x8bf16_1k v[0:15], a[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe6,0xd3,0x00,0x05,0xd6,0xeb]
+0x00,0x13,0xe6,0xd3,0x00,0x05,0xd6,0xeb
+
+# GFX90A: v_mfma_f32_32x32x8bf16_1k a[0:15], a[0:1], a[2:3], -2.0 ; encoding: [0x00,0x80,0xe6,0xd3,0x00,0x05,0xd6,0x1b]
+0x00,0x80,0xe6,0xd3,0x00,0x05,0xd6,0x1b
+
+# GFX90A: v_mfma_f32_32x32x8bf16_1k v[0:15], a[0:1], a[2:3], -2.0 ; encoding: [0x00,0x00,0xe6,0xd3,0x00,0x05,0xd6,0x1b]
+0x00,0x00,0xe6,0xd3,0x00,0x05,0xd6,0x1b
+
+# GFX90A: v_mfma_f32_32x32x8bf16_1k a[0:15], a[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe6,0xd3,0x00,0x05,0xd6,0xfb]
+0x00,0x93,0xe6,0xd3,0x00,0x05,0xd6,0xfb
+
+# GFX90A: v_mfma_f32_32x32x8bf16_1k v[0:15], a[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe6,0xd3,0x00,0x05,0xd6,0xfb]
+0x00,0x13,0xe6,0xd3,0x00,0x05,0xd6,0xfb
+
+# GFX90A: v_mfma_f32_16x16x16bf16_1k a[0:3], v[0:1], v[2:3], a[2:5] ; encoding: [0x00,0x80,0xe7,0xd3,0x00,0x05,0x0a,0x04]
+0x00,0x80,0xe7,0xd3,0x00,0x05,0x0a,0x04
+
+# GFX90A: v_mfma_f32_16x16x16bf16_1k a[0:3], v[0:1], v[2:3], a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe7,0xd3,0x00,0x05,0x0a,0xe4]
+0x00,0x93,0xe7,0xd3,0x00,0x05,0x0a,0xe4
+
+# GFX90A: v_mfma_f32_16x16x16bf16_1k a[0:3], v[0:1], a[2:3], a[2:5] ; encoding: [0x00,0x80,0xe7,0xd3,0x00,0x05,0x0a,0x14]
+0x00,0x80,0xe7,0xd3,0x00,0x05,0x0a,0x14
+
+# GFX90A: v_mfma_f32_16x16x16bf16_1k a[0:3], v[0:1], a[2:3], a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe7,0xd3,0x00,0x05,0x0a,0xf4]
+0x00,0x93,0xe7,0xd3,0x00,0x05,0x0a,0xf4
+
+# GFX90A: v_mfma_f32_16x16x16bf16_1k a[0:3], a[0:1], v[2:3], a[2:5] ; encoding: [0x00,0x80,0xe7,0xd3,0x00,0x05,0x0a,0x0c]
+0x00,0x80,0xe7,0xd3,0x00,0x05,0x0a,0x0c
+
+# GFX90A: v_mfma_f32_16x16x16bf16_1k a[0:3], a[0:1], v[2:3], a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe7,0xd3,0x00,0x05,0x0a,0xec]
+0x00,0x93,0xe7,0xd3,0x00,0x05,0x0a,0xec
+
+# GFX90A: v_mfma_f32_16x16x16bf16_1k a[0:3], a[0:1], a[2:3], a[2:5] ; encoding: [0x00,0x80,0xe7,0xd3,0x00,0x05,0x0a,0x1c]
+0x00,0x80,0xe7,0xd3,0x00,0x05,0x0a,0x1c
+
+# GFX90A: v_mfma_f32_16x16x16bf16_1k a[0:3], a[0:1], a[2:3], a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe7,0xd3,0x00,0x05,0x0a,0xfc]
+0x00,0x93,0xe7,0xd3,0x00,0x05,0x0a,0xfc
+
+# GFX90A: v_mfma_f32_16x16x16bf16_1k v[0:3], v[0:1], v[2:3], v[2:5] ; encoding: [0x00,0x00,0xe7,0xd3,0x00,0x05,0x0a,0x04]
+0x00,0x00,0xe7,0xd3,0x00,0x05,0x0a,0x04
+
+# GFX90A: v_mfma_f32_16x16x16bf16_1k v[0:3], v[0:1], v[2:3], v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe7,0xd3,0x00,0x05,0x0a,0xe4]
+0x00,0x13,0xe7,0xd3,0x00,0x05,0x0a,0xe4
+
+# GFX90A: v_mfma_f32_16x16x16bf16_1k v[0:3], v[0:1], a[2:3], v[2:5] ; encoding: [0x00,0x00,0xe7,0xd3,0x00,0x05,0x0a,0x14]
+0x00,0x00,0xe7,0xd3,0x00,0x05,0x0a,0x14
+
+# GFX90A: v_mfma_f32_16x16x16bf16_1k v[0:3], v[0:1], a[2:3], v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe7,0xd3,0x00,0x05,0x0a,0xf4]
+0x00,0x13,0xe7,0xd3,0x00,0x05,0x0a,0xf4
+
+# GFX90A: v_mfma_f32_16x16x16bf16_1k v[0:3], a[0:1], v[2:3], v[2:5] ; encoding: [0x00,0x00,0xe7,0xd3,0x00,0x05,0x0a,0x0c]
+0x00,0x00,0xe7,0xd3,0x00,0x05,0x0a,0x0c
+
+# GFX90A: v_mfma_f32_16x16x16bf16_1k v[0:3], a[0:1], v[2:3], v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe7,0xd3,0x00,0x05,0x0a,0xec]
+0x00,0x13,0xe7,0xd3,0x00,0x05,0x0a,0xec
+
+# GFX90A: v_mfma_f32_16x16x16bf16_1k v[0:3], a[0:1], a[2:3], v[2:5] ; encoding: [0x00,0x00,0xe7,0xd3,0x00,0x05,0x0a,0x1c]
+0x00,0x00,0xe7,0xd3,0x00,0x05,0x0a,0x1c
+
+# GFX90A: v_mfma_f32_16x16x16bf16_1k v[0:3], a[0:1], a[2:3], v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe7,0xd3,0x00,0x05,0x0a,0xfc]
+0x00,0x13,0xe7,0xd3,0x00,0x05,0x0a,0xfc
+
+# GFX90A: v_mfma_f32_16x16x16bf16_1k a[0:3], v[0:1], v[2:3], -2.0 ; encoding: [0x00,0x80,0xe7,0xd3,0x00,0x05,0xd6,0x03]
+0x00,0x80,0xe7,0xd3,0x00,0x05,0xd6,0x03
+
+# GFX90A: v_mfma_f32_16x16x16bf16_1k v[0:3], v[0:1], v[2:3], -2.0 ; encoding: [0x00,0x00,0xe7,0xd3,0x00,0x05,0xd6,0x03]
+0x00,0x00,0xe7,0xd3,0x00,0x05,0xd6,0x03
+
+# GFX90A: v_mfma_f32_16x16x16bf16_1k a[0:3], v[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe7,0xd3,0x00,0x05,0xd6,0xe3]
+0x00,0x93,0xe7,0xd3,0x00,0x05,0xd6,0xe3
+
+# GFX90A: v_mfma_f32_16x16x16bf16_1k v[0:3], v[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe7,0xd3,0x00,0x05,0xd6,0xe3]
+0x00,0x13,0xe7,0xd3,0x00,0x05,0xd6,0xe3
+
+# GFX90A: v_mfma_f32_16x16x16bf16_1k a[0:3], v[0:1], a[2:3], -2.0 ; encoding: [0x00,0x80,0xe7,0xd3,0x00,0x05,0xd6,0x13]
+0x00,0x80,0xe7,0xd3,0x00,0x05,0xd6,0x13
+
+# GFX90A: v_mfma_f32_16x16x16bf16_1k v[0:3], v[0:1], a[2:3], -2.0 ; encoding: [0x00,0x00,0xe7,0xd3,0x00,0x05,0xd6,0x13]
+0x00,0x00,0xe7,0xd3,0x00,0x05,0xd6,0x13
+
+# GFX90A: v_mfma_f32_16x16x16bf16_1k a[0:3], v[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe7,0xd3,0x00,0x05,0xd6,0xf3]
+0x00,0x93,0xe7,0xd3,0x00,0x05,0xd6,0xf3
+
+# GFX90A: v_mfma_f32_16x16x16bf16_1k v[0:3], v[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe7,0xd3,0x00,0x05,0xd6,0xf3]
+0x00,0x13,0xe7,0xd3,0x00,0x05,0xd6,0xf3
+
+# GFX90A: v_mfma_f32_16x16x16bf16_1k a[0:3], a[0:1], v[2:3], -2.0 ; encoding: [0x00,0x80,0xe7,0xd3,0x00,0x05,0xd6,0x0b]
+0x00,0x80,0xe7,0xd3,0x00,0x05,0xd6,0x0b
+
+# GFX90A: v_mfma_f32_16x16x16bf16_1k v[0:3], a[0:1], v[2:3], -2.0 ; encoding: [0x00,0x00,0xe7,0xd3,0x00,0x05,0xd6,0x0b]
+0x00,0x00,0xe7,0xd3,0x00,0x05,0xd6,0x0b
+
+# GFX90A: v_mfma_f32_16x16x16bf16_1k a[0:3], a[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe7,0xd3,0x00,0x05,0xd6,0xeb]
+0x00,0x93,0xe7,0xd3,0x00,0x05,0xd6,0xeb
+
+# GFX90A: v_mfma_f32_16x16x16bf16_1k v[0:3], a[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe7,0xd3,0x00,0x05,0xd6,0xeb]
+0x00,0x13,0xe7,0xd3,0x00,0x05,0xd6,0xeb
+
+# GFX90A: v_mfma_f32_16x16x16bf16_1k a[0:3], a[0:1], a[2:3], -2.0 ; encoding: [0x00,0x80,0xe7,0xd3,0x00,0x05,0xd6,0x1b]
+0x00,0x80,0xe7,0xd3,0x00,0x05,0xd6,0x1b
+
+# GFX90A: v_mfma_f32_16x16x16bf16_1k v[0:3], a[0:1], a[2:3], -2.0 ; encoding: [0x00,0x00,0xe7,0xd3,0x00,0x05,0xd6,0x1b]
+0x00,0x00,0xe7,0xd3,0x00,0x05,0xd6,0x1b
+
+# GFX90A: v_mfma_f32_16x16x16bf16_1k a[0:3], a[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe7,0xd3,0x00,0x05,0xd6,0xfb]
+0x00,0x93,0xe7,0xd3,0x00,0x05,0xd6,0xfb
+
+# GFX90A: v_mfma_f32_16x16x16bf16_1k v[0:3], a[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe7,0xd3,0x00,0x05,0xd6,0xfb]
+0x00,0x13,0xe7,0xd3,0x00,0x05,0xd6,0xfb
+
+# GFX90A: v_mfma_f64_16x16x4f64 v[0:7], v[0:1], v[2:3], v[2:9] ; encoding: [0x00,0x00,0xee,0xd3,0x00,0x05,0x0a,0x04]
+0x00,0x00,0xee,0xd3,0x00,0x05,0x0a,0x04
+
+# GFX90A: v_mfma_f64_16x16x4f64 v[0:7], v[0:1], v[2:3], v[2:9] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xee,0xd3,0x00,0x05,0x0a,0xe4]
+0x00,0x13,0xee,0xd3,0x00,0x05,0x0a,0xe4
+
+# GFX90A: v_mfma_f64_16x16x4f64 v[0:7], v[0:1], v[2:3], -2.0 ; encoding: [0x00,0x00,0xee,0xd3,0x00,0x05,0xd6,0x03]
+0x00,0x00,0xee,0xd3,0x00,0x05,0xd6,0x03
+
+# GFX90A: v_mfma_f64_16x16x4f64 v[0:7], v[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xee,0xd3,0x00,0x05,0xd6,0xe3]
+0x00,0x13,0xee,0xd3,0x00,0x05,0xd6,0xe3
+
+# GFX90A: v_mfma_f64_4x4x4f64 v[0:1], v[0:1], v[2:3], v[2:3] ; encoding: [0x00,0x00,0xef,0xd3,0x00,0x05,0x0a,0x04]
+0x00,0x00,0xef,0xd3,0x00,0x05,0x0a,0x04
+
+# GFX90A: v_mfma_f64_4x4x4f64 v[0:1], v[0:1], v[2:3], v[2:3] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xef,0xd3,0x00,0x05,0x0a,0xe4]
+0x00,0x13,0xef,0xd3,0x00,0x05,0x0a,0xe4
+
+# GFX90A: v_mfma_f64_4x4x4f64 v[0:1], v[0:1], v[2:3], -2.0 ; encoding: [0x00,0x00,0xef,0xd3,0x00,0x05,0xd6,0x03]
+0x00,0x00,0xef,0xd3,0x00,0x05,0xd6,0x03
+
+# GFX90A: v_mfma_f64_4x4x4f64 v[0:1], v[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xef,0xd3,0x00,0x05,0xd6,0xe3]
+0x00,0x13,0xef,0xd3,0x00,0x05,0xd6,0xe3
+
+# GFX90A: v_mfma_f64_16x16x4f64 v[0:7], a[0:1], v[2:3], v[2:9] ; encoding: [0x00,0x00,0xee,0xd3,0x00,0x05,0x0a,0x0c]
+0x00,0x00,0xee,0xd3,0x00,0x05,0x0a,0x0c
+
+# GFX90A: v_mfma_f64_16x16x4f64 v[0:7], v[0:1], a[2:3], v[2:9] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xee,0xd3,0x00,0x05,0x0a,0xf4]
+0x00,0x13,0xee,0xd3,0x00,0x05,0x0a,0xf4
+
+# GFX90A: v_mfma_f64_16x16x4f64 v[0:7], a[0:1], a[2:3], -2.0 ; encoding: [0x00,0x00,0xee,0xd3,0x00,0x05,0xd6,0x1b]
+0x00,0x00,0xee,0xd3,0x00,0x05,0xd6,0x1b
+
+# GFX90A: v_mfma_f64_4x4x4f64 v[0:1], a[0:1], v[2:3], v[2:3] ; encoding: [0x00,0x00,0xef,0xd3,0x00,0x05,0x0a,0x0c]
+0x00,0x00,0xef,0xd3,0x00,0x05,0x0a,0x0c
+
+# GFX90A: v_mfma_f64_4x4x4f64 v[0:1], v[0:1], a[2:3], v[2:3] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xef,0xd3,0x00,0x05,0x0a,0xf4]
+0x00,0x13,0xef,0xd3,0x00,0x05,0x0a,0xf4
+
+# GFX90A: v_mfma_f64_4x4x4f64 v[0:1], a[0:1], a[2:3], -2.0 ; encoding: [0x00,0x00,0xef,0xd3,0x00,0x05,0xd6,0x1b]
+0x00,0x00,0xef,0xd3,0x00,0x05,0xd6,0x1b
+
+# GFX90A: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[2:9] ; encoding: [0x00,0x80,0xee,0xd3,0x00,0x05,0x0a,0x04]
+0x00,0x80,0xee,0xd3,0x00,0x05,0x0a,0x04
+
+# GFX90A: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[2:9] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xee,0xd3,0x00,0x05,0x0a,0xe4]
+0x00,0x93,0xee,0xd3,0x00,0x05,0x0a,0xe4
+
+# GFX90A: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], -2.0 ; encoding: [0x00,0x80,0xee,0xd3,0x00,0x05,0xd6,0x03]
+0x00,0x80,0xee,0xd3,0x00,0x05,0xd6,0x03
+
+# GFX90A: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xee,0xd3,0x00,0x05,0xd6,0xe3]
+0x00,0x93,0xee,0xd3,0x00,0x05,0xd6,0xe3
+
+# GFX90A: v_mfma_f64_4x4x4f64 a[0:1], v[0:1], v[2:3], a[2:3] ; encoding: [0x00,0x80,0xef,0xd3,0x00,0x05,0x0a,0x04]
+0x00,0x80,0xef,0xd3,0x00,0x05,0x0a,0x04
+
+# GFX90A: v_mfma_f64_4x4x4f64 a[0:1], v[0:1], v[2:3], a[2:3] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xef,0xd3,0x00,0x05,0x0a,0xe4]
+0x00,0x93,0xef,0xd3,0x00,0x05,0x0a,0xe4
+
+# GFX90A: v_mfma_f64_4x4x4f64 a[0:1], v[0:1], v[2:3], -2.0 ; encoding: [0x00,0x80,0xef,0xd3,0x00,0x05,0xd6,0x03]
+0x00,0x80,0xef,0xd3,0x00,0x05,0xd6,0x03
+
+# GFX90A: v_mfma_f64_4x4x4f64 a[0:1], v[0:1], v[2:3], 0 ; encoding: [0x00,0x80,0xef,0xd3,0x00,0x05,0x02,0x02]
+0x00,0x80,0xef,0xd3,0x00,0x05,0x02,0x02
+
+# GFX90A: v_mfma_f64_4x4x4f64 a[0:1], v[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xef,0xd3,0x00,0x05,0xd6,0xe3]
+0x00,0x93,0xef,0xd3,0x00,0x05,0xd6,0xe3
+
+# GFX90A: v_mfma_f64_16x16x4f64 a[0:7], a[0:1], v[2:3], a[2:9] ; encoding: [0x00,0x80,0xee,0xd3,0x00,0x05,0x0a,0x0c]
+0x00,0x80,0xee,0xd3,0x00,0x05,0x0a,0x0c
+
+# GFX90A: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], a[2:3], a[2:9] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xee,0xd3,0x00,0x05,0x0a,0xf4]
+0x00,0x93,0xee,0xd3,0x00,0x05,0x0a,0xf4
+
+# GFX90A: v_mfma_f64_16x16x4f64 a[0:7], a[0:1], a[2:3], -2.0 ; encoding: [0x00,0x80,0xee,0xd3,0x00,0x05,0xd6,0x1b]
+0x00,0x80,0xee,0xd3,0x00,0x05,0xd6,0x1b
+
+# GFX90A: v_mfma_f64_4x4x4f64 a[0:1], a[0:1], v[2:3], a[2:3] ; encoding: [0x00,0x80,0xef,0xd3,0x00,0x05,0x0a,0x0c]
+0x00,0x80,0xef,0xd3,0x00,0x05,0x0a,0x0c
+
+# GFX90A: v_mfma_f64_4x4x4f64 a[0:1], v[0:1], a[2:3], a[2:3] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xef,0xd3,0x00,0x05,0x0a,0xf4]
+0x00,0x93,0xef,0xd3,0x00,0x05,0x0a,0xf4
+
+# GFX90A: v_mfma_f64_4x4x4f64 a[0:1], a[0:1], a[2:3], -2.0 ; encoding: [0x00,0x80,0xef,0xd3,0x00,0x05,0xd6,0x1b]
+0x00,0x80,0xef,0xd3,0x00,0x05,0xd6,0x1b
+
+# GFX90A: v_mfma_f64_16x16x4f64 a[0:7], a[0:1], a[2:3], 0 ; encoding: [0x00,0x80,0xee,0xd3,0x00,0x05,0x02,0x1a]
+0x00,0x80,0xee,0xd3,0x00,0x05,0x02,0x1a

diff  --git a/llvm/test/MC/Disassembler/AMDGPU/mimg_gfx90a.txt b/llvm/test/MC/Disassembler/AMDGPU/mimg_gfx90a.txt
new file mode 100644
index 000000000000..ced068d669ca
--- /dev/null
+++ b/llvm/test/MC/Disassembler/AMDGPU/mimg_gfx90a.txt
@@ -0,0 +1,76 @@
+# RUN: llvm-mc -arch=amdgcn -mcpu=gfx90a -disassemble -show-encoding < %s | FileCheck %s -check-prefix=GFX90A
+
+# GFX90A: image_load v4, v238, s[28:35] dmask:0x7 unorm ; encoding: [0x00,0x17,0x00,0xf0,0xee,0x04,0x07,0x00]
+0x00,0x17,0x00,0xf0,0xee,0x04,0x07,0x00
+
+# GFX90A: image_load_pck v5, v0, s[8:15] dmask:0x1 glc ; encoding: [0x00,0x21,0x08,0xf0,0x00,0x05,0x02,0x00]
+0x00,0x21,0x08,0xf0,0x00,0x05,0x02,0x00
+
+# GFX90A: image_load_pck_sgn v5, v0, s[8:15] dmask:0x1 lwe ; encoding: [0x00,0x01,0x0e,0xf0,0x00,0x05,0x02,0x00]
+0x00,0x01,0x0e,0xf0,0x00,0x05,0x02,0x00
+
+# GFX90A: image_load_mip v5, v0, s[8:15] ; encoding: [0x00,0x00,0x04,0xf0,0x00,0x05,0x02,0x00]
+0x00,0x00,0x04,0xf0,0x00,0x05,0x02,0x00
+
+# GFX90A: image_load_mip_pck v5, v1, s[8:15] dmask:0x1 ; encoding: [0x00,0x01,0x10,0xf0,0x01,0x05,0x02,0x00]
+0x00,0x01,0x10,0xf0,0x01,0x05,0x02,0x00
+
+# GFX90A: image_load_mip_pck_sgn v4, v0, s[8:15] dmask:0x5 ; encoding: [0x00,0x05,0x14,0xf0,0x00,0x04,0x02,0x00]
+0x00,0x05,0x14,0xf0,0x00,0x04,0x02,0x00
+
+# GFX90A: image_store v192, v238, s[28:35] dmask:0x7 unorm ; encoding: [0x00,0x17,0x20,0xf0,0xee,0xc0,0x07,0x00]
+0x00,0x17,0x20,0xf0,0xee,0xc0,0x07,0x00
+
+# GFX90A: image_store_pck v1, v2, s[12:19] dmask:0x1 unorm da ; encoding: [0x00,0x51,0x28,0xf0,0x02,0x01,0x03,0x00]
+0x00,0x51,0x28,0xf0,0x02,0x01,0x03,0x00
+
+# GFX90A: image_store_mip v1, v2, s[12:19] ; encoding: [0x00,0x00,0x24,0xf0,0x02,0x01,0x03,0x00]
+0x00,0x00,0x24,0xf0,0x02,0x01,0x03,0x00
+
+# GFX90A: image_store_mip_pck v252, v2, s[12:19] dmask:0x1 a16 ; encoding: [0x00,0x81,0x2c,0xf0,0x02,0xfc,0x03,0x00]
+0x00,0x81,0x2c,0xf0,0x02,0xfc,0x03,0x00
+
+# GFX90A: image_atomic_add v4, v192, s[28:35] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x48,0xf0,0xc0,0x04,0x07,0x00]
+0x00,0x31,0x48,0xf0,0xc0,0x04,0x07,0x00
+
+# GFX90A: image_atomic_and v4, v192, s[28:35] dmask:0x1 unorm ; encoding: [0x00,0x11,0x60,0xf0,0xc0,0x04,0x07,0x00]
+0x00,0x11,0x60,0xf0,0xc0,0x04,0x07,0x00
+
+# GFX90A: image_atomic_swap v4, v192, s[28:35] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x40,0xf0,0xc0,0x04,0x07,0x00]
+0x00,0x31,0x40,0xf0,0xc0,0x04,0x07,0x00
+
+# GFX90A: image_atomic_cmpswap v[4:5], v192, s[28:35] dmask:0x3 unorm glc ; encoding: [0x00,0x33,0x44,0xf0,0xc0,0x04,0x07,0x00]
+0x00,0x33,0x44,0xf0,0xc0,0x04,0x07,0x00
+
+# GFX90A: image_atomic_or v4, v192, s[28:35] dmask:0x1 unorm ; encoding: [0x00,0x11,0x64,0xf0,0xc0,0x04,0x07,0x00]
+0x00,0x11,0x64,0xf0,0xc0,0x04,0x07,0x00
+
+# GFX90A: image_atomic_xor v4, v192, s[28:35] dmask:0x1 unorm ; encoding: [0x00,0x11,0x68,0xf0,0xc0,0x04,0x07,0x00]
+0x00,0x11,0x68,0xf0,0xc0,0x04,0x07,0x00
+
+# GFX90A: image_atomic_sub v4, v192, s[28:35] dmask:0x1 unorm ; encoding: [0x00,0x11,0x4c,0xf0,0xc0,0x04,0x07,0x00]
+0x00,0x11,0x4c,0xf0,0xc0,0x04,0x07,0x00
+
+# GFX90A: image_atomic_smin v4, v192, s[28:35] dmask:0x1 unorm ; encoding: [0x00,0x11,0x50,0xf0,0xc0,0x04,0x07,0x00]
+0x00,0x11,0x50,0xf0,0xc0,0x04,0x07,0x00
+
+# GFX90A: image_atomic_smax v4, v192, s[28:35] dmask:0x1 unorm ; encoding: [0x00,0x11,0x58,0xf0,0xc0,0x04,0x07,0x00]
+0x00,0x11,0x58,0xf0,0xc0,0x04,0x07,0x00
+
+# GFX90A: image_atomic_umin v4, v192, s[28:35] dmask:0x1 unorm ; encoding: [0x00,0x11,0x54,0xf0,0xc0,0x04,0x07,0x00]
+0x00,0x11,0x54,0xf0,0xc0,0x04,0x07,0x00
+
+# GFX90A: image_atomic_umax v4, v192, s[28:35] dmask:0x1 unorm ; encoding: [0x00,0x11,0x5c,0xf0,0xc0,0x04,0x07,0x00]
+0x00,0x11,0x5c,0xf0,0xc0,0x04,0x07,0x00
+
+# GFX90A: image_atomic_inc v4, v192, s[28:35] dmask:0x1 unorm ; encoding: [0x00,0x11,0x6c,0xf0,0xc0,0x04,0x07,0x00]
+0x00,0x11,0x6c,0xf0,0xc0,0x04,0x07,0x00
+
+# GFX90A: image_atomic_dec v4, v192, s[28:35] dmask:0x1 unorm ; encoding: [0x00,0x11,0x70,0xf0,0xc0,0x04,0x07,0x00]
+0x00,0x11,0x70,0xf0,0xc0,0x04,0x07,0x00
+
+# GFX90A: image_get_resinfo v5, v1, s[8:15] dmask:0x1 ; encoding: [0x00,0x01,0x38,0xf0,0x01,0x05,0x02,0x00]
+0x00,0x01,0x38,0xf0,0x01,0x05,0x02,0x00
+
+0x00,0x01,0x80,0xf0,0x00,0x05,0x62,0x00
+# GFX90A: image_sample v5, v0, s[8:15], s[12:15] dmask:0x1 ; encoding: [0x00,0x01,0x80,0xf0,0x00,0x05,0x62,0x00]

diff  --git a/llvm/test/Object/AMDGPU/elf-header-flags-mach.yaml b/llvm/test/Object/AMDGPU/elf-header-flags-mach.yaml
index 6b0e4612efc2..07488b12c872 100644
--- a/llvm/test/Object/AMDGPU/elf-header-flags-mach.yaml
+++ b/llvm/test/Object/AMDGPU/elf-header-flags-mach.yaml
@@ -142,6 +142,10 @@
 # RUN: llvm-readobj -S --file-headers %t.o.AMDGCN_GFX909 | FileCheck --check-prefixes=ELF-AMDGCN-ALL,ELF-AMDGCN-GFX909 %s
 # RUN: obj2yaml %t.o.AMDGCN_GFX909 | FileCheck --check-prefixes=YAML-AMDGCN-ALL,YAML-AMDGCN-GFX909 %s
 
+# RUN: sed -e 's/<BITS>/64/' -e 's/<MACH>/AMDGCN_GFX90A/' %s | yaml2obj -o %t.o.AMDGCN_GFX90A
+# RUN: llvm-readobj -S --file-headers %t.o.AMDGCN_GFX90A | FileCheck --check-prefixes=ELF-AMDGCN-ALL,ELF-AMDGCN-GFX90A %s
+# RUN: obj2yaml %t.o.AMDGCN_GFX90A | FileCheck --check-prefixes=YAML-AMDGCN-ALL,YAML-AMDGCN-GFX90A %s
+
 # RUN: sed -e 's/<BITS>/64/' -e 's/<MACH>/AMDGCN_GFX90C/' %s | yaml2obj -o %t.o.AMDGCN_GFX90C
 # RUN: llvm-readobj -S --file-headers %t.o.AMDGCN_GFX90C | FileCheck --check-prefixes=ELF-AMDGCN-ALL,ELF-AMDGCN-GFX90C %s
 # RUN: obj2yaml %t.o.AMDGCN_GFX90C | FileCheck --check-prefixes=YAML-AMDGCN-ALL,YAML-AMDGCN-GFX90C %s
@@ -300,6 +304,9 @@
 # ELF-AMDGCN-GFX909:    EF_AMDGPU_MACH_AMDGCN_GFX909 (0x31)
 # YAML-AMDGCN-GFX909:   Flags: [ EF_AMDGPU_MACH_AMDGCN_GFX909 ]
 
+# ELF-AMDGCN-GFX90A:    EF_AMDGPU_MACH_AMDGCN_GFX90A (0x3F)
+# YAML-AMDGCN-GFX90A:   Flags: [ EF_AMDGPU_MACH_AMDGCN_GFX90A ]
+
 # ELF-AMDGCN-GFX90C:    EF_AMDGPU_MACH_AMDGCN_GFX90C (0x32)
 # YAML-AMDGCN-GFX90C:   Flags: [ EF_AMDGPU_MACH_AMDGCN_GFX90C ]
 

diff  --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/slp-v2f32.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/slp-v2f32.ll
new file mode 100644
index 000000000000..7bf6334834e9
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/slp-v2f32.ll
@@ -0,0 +1,66 @@
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -slp-vectorizer < %s | FileCheck -check-prefixes=GCN,GFX908 %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -slp-vectorizer < %s | FileCheck -check-prefixes=GCN,GFX90A %s
+
+; GCN-LABEL: @fadd_combine
+; GFX908: fadd float
+; GFX908: fadd float
+; GFX90A: fadd <2 x float>
+define amdgpu_kernel void @fadd_combine(float addrspace(1)* %arg) {
+bb:
+  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %tmp1 = zext i32 %tmp to i64
+  %tmp2 = getelementptr inbounds float, float addrspace(1)* %arg, i64 %tmp1
+  %tmp3 = load float, float addrspace(1)* %tmp2, align 4
+  %tmp4 = fadd float %tmp3, 1.000000e+00
+  store float %tmp4, float addrspace(1)* %tmp2, align 4
+  %tmp5 = add nuw nsw i64 %tmp1, 1
+  %tmp6 = getelementptr inbounds float, float addrspace(1)* %arg, i64 %tmp5
+  %tmp7 = load float, float addrspace(1)* %tmp6, align 4
+  %tmp8 = fadd float %tmp7, 1.000000e+00
+  store float %tmp8, float addrspace(1)* %tmp6, align 4
+  ret void
+}
+
+; GCN-LABEL: @fmul_combine
+; GFX908: fmul float
+; GFX908: fmul float
+; GFX90A: fmul <2 x float>
+define amdgpu_kernel void @fmul_combine(float addrspace(1)* %arg) {
+bb:
+  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %tmp1 = zext i32 %tmp to i64
+  %tmp2 = getelementptr inbounds float, float addrspace(1)* %arg, i64 %tmp1
+  %tmp3 = load float, float addrspace(1)* %tmp2, align 4
+  %tmp4 = fmul float %tmp3, 1.000000e+00
+  store float %tmp4, float addrspace(1)* %tmp2, align 4
+  %tmp5 = add nuw nsw i64 %tmp1, 1
+  %tmp6 = getelementptr inbounds float, float addrspace(1)* %arg, i64 %tmp5
+  %tmp7 = load float, float addrspace(1)* %tmp6, align 4
+  %tmp8 = fmul float %tmp7, 1.000000e+00
+  store float %tmp8, float addrspace(1)* %tmp6, align 4
+  ret void
+}
+
+; GCN-LABEL: @fma_combine
+; GFX908: call float @llvm.fma.f32
+; GFX908: call float @llvm.fma.f32
+; GFX90A: call <2 x float> @llvm.fma.v2f32
+define amdgpu_kernel void @fma_combine(float addrspace(1)* %arg) {
+bb:
+  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %tmp1 = zext i32 %tmp to i64
+  %tmp2 = getelementptr inbounds float, float addrspace(1)* %arg, i64 %tmp1
+  %tmp3 = load float, float addrspace(1)* %tmp2, align 4
+  %tmp4 = tail call float @llvm.fma.f32(float %tmp3, float 1.000000e+00, float 1.000000e+00)
+  store float %tmp4, float addrspace(1)* %tmp2, align 4
+  %tmp5 = add nuw nsw i64 %tmp1, 1
+  %tmp6 = getelementptr inbounds float, float addrspace(1)* %arg, i64 %tmp5
+  %tmp7 = load float, float addrspace(1)* %tmp6, align 4
+  %tmp8 = tail call float @llvm.fma.f32(float %tmp7, float 1.000000e+00, float 1.000000e+00)
+  store float %tmp8, float addrspace(1)* %tmp6, align 4
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x()
+declare float @llvm.fma.f32(float, float, float)
+

diff  --git a/llvm/test/tools/llvm-readobj/ELF/amdgpu-elf-headers.test b/llvm/test/tools/llvm-readobj/ELF/amdgpu-elf-headers.test
index 1fe9a3a99ec9..7a70933c340c 100644
--- a/llvm/test/tools/llvm-readobj/ELF/amdgpu-elf-headers.test
+++ b/llvm/test/tools/llvm-readobj/ELF/amdgpu-elf-headers.test
@@ -58,6 +58,9 @@
 # RUN: yaml2obj %s -o %t -DCPU=GFX909
 # RUN: llvm-readobj -h %t | FileCheck %s --match-full-lines -DFILE=%t -DCPU=GFX909 -DFLAGS=0x31
 
+# RUN: yaml2obj %s -o %t -DCPU=GFX90A
+# RUN: llvm-readobj -h %t | FileCheck %s --match-full-lines -DFILE=%t -DCPU=GFX90A -DFLAGS=0x3F
+
 # RUN: yaml2obj %s -o %t -DCPU=GFX90C
 # RUN: llvm-readobj -h %t | FileCheck %s --match-full-lines -DFILE=%t -DCPU=GFX90C -DFLAGS=0x32
 

diff  --git a/llvm/tools/llvm-readobj/ELFDumper.cpp b/llvm/tools/llvm-readobj/ELFDumper.cpp
index dc775f9fcc76..e353b17f7a44 100644
--- a/llvm/tools/llvm-readobj/ELFDumper.cpp
+++ b/llvm/tools/llvm-readobj/ELFDumper.cpp
@@ -1466,6 +1466,7 @@ static const EnumEntry<unsigned> ElfHeaderAMDGPUFlags[] = {
   LLVM_READOBJ_ENUM_ENT(ELF, EF_AMDGPU_MACH_AMDGCN_GFX906),
   LLVM_READOBJ_ENUM_ENT(ELF, EF_AMDGPU_MACH_AMDGCN_GFX908),
   LLVM_READOBJ_ENUM_ENT(ELF, EF_AMDGPU_MACH_AMDGCN_GFX909),
+  LLVM_READOBJ_ENUM_ENT(ELF, EF_AMDGPU_MACH_AMDGCN_GFX90A),
   LLVM_READOBJ_ENUM_ENT(ELF, EF_AMDGPU_MACH_AMDGCN_GFX90C),
   LLVM_READOBJ_ENUM_ENT(ELF, EF_AMDGPU_MACH_AMDGCN_GFX1010),
   LLVM_READOBJ_ENUM_ENT(ELF, EF_AMDGPU_MACH_AMDGCN_GFX1011),


        


More information about the llvm-commits mailing list